Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%

415 statements  

coverage.py v7.2.7, created at 2023-06-23 09:29 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Chained datastore.""" 

25 

26__all__ = ("ChainedDatastore",) 

27 

28import itertools 

29import logging 

30import time 

31import warnings 

32from collections.abc import Iterable, Mapping, Sequence 

33from typing import TYPE_CHECKING, Any 

34 

35from lsst.daf.butler import ( 

36 Constraints, 

37 DatasetRef, 

38 DatasetRefURIs, 

39 DatasetTypeNotSupportedError, 

40 Datastore, 

41 DatastoreConfig, 

42 DatastoreRecordData, 

43 DatastoreValidationError, 

44 FileDataset, 

45) 

46from lsst.resources import ResourcePath 

47from lsst.utils import doImportType 

48 

49if TYPE_CHECKING: 

50 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

51 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

52 from lsst.resources import ResourcePathExpression 

53 

54log = logging.getLogger(__name__) 

55 

56 

57class _IngestPrepData(Datastore.IngestPrepData): 

58 """Helper class for ChainedDatastore ingest implementation. 

59 

60 Parameters 

61 ---------- 

62 children : `list` of `tuple` 

63 Triples of `Datastore`, `IngestPrepData`, and the set of source paths, for all child datastores. 

64 """ 

65 

66 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

67 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

68 self.children = children 

69 

70 

71class ChainedDatastore(Datastore): 

72 """Chained Datastores to allow read and writes from multiple datastores. 

73 

74 A ChainedDatastore is configured with multiple datastore configurations. 

75 A ``put()`` is always sent to each datastore. A ``get()`` 

76 operation is sent to each datastore in turn and the first datastore 

77 to return a valid dataset is used. 

78 

79 Parameters 

80 ---------- 

81 config : `DatastoreConfig` or `str` 

82 Configuration. This configuration must include a ``datastores`` field 

83 as a sequence of datastore configurations. The order in this sequence 

84 indicates the order to use for read operations. 

85 bridgeManager : `DatastoreRegistryBridgeManager` 

86 Object that manages the interface between `Registry` and datastores. 

87 butlerRoot : `str`, optional 

88 New datastore root to use to override the configuration value. This 

89 root is sent to each child datastore. 

90 

91 Notes 

92 ----- 

93 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer 

94 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` 

95 and `"hardlink"` if and only if all its child datastores do. 

96 """ 

97 

98 defaultConfigFile = "datastores/chainedDatastore.yaml" 

99 """Path to configuration defaults. Accessed within the ``configs`` resource 

100 or relative to a search path. Can be None if no defaults specified. 

101 """ 

102 

103 containerKey = "datastores" 

104 """Key to specify where child datastores are configured.""" 

105 

106 datastores: list[Datastore] 

107 """All the child datastores known to this datastore.""" 

108 

109 datastoreConstraints: Sequence[Constraints | None] 

110 """Constraints to be applied to each of the child datastores.""" 

111 

112 @classmethod 

113 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

114 """Set any filesystem-dependent config options for child Datastores to 

115 be appropriate for a new empty repository with the given root. 

116 

117 Parameters 

118 ---------- 

119 root : `str` 

120 Filesystem path to the root of the data repository. 

121 config : `Config` 

122 A `Config` to update. Only the subset understood by 

123 this component will be updated. Will not expand 

124 defaults. 

125 full : `Config` 

126 A complete config with all defaults expanded that can be 

127 converted to a `DatastoreConfig`. Read-only and will not be 

128 modified by this method. 

129 Repository-specific options that should not be obtained 

130 from defaults when Butler instances are constructed 

131 should be copied from ``full`` to ``config``. 

132 overwrite : `bool`, optional 

133 If `False`, do not modify a value in ``config`` if the value 

134 already exists. Default is always to overwrite with the provided 

135 ``root``. 

136 

137 Notes 

138 ----- 

139 If a keyword is explicitly defined in the supplied ``config`` it 

140 will not be overridden by this method if ``overwrite`` is `False`. 

141 This allows explicit values set in external configs to be retained. 

142 """ 

143 

144 # Extract the part of the config we care about updating 

145 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

146 

147 # And the subset of the full config that we can use for reference. 

148 # Do not bother with defaults because we are told this already has 

149 # them. 

150 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

151 

152 # Loop over each datastore config and pass the subsets to the 

153 # child datastores to process. 

154 

155 containerKey = cls.containerKey 

156 for idx, (child, fullChild) in enumerate( 

157 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey]) 

158 ): 

159 childConfig = DatastoreConfig(child, mergeDefaults=False) 

160 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

161 datastoreClass = doImportType(fullChildConfig["cls"]) 

162 if not issubclass(datastoreClass, Datastore): 162 ↛ 163: line 162 didn't jump to line 163, because the condition on line 162 was never true

163 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

164 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

165 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

166 

167 # Reattach to parent 

168 datastoreConfig[containerKey, idx] = childConfig 

169 

170 # Reattach modified datastore config to parent 

171 # If this has a datastore key we attach there, otherwise we assume 

172 # this information goes at the top of the config hierarchy. 

173 if DatastoreConfig.component in config: 

174 config[DatastoreConfig.component] = datastoreConfig 

175 else: 

176 config.update(datastoreConfig) 

177 

178 return 
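
As a worked illustration of the per-child root naming on line 164 above, each child datastore receives its own subdirectory of the new repository root, keyed by class name and position in the chain. The root path and class names below are merely illustrative:

# Hypothetical illustration of the newroot naming scheme used above.
root = "/data/repo"
child_class_names = ["FileDatastore", "InMemoryDatastore"]  # illustrative qualnames
newroots = [f"{root}/{qualname}_{idx}" for idx, qualname in enumerate(child_class_names)]
assert newroots == ["/data/repo/FileDatastore_0", "/data/repo/InMemoryDatastore_1"]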

179 

180 def __init__( 

181 self, 

182 config: Config | ResourcePathExpression, 

183 bridgeManager: DatastoreRegistryBridgeManager, 

184 butlerRoot: str | None = None, 

185 ): 

186 super().__init__(config, bridgeManager) 

187 

188 # Scan for child datastores and instantiate them with the same registry 

189 self.datastores = [] 

190 for c in self.config["datastores"]: 

191 c = DatastoreConfig(c) 

192 datastoreType = doImportType(c["cls"]) 

193 if not issubclass(datastoreType, Datastore): 193 ↛ 194: line 193 didn't jump to line 194, because the condition on line 193 was never true

194 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

195 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

196 log.debug("Creating child datastore %s", datastore.name) 

197 self.datastores.append(datastore) 

198 

199 # Name ourself based on our children 

200 if self.datastores: 200 ↛ 205: line 200 didn't jump to line 205, because the condition on line 200 was never false

201 # We must set the names explicitly 

202 self._names = [d.name for d in self.datastores] 

203 childNames = ",".join(self.names) 

204 else: 

205 childNames = f"(empty@{time.time()})" 

206 self._names = [childNames] 

207 self.name = f"{type(self).__qualname__}[{childNames}]" 

208 

209 # We declare we are ephemeral if all our child datastores declare 

210 # they are ephemeral 

211 isEphemeral = True 

212 for d in self.datastores: 

213 if not d.isEphemeral: 

214 isEphemeral = False 

215 break 

216 self.isEphemeral = isEphemeral 

217 

218 # per-datastore override constraints 

219 if "datastore_constraints" in self.config: 

220 overrides = self.config["datastore_constraints"] 

221 

222 if len(overrides) != len(self.datastores): 222 ↛ 223: line 222 didn't jump to line 223, because the condition on line 222 was never true

223 raise DatastoreValidationError( 

224 f"Number of registered datastores ({len(self.datastores)})" 

225 " differs from number of constraints overrides" 

226 f" {len(overrides)}" 

227 ) 

228 

229 self.datastoreConstraints = [ 

230 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

231 ] 

232 

233 else: 

234 self.datastoreConstraints = (None,) * len(self.datastores) 

235 

236 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

237 

238 @property 

239 def names(self) -> tuple[str, ...]: 

240 return tuple(self._names) 

241 

242 def __str__(self) -> str: 

243 chainName = ", ".join(str(ds) for ds in self.datastores) 

244 return chainName 

245 

246 def knows(self, ref: DatasetRef) -> bool: 

247 """Check if the dataset is known to any of the datastores. 

248 

249 Does not check for existence of any artifact. 

250 

251 Parameters 

252 ---------- 

253 ref : `DatasetRef` 

254 Reference to the required dataset. 

255 

256 Returns 

257 ------- 

258 exists : `bool` 

259 `True` if the dataset is known to the datastore. 

260 """ 

261 for datastore in self.datastores: 

262 if datastore.knows(ref): 

263 log.debug("%s known to datastore %s", ref, datastore.name) 

264 return True 

265 return False 

266 

267 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

268 # Docstring inherited from the base class. 

269 refs_known: dict[DatasetRef, bool] = {} 

270 for datastore in self.datastores: 

271 refs_known.update(datastore.knows_these(refs)) 

272 

273 # No need to check in next datastore for refs that are known. 

274 # We only update entries that were initially False. 

275 refs = [ref for ref, known in refs_known.items() if not known] 

276 

277 return refs_known 

278 

279 def mexists( 

280 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

281 ) -> dict[DatasetRef, bool]: 

282 """Check the existence of multiple datasets at once. 

283 

284 Parameters 

285 ---------- 

286 refs : iterable of `DatasetRef` 

287 The datasets to be checked. 

288 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

289 Optional mapping of datastore artifact to existence. Updated by 

290 this method with details of all artifacts tested. Can be `None` 

291 if the caller is not interested. 

292 

293 Returns 

294 ------- 

295 existence : `dict` of [`DatasetRef`, `bool`] 

296 Mapping from dataset to boolean indicating existence in any 

297 of the child datastores. 

298 """ 

299 dataset_existence: dict[DatasetRef, bool] = {} 

300 for datastore in self.datastores: 

301 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

302 

303 # For next datastore no point asking about ones we know 

304 # exist already. No special exemption for ephemeral datastores. 

305 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

306 

307 return dataset_existence 
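
The narrowing pattern shared by knows_these and mexists above — each subsequent datastore is only asked about refs that earlier datastores did not resolve — can be sketched as follows. The per-child answer dictionaries are invented for illustration:

# Sketch of the narrowing loop above: later children are only queried for
# refs that earlier children did not report as existing.
per_child_results = [  # illustrative per-child existence answers
    {"ref1": True, "ref2": False, "ref3": False},
    {"ref2": True, "ref3": False},
]

existence = {}
refs = ["ref1", "ref2", "ref3"]
for child_answer in per_child_results:
    existence.update({ref: child_answer[ref] for ref in refs})
    # Only keep asking about refs not yet found.
    refs = [ref for ref, exists in existence.items() if not exists]

assert existence == {"ref1": True, "ref2": True, "ref3": False}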

308 

309 def exists(self, ref: DatasetRef) -> bool: 

310 """Check if the dataset exists in one of the datastores. 

311 

312 Parameters 

313 ---------- 

314 ref : `DatasetRef` 

315 Reference to the required dataset. 

316 

317 Returns 

318 ------- 

319 exists : `bool` 

320 `True` if the entity exists in one of the child datastores. 

321 """ 

322 for datastore in self.datastores: 

323 if datastore.exists(ref): 

324 log.debug("Found %s in datastore %s", ref, datastore.name) 

325 return True 

326 return False 

327 

328 def get( 

329 self, 

330 ref: DatasetRef, 

331 parameters: Mapping[str, Any] | None = None, 

332 storageClass: StorageClass | str | None = None, 

333 ) -> Any: 

334 """Load an InMemoryDataset from the store. 

335 

336 The dataset is returned from the first datastore that has 

337 the dataset. 

338 

339 Parameters 

340 ---------- 

341 ref : `DatasetRef` 

342 Reference to the required Dataset. 

343 parameters : `dict` 

344 `StorageClass`-specific parameters that specify, for example, 

345 a slice of the dataset to be loaded. 

346 storageClass : `StorageClass` or `str`, optional 

347 The storage class to be used to override the Python type 

348 returned by this method. By default the returned type matches 

349 the dataset type definition for this dataset. Specifying a 

350 read `StorageClass` can force a different type to be returned. 

351 This type must be compatible with the original type. 

352 

353 Returns 

354 ------- 

355 inMemoryDataset : `object` 

356 Requested dataset or slice thereof as an InMemoryDataset. 

357 

358 Raises 

359 ------ 

360 FileNotFoundError 

361 Requested dataset can not be retrieved. 

362 TypeError 

363 Return value from formatter has unexpected type. 

364 ValueError 

365 Formatter failed to process the dataset. 

366 """ 

367 

368 for datastore in self.datastores: 

369 try: 

370 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

371 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

372 return inMemoryObject 

373 except FileNotFoundError: 

374 pass 

375 

376 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

377 

378 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

379 """Write a InMemoryDataset with a given `DatasetRef` to each 

380 datastore. 

381 

382 The put() to child datastores can fail with 

383 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

384 deemed to have succeeded so long as at least one child datastore 

385 accepted the inMemoryDataset. 

386 

387 Parameters 

388 ---------- 

389 inMemoryDataset : `object` 

390 The dataset to store. 

391 ref : `DatasetRef` 

392 Reference to the associated Dataset. 

393 

394 Raises 

395 ------ 

396 TypeError 

397 Supplied object and storage class are inconsistent. 

398 DatasetTypeNotSupportedError 

399 All datastores reported `DatasetTypeNotSupportedError`. 

400 """ 

401 log.debug("Put %s", ref) 

402 

403 # Confirm that we can accept this dataset 

404 if not self.constraints.isAcceptable(ref): 

405 # Raise rather than use boolean return value. 

406 raise DatasetTypeNotSupportedError( 

407 f"Dataset {ref} has been rejected by this datastore via configuration." 

408 ) 

409 

410 isPermanent = False 

411 nsuccess = 0 

412 npermanent = 0 

413 nephemeral = 0 

414 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

415 if ( 

416 constraints is not None and not constraints.isAcceptable(ref) 

417 ) or not datastore.constraints.isAcceptable(ref): 

418 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

419 continue 

420 

421 if datastore.isEphemeral: 

422 nephemeral += 1 

423 else: 

424 npermanent += 1 

425 try: 

426 datastore.put(inMemoryDataset, ref) 

427 nsuccess += 1 

428 if not datastore.isEphemeral: 

429 isPermanent = True 

430 except DatasetTypeNotSupportedError: 

431 pass 

432 

433 if nsuccess == 0: 

434 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

435 

436 if not isPermanent and npermanent > 0: 436 ↛ 437: line 436 didn't jump to line 437, because the condition on line 436 was never true

437 warnings.warn(f"Put of {ref} only succeeded in ephemeral databases", stacklevel=2) 

438 

439 if self._transaction is not None: 

440 self._transaction.registerUndo("put", self.remove, ref) 
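
The acceptance bookkeeping in put() above — skip children whose constraints reject the ref, succeed if at least one child accepts, and warn when only ephemeral children accepted — can be sketched with toy values. The (accepts, is_ephemeral) flags below are invented:

# Sketch of the acceptance bookkeeping in put().
children = [(False, False), (True, True), (True, False)]  # (accepts, is_ephemeral)

nsuccess = 0
is_permanent = False
for accepts, ephemeral in children:
    if not accepts:
        continue  # constraints rejected the ref for this child
    nsuccess += 1
    if not ephemeral:
        is_permanent = True

if nsuccess == 0:
    # The real code raises DatasetTypeNotSupportedError here.
    raise RuntimeError("no child accepted the put")
assert is_permanent  # otherwise the real code warns about ephemeral-only puts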

441 

442 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

443 # Docstring inherited from base class. 

444 if transfer != "auto": 

445 return transfer 

446 # Ask each datastore what they think auto means 

447 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

448 

449 # Remove any untranslated "auto" values 

450 transfers.discard(transfer) 

451 

452 if len(transfers) == 1: 452 ↛ 453: line 452 didn't jump to line 453, because the condition on line 452 was never true

453 return transfers.pop() 

454 if not transfers: 454 ↛ 458: line 454 didn't jump to line 458, because the condition on line 454 was never false

455 # Everything reported "auto" 

456 return transfer 

457 

458 raise RuntimeError( 

459 "Chained datastore does not yet support different transfer modes" 

460 f" from 'auto' in each child datastore (wanted {transfers})" 

461 ) 

462 

463 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

464 # Docstring inherited from Datastore._prepIngest. 

465 if transfer is None: 

466 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

467 

468 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

469 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

470 if not acceptable: 

471 log.debug( 

472 "Datastore %s skipping ingest via configuration for refs %s", 

473 name, 

474 ", ".join(str(ref) for ref in dataset.refs), 

475 ) 

476 return False 

477 else: 

478 return True 

479 

480 # Filter down to just datasets the chained datastore's own 

481 # configuration accepts. 

482 okForParent: list[FileDataset] = [ 

483 dataset 

484 for dataset in datasets 

485 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

486 ] 

487 

488 # Iterate over nested datastores and call _prepIngest on each. 

489 # Save the results to a list: 

490 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

491 # ...and remember whether all of the failures are due to 

492 # NotImplementedError being raised. 

493 allFailuresAreNotImplementedError = True 

494 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

495 okForChild: list[FileDataset] 

496 if constraints is not None: 

497 okForChild = [ 

498 dataset 

499 for dataset in okForParent 

500 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

501 ] 

502 else: 

503 okForChild = okForParent 

504 try: 

505 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

506 except NotImplementedError: 

507 log.debug( 

508 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

509 datastore.name, 

510 transfer, 

511 ) 

512 continue 

513 allFailuresAreNotImplementedError = False 

514 if okForChild: 

515 # Do not store for later if a datastore has rejected 

516 # everything. 

517 # Include the source paths if this is a "move". It's clearer 

518 # to find the paths now rather than try to infer how 

519 # each datastore has stored them in the internal prep class. 

520 paths = ( 

521 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set() 

522 ) 

523 children.append((datastore, prepDataForChild, paths)) 

524 if allFailuresAreNotImplementedError: 

525 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

526 return _IngestPrepData(children=children) 

527 

528 def _finishIngest( 

529 self, 

530 prepData: _IngestPrepData, 

531 *, 

532 transfer: str | None = None, 

533 record_validation_info: bool = True, 

534 ) -> None: 

535 # Docstring inherited from Datastore._finishIngest. 

536 # For "move" we must use "copy" and then delete the input 

537 # data at the end. This has no rollback option if the ingest 

538 # subsequently fails. If there is only one active datastore 

539 # accepting any files we can leave it as "move" 

540 actual_transfer: str | None 

541 if transfer == "move" and len(prepData.children) > 1: 

542 actual_transfer = "copy" 

543 else: 

544 actual_transfer = transfer 

545 to_be_deleted: set[ResourcePath] = set() 

546 for datastore, prepDataForChild, paths in prepData.children: 

547 datastore._finishIngest( 

548 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

549 ) 

550 to_be_deleted.update(paths) 

551 if actual_transfer != transfer: 

552 # These datasets were copied but now need to be deleted. 

553 # This can not be rolled back. 

554 for uri in to_be_deleted: 

555 uri.remove() 
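
When a "move" ingest targets more than one accepting child, the code above downgrades the transfer to "copy" for each child and removes the source files once at the end. A minimal sketch, with plain dicts standing in for child datastores and illustrative paths:

# Sketch of the "move" handling above: copy into every child first, delete
# the originals only after all children have their copy.
sources = {"/tmp/a.fits": "payload-a", "/tmp/b.fits": "payload-b"}  # illustrative paths
children = [dict(), dict()]  # stand-ins for child datastores

transfer = "move"
actual_transfer = "copy" if transfer == "move" and len(children) > 1 else transfer

for child in children:
    child.update(sources)  # each child gets its own copy

if actual_transfer != transfer:
    sources.clear()  # remove the original files only after all copies succeeded

assert all(child == {"/tmp/a.fits": "payload-a", "/tmp/b.fits": "payload-b"} for child in children)
assert not sources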

556 

557 def getManyURIs( 

558 self, 

559 refs: Iterable[DatasetRef], 

560 predict: bool = False, 

561 allow_missing: bool = False, 

562 ) -> dict[DatasetRef, DatasetRefURIs]: 

563 # Docstring inherited 

564 

565 uris: dict[DatasetRef, DatasetRefURIs] = {} 

566 missing_refs = set(refs) 

567 

568 # If predict is True we don't want to predict a dataset in the first 

569 # datastore if it actually exists in a later datastore, so in that 

570 # case check all datastores with predict=False first, and then try 

571 # again with predict=True. 

572 for p in (False, True) if predict else (False,): 

573 if not missing_refs: 

574 break 

575 for datastore in self.datastores: 

576 try: 

577 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

578 except NotImplementedError: 

579 # some datastores may not implement generating URIs 

580 continue 

581 missing_refs -= got_uris.keys() 

582 uris.update(got_uris) 

583 if not missing_refs: 

584 break 

585 

586 if missing_refs and not allow_missing: 

587 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

588 

589 return uris 
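
The two-pass structure above — resolve everything without prediction first, then predict URIs only for refs that no datastore actually has — can be sketched as follows. The lookup tables are invented for illustration:

# Sketch of the two-pass URI lookup above: pass 1 (predict=False) finds real
# URIs anywhere in the chain; pass 2 (predict=True) only fills in what is left.
real_uris = {"ref1": "file:///repo/ref1"}                  # illustrative
predicted_uris = {"ref2": "file:///repo/ref2#predicted"}   # illustrative

uris = {}
missing = {"ref1", "ref2"}
for predict_pass in (False, True):
    table = predicted_uris if predict_pass else real_uris
    found = {ref: table[ref] for ref in missing if ref in table}
    uris.update(found)
    missing -= found.keys()
    if not missing:
        break

assert uris == {"ref1": "file:///repo/ref1", "ref2": "file:///repo/ref2#predicted"}
assert not missing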

590 

591 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

592 """Return URIs associated with dataset. 

593 

594 Parameters 

595 ---------- 

596 ref : `DatasetRef` 

597 Reference to the required dataset. 

598 predict : `bool`, optional 

599 If the datastore does not know about the dataset, should it 

600 return a predicted URI or not? 

601 

602 Returns 

603 ------- 

604 uris : `DatasetRefURIs` 

605 The URI to the primary artifact associated with this dataset (if 

606 the dataset was disassembled within the datastore this may be 

607 `None`), and the URIs to any components associated with the dataset 

608 artifact (this can be empty if there are no components). 

609 

610 Notes 

611 ----- 

612 The returned URI is from the first datastore in the list that has 

613 the dataset with preference given to the first dataset coming from 

614 a permanent datastore. If no datastores have the dataset and prediction 

615 is allowed, the predicted URI for the first datastore in the list will 

616 be returned. 

617 """ 

618 log.debug("Requesting URIs for %s", ref) 

619 predictedUri: DatasetRefURIs | None = None 

620 predictedEphemeralUri: DatasetRefURIs | None = None 

621 firstEphemeralUri: DatasetRefURIs | None = None 

622 for datastore in self.datastores: 

623 if datastore.exists(ref): 

624 if not datastore.isEphemeral: 

625 uri = datastore.getURIs(ref) 

626 log.debug("Retrieved non-ephemeral URI: %s", uri) 

627 return uri 

628 elif not firstEphemeralUri: 

629 firstEphemeralUri = datastore.getURIs(ref) 

630 elif predict: 

631 if not predictedUri and not datastore.isEphemeral: 

632 predictedUri = datastore.getURIs(ref, predict) 

633 elif not predictedEphemeralUri and datastore.isEphemeral: 

634 predictedEphemeralUri = datastore.getURIs(ref, predict) 

635 

636 if firstEphemeralUri: 

637 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

638 return firstEphemeralUri 

639 

640 if predictedUri: 

641 log.debug("Retrieved predicted URI: %s", predictedUri) 

642 return predictedUri 

643 

644 if predictedEphemeralUri: 

645 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

646 return predictedEphemeralUri 

647 

648 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 

649 

650 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

651 """URI to the Dataset. 

652 

653 The returned URI is from the first datastore in the list that has 

654 the dataset with preference given to the first dataset coming from 

655 a permanent datastore. If no datastores have the dataset and prediction 

656 is allowed, the predicted URI for the first datastore in the list will 

657 be returned. 

658 

659 Parameters 

660 ---------- 

661 ref : `DatasetRef` 

662 Reference to the required Dataset. 

663 predict : `bool` 

664 If `True`, allow URIs to be returned of datasets that have not 

665 been written. 

666 

667 Returns 

668 ------- 

669 uri : `lsst.resources.ResourcePath` 

670 URI pointing to the dataset within the datastore. If the 

671 dataset does not exist in the datastore, and if ``predict`` is 

672 `True`, the URI will be a prediction and will include a URI 

673 fragment "#predicted". 

674 

675 Notes 

676 ----- 

677 If the datastore does not have entities that relate well 

678 to the concept of a URI the returned URI string will be 

679 descriptive. The returned URI is not guaranteed to be obtainable. 

680 

681 Raises 

682 ------ 

683 FileNotFoundError 

684 A URI has been requested for a dataset that does not exist and 

685 guessing is not allowed. 

686 RuntimeError 

687 Raised if a request is made for a single URI but multiple URIs 

688 are associated with this dataset. 

689 """ 

690 log.debug("Requesting URI for %s", ref) 

691 primary, components = self.getURIs(ref, predict) 

692 if primary is None or components: 692 ↛ 693: line 692 didn't jump to line 693, because the condition on line 692 was never true

693 raise RuntimeError( 

694 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

695 ) 

696 return primary 

697 

698 def retrieveArtifacts( 

699 self, 

700 refs: Iterable[DatasetRef], 

701 destination: ResourcePath, 

702 transfer: str = "auto", 

703 preserve_path: bool = True, 

704 overwrite: bool = False, 

705 ) -> list[ResourcePath]: 

706 """Retrieve the file artifacts associated with the supplied refs. 

707 

708 Parameters 

709 ---------- 

710 refs : iterable of `DatasetRef` 

711 The datasets for which file artifacts are to be retrieved. 

712 A single ref can result in multiple files. The refs must 

713 be resolved. 

714 destination : `lsst.resources.ResourcePath` 

715 Location to write the file artifacts. 

716 transfer : `str`, optional 

717 Method to use to transfer the artifacts. Must be one of the options 

718 supported by `lsst.resources.ResourcePath.transfer_from()`. 

719 "move" is not allowed. 

720 preserve_path : `bool`, optional 

721 If `True` the full path of the file artifact within the datastore 

722 is preserved. If `False` the final file component of the path 

723 is used. 

724 overwrite : `bool`, optional 

725 If `True` allow transfers to overwrite existing files at the 

726 destination. 

727 

728 Returns 

729 ------- 

730 targets : `list` of `lsst.resources.ResourcePath` 

731 URIs of file artifacts in destination location. Order is not 

732 preserved. 

733 """ 

734 if not destination.isdir(): 734 ↛ 735: line 734 didn't jump to line 735, because the condition on line 734 was never true

735 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

736 

737 # Using getURIs is not feasible since it becomes difficult to 

738 # determine the path within the datastore later on. For now 

739 # follow getURIs implementation approach. 

740 

741 pending = set(refs) 

742 

743 # There is a question as to whether an exception should be raised 

744 # early if some of the refs are missing, or whether files should be 

745 # transferred until a problem is hit. Prefer to complain up front. 

746 # Use the datastore integer as primary key. 

747 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

748 

749 for number, datastore in enumerate(self.datastores): 

750 if datastore.isEphemeral: 

751 # In the future we will want to distinguish in-memory from 

752 # caching datastore since using an on-disk local 

753 # cache is exactly what we should be doing. 

754 continue 

755 try: 

756 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

757 except NotImplementedError: 

758 # Some datastores may not support retrieving artifacts 

759 continue 

760 

761 if datastore_refs: 

762 grouped_by_datastore[number] = datastore_refs 

763 

764 # Remove these from the pending list so that we do not bother 

765 # looking for them any more. 

766 pending = pending - datastore_refs 

767 

768 if pending: 768 ↛ 769: line 768 didn't jump to line 769, because the condition on line 768 was never true

769 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

770 

771 # Now do the transfer. 

772 targets: list[ResourcePath] = [] 

773 for number, datastore_refs in grouped_by_datastore.items(): 

774 targets.extend( 

775 self.datastores[number].retrieveArtifacts( 

776 datastore_refs, 

777 destination, 

778 transfer=transfer, 

779 preserve_path=preserve_path, 

780 overwrite=overwrite, 

781 ) 

782 ) 

783 

784 return targets 

785 

786 def remove(self, ref: DatasetRef) -> None: 

787 """Indicate to the datastore that a dataset can be removed. 

788 

789 The dataset will be removed from each datastore. The dataset is 

790 not required to exist in every child datastore. 

791 

792 Parameters 

793 ---------- 

794 ref : `DatasetRef` 

795 Reference to the required dataset. 

796 

797 Raises 

798 ------ 

799 FileNotFoundError 

800 Attempt to remove a dataset that does not exist. Raised if none 

801 of the child datastores removed the dataset. 

802 """ 

803 log.debug("Removing %s", ref) 

804 self.trash(ref, ignore_errors=False) 

805 self.emptyTrash(ignore_errors=False) 

806 

807 def forget(self, refs: Iterable[DatasetRef]) -> None: 

808 for datastore in tuple(self.datastores): 

809 datastore.forget(refs) 

810 

811 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

812 if isinstance(ref, DatasetRef): 

813 ref_label = str(ref) 

814 else: 

815 ref_label = "bulk datasets" 

816 

817 log.debug("Trashing %s", ref_label) 

818 

819 counter = 0 

820 for datastore in self.datastores: 

821 try: 

822 datastore.trash(ref, ignore_errors=ignore_errors) 

823 counter += 1 

824 except FileNotFoundError: 

825 pass 

826 

827 if counter == 0: 

828 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

829 if ignore_errors: 829 ↛ 830: line 829 didn't jump to line 830, because the condition on line 829 was never true

830 log.warning(err_msg) 

831 else: 

832 raise FileNotFoundError(err_msg) 

833 

834 def emptyTrash(self, ignore_errors: bool = True) -> None: 

835 for datastore in self.datastores: 

836 datastore.emptyTrash(ignore_errors=ignore_errors) 

837 

838 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

839 """Retrieve a dataset from an input `Datastore`, 

840 and store the result in this `Datastore`. 

841 

842 Parameters 

843 ---------- 

844 inputDatastore : `Datastore` 

845 The external `Datastore` from which to retrieve the Dataset. 

846 ref : `DatasetRef` 

847 Reference to the required dataset in the input data store. 

848 

849 Notes 

850 ----- 

851 This method returns `None`; the retrieved dataset is stored in each 

852 accepting child datastore via a single call to ``put()`` on this 

853 chained datastore. 

854 """ 

855 assert inputDatastore is not self # unless we want it for renames? 

856 inMemoryDataset = inputDatastore.get(ref) 

857 self.put(inMemoryDataset, ref) 

858 

859 def validateConfiguration( 

860 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

861 ) -> None: 

862 """Validate some of the configuration for this datastore. 

863 

864 Parameters 

865 ---------- 

866 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

867 Entities to test against this configuration. Can be differing 

868 types. 

869 logFailures : `bool`, optional 

870 If `True`, output a log message for every validation error 

871 detected. 

872 

873 Raises 

874 ------ 

875 DatastoreValidationError 

876 Raised if there is a validation problem with a configuration. 

877 All the problems are reported in a single exception. 

878 

879 Notes 

880 ----- 

881 This method checks each datastore in turn. 

882 """ 

883 

884 # Need to catch each of the datastore outputs and ensure that 

885 # all are tested. 

886 failures = [] 

887 for datastore in self.datastores: 

888 try: 

889 datastore.validateConfiguration(entities, logFailures=logFailures) 

890 except DatastoreValidationError as e: 

891 if logFailures: 891 ↛ 893: line 891 didn't jump to line 893, because the condition on line 891 was never false

892 log.critical("Datastore %s failed validation", datastore.name) 

893 failures.append(f"Datastore {self.name}: {e}") 

894 

895 if failures: 

896 msg = ";\n".join(failures) 

897 raise DatastoreValidationError(msg) 

898 

899 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

900 # Docstring is inherited from base class 

901 failures = [] 

902 for datastore in self.datastores: 

903 try: 

904 datastore.validateKey(lookupKey, entity) 

905 except DatastoreValidationError as e: 

906 failures.append(f"Datastore {self.name}: {e}") 

907 

908 if failures: 

909 msg = ";\n".join(failures) 

910 raise DatastoreValidationError(msg) 

911 

912 def getLookupKeys(self) -> set[LookupKey]: 

913 # Docstring is inherited from base class 

914 keys = set() 

915 for datastore in self.datastores: 

916 keys.update(datastore.getLookupKeys()) 

917 

918 keys.update(self.constraints.getLookupKeys()) 

919 for p in self.datastoreConstraints: 

920 if p is not None: 920 ↛ 919: line 920 didn't jump to line 919, because the condition on line 920 was never false

921 keys.update(p.getLookupKeys()) 

922 

923 return keys 

924 

925 def needs_expanded_data_ids( 

926 self, 

927 transfer: str | None, 

928 entity: DatasetRef | DatasetType | StorageClass | None = None, 

929 ) -> bool: 

930 # Docstring inherited. 

931 # We can't safely use `self.datastoreConstraints` with `entity` to 

932 # check whether a child datastore would even want to ingest this 

933 # dataset, because we don't want to filter out datastores that might 

934 need an expanded data ID based on incomplete information (e.g. we 

935 # pass a StorageClass, but the constraint dispatches on DatasetType). 

936 # So we pessimistically check if any datastore would need an expanded 

937 # data ID for this transfer mode. 

938 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

939 

940 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

941 # Docstring inherited from the base class. 

942 

943 for datastore in self.datastores: 

944 datastore.import_records(data) 

945 

946 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

947 # Docstring inherited from the base class. 

948 

949 all_records: dict[str, DatastoreRecordData] = {} 

950 

951 # Merge all sub-datastore records into one structure 

952 for datastore in self.datastores: 

953 sub_records = datastore.export_records(refs) 

954 for name, record_data in sub_records.items(): 

955 # All datastore names must be unique in a chain. 

956 if name in all_records: 956 ↛ 957: line 956 didn't jump to line 957, because the condition on line 956 was never true

957 raise ValueError("Non-unique datastore name found in datastore {datastore}") 

958 all_records[name] = record_data 

959 

960 return all_records 

961 

962 def export( 

963 self, 

964 refs: Iterable[DatasetRef], 

965 *, 

966 directory: ResourcePathExpression | None = None, 

967 transfer: str | None = "auto", 

968 ) -> Iterable[FileDataset]: 

969 # Docstring inherited from Datastore.export. 

970 if transfer == "auto" and directory is None: 

971 transfer = None 

972 

973 if transfer is not None and directory is None: 

974 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

975 

976 if transfer == "move": 

977 raise TypeError("Can not export by moving files out of datastore.") 

978 

979 # Exporting from a chain has the potential for a dataset to be 

980 # in one or more of the datastores in the chain. We only need one 

981 # of them since we assume the datasets are the same in all (but 

982 # the file format could be different of course since that is a 

983 # per-datastore configuration). 

984 # We also do not know whether any of the datastores in the chain 

985 # support file export. 

986 

987 # Ensure we have an ordered sequence that is not an iterator or set. 

988 if not isinstance(refs, Sequence): 

989 refs = list(refs) 

990 

991 # If any of the datasets are missing entirely we need to raise early 

992 # before we try to run the export. This can be a little messy but is 

993 better than exporting files from the first datastore and only then 

994 discovering that a dataset missing from it is not in the later datastores either. 

995 known = [datastore.knows_these(refs) for datastore in self.datastores] 

996 refs_known: set[DatasetRef] = set() 

997 for known_to_this in known: 

998 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

999 missing_count = len(refs) - len(refs_known) 

1000 if missing_count: 

1001 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1002 

1003 # To allow us to slot each result into the right place after 

1004 # asking each datastore, create a dict with the index. 

1005 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1006 

1007 # Presize the final export list. 

1008 exported: list[FileDataset | None] = [None] * len(refs) 

1009 

1010 # The order of the returned dataset has to match the order of the 

1011 # given refs, even if they are all from different datastores. 

1012 for i, datastore in enumerate(self.datastores): 

1013 known_to_this = known[i] 

1014 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1015 

1016 try: 

1017 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1018 except NotImplementedError: 

1019 # Try the next datastore. 

1020 continue 

1021 

1022 for ref, export in zip(filtered, this_export): 

1023 # Get the position and also delete it from the list. 

1024 exported[ref_positions.pop(ref)] = export 

1025 

1026 # Every dataset should be accounted for because of the earlier checks 

1027 # but make sure that we did fill all the slots to appease mypy. 

1028 for i, dataset in enumerate(exported): 

1029 if dataset is None: 1029 ↛ 1030: line 1029 didn't jump to line 1030, because the condition on line 1029 was never true

1030 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1031 yield dataset 
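
The ordering technique above — presize the output list and keep a ref-to-position map so that results arriving grouped by datastore are slotted back into the caller's original order — in miniature, with invented refs and export results:

# Sketch of the ordering trick used in export() above.
refs = ["r1", "r2", "r3"]
ref_positions = {ref: i for i, ref in enumerate(refs)}
exported = [None] * len(refs)

# Illustrative: one datastore exports r2 and r3, another exports r1.
for ref, result in [("r2", "exportB"), ("r3", "exportC"), ("r1", "exportA")]:
    exported[ref_positions.pop(ref)] = result

assert exported == ["exportA", "exportB", "exportC"]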

1032 

1033 def transfer_from( 

1034 self, 

1035 source_datastore: Datastore, 

1036 refs: Iterable[DatasetRef], 

1037 transfer: str = "auto", 

1038 artifact_existence: dict[ResourcePath, bool] | None = None, 

1039 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1040 # Docstring inherited 

1041 # mypy does not understand "type(self) is not type(source)" 

1042 if isinstance(source_datastore, ChainedDatastore): 

1043 # Both the source and destination are chained datastores. 

1044 source_datastores = tuple(source_datastore.datastores) 

1045 else: 

1046 # The source datastore is different, forward everything to the 

1047 # child datastores. 

1048 source_datastores = tuple([source_datastore]) 

1049 

1050 # Need to know the set of all possible refs that could be transferred. 

1051 remaining_refs = set(refs) 

1052 

1053 missing_from_source: set[DatasetRef] | None = None 

1054 all_accepted = set() 

1055 nsuccess = 0 

1056 for source_child in source_datastores: 

1057 # If we are reading from a chained datastore, it's possible that 

1058 # only a subset of the datastores know about the dataset. We can't 

1059 # ask the receiving datastore to copy it when it doesn't exist 

1060 # so we have to filter again based on what the source datastore 

1061 # understands. 

1062 known_to_source = source_child.knows_these([ref for ref in refs]) 

1063 

1064 # Need to know that there is a possibility that some of these 

1065 # datasets exist but are unknown to the source datastore if 

1066 # trust is enabled. 

1067 if getattr(source_child, "trustGetRequest", False): 

1068 unknown = [ref for ref, known in known_to_source.items() if not known] 

1069 existence = source_child.mexists(unknown, artifact_existence) 

1070 for ref, exists in existence.items(): 

1071 known_to_source[ref] = exists 

1072 

1073 missing = {ref for ref, known in known_to_source.items() if not known} 

1074 if missing: 

1075 if missing_from_source is None: 

1076 missing_from_source = missing 

1077 else: 

1078 missing_from_source &= missing 

1079 

1080 # Try to transfer from each source datastore to each child 

1081 # datastore. Have to make sure we don't transfer something 

1082 # we've already transferred to this destination on later passes. 

1083 

1084 # Filter the initial list based on the datasets we have 

1085 # not yet transferred. 

1086 these_refs = [] 

1087 for ref in refs: 

1088 if ref in remaining_refs and known_to_source[ref]: 

1089 these_refs.append(ref) 

1090 

1091 if not these_refs: 

1092 # Already transferred all datasets known to this datastore. 

1093 continue 

1094 

1095 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

1096 if constraints is not None: 1096 ↛ 1104: line 1096 didn't jump to line 1104, because the condition on line 1096 was never false

1097 filtered_refs = [] 

1098 for ref in these_refs: 

1099 if constraints.isAcceptable(ref): 

1100 filtered_refs.append(ref) 

1101 else: 

1102 log.debug("Rejecting ref by constraints: %s", ref) 

1103 else: 

1104 filtered_refs = [ref for ref in these_refs] 

1105 try: 

1106 accepted, _ = datastore.transfer_from( 

1107 source_child, filtered_refs, transfer, artifact_existence 

1108 ) 

1109 except (TypeError, NotImplementedError): 

1110 # The datastores were incompatible. 

1111 continue 

1112 else: 

1113 nsuccess += 1 

1114 

1115 # Remove the accepted datasets from those remaining. 

1116 remaining_refs = remaining_refs - accepted 

1117 

1118 # Keep track of everything we have accepted. 

1119 all_accepted.update(accepted) 

1120 

1121 if missing_from_source: 

1122 for ref in missing_from_source: 

1123 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1124 

1125 if nsuccess == 0: 1125 ↛ 1126: line 1125 didn't jump to line 1126, because the condition on line 1125 was never true

1126 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1127 

1128 return all_accepted, remaining_refs
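
Finally, transfer_from returns a pair of sets: everything any child accepted, and whatever remains untransferred. The bookkeeping can be sketched like this, with invented per-child acceptance results:

# Sketch of the transfer_from bookkeeping above.
refs = {"r1", "r2", "r3"}
per_child_accepted = [{"r1"}, {"r2"}]  # illustrative per-child results

remaining = set(refs)
all_accepted = set()
for accepted in per_child_accepted:
    remaining -= accepted      # accepted refs no longer need transferring
    all_accepted |= accepted   # keep track of everything accepted so far

assert all_accepted == {"r1", "r2"}
assert remaining == {"r3"}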