Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%

415 statements  

coverage.py v7.2.7, created at 2023-06-28 10:09 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Chained datastore.""" 

25 

26__all__ = ("ChainedDatastore",) 

27 

28import itertools 

29import logging 

30import time 

31import warnings 

32from collections.abc import Iterable, Mapping, Sequence 

33from typing import TYPE_CHECKING, Any 

34 

35from lsst.daf.butler import ( 

36 Constraints, 

37 DatasetRef, 

38 DatasetRefURIs, 

39 DatasetTypeNotSupportedError, 

40 Datastore, 

41 DatastoreConfig, 

42 DatastoreRecordData, 

43 DatastoreValidationError, 

44 FileDataset, 

45) 

46from lsst.resources import ResourcePath 

47from lsst.utils import doImportType 

48 

49if TYPE_CHECKING: 

50 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

51 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

52 from lsst.resources import ResourcePathExpression 

53 

54log = logging.getLogger(__name__) 

55 

56 

57class _IngestPrepData(Datastore.IngestPrepData): 

58 """Helper class for ChainedDatastore ingest implementation. 

59 

60 Parameters 

61 ---------- 

62 children : `list` of `tuple` 

63 Pairs of `Datastore`, `IngestPrepData` for all child datastores. 

64 """ 

65 

66 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

67 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

68 self.children = children 

69 

70 

71class ChainedDatastore(Datastore): 

72 """Chained Datastores to allow read and writes from multiple datastores. 

73 

74 A ChainedDatastore is configured with multiple datastore configurations. 

75 A ``put()`` is sent to every child datastore whose constraints accept 

76 the dataset. A ``get()`` operation is sent to each datastore in turn 

77 and the first datastore to return a valid dataset is used. 

78 

79 Parameters 

80 ---------- 

81 config : `DatastoreConfig` or `str` 

82 Configuration. This configuration must include a ``datastores`` field 

83 as a sequence of datastore configurations. The order in this sequence 

84 indicates the order to use for read operations. 

85 bridgeManager : `DatastoreRegistryBridgeManager` 

86 Object that manages the interface between `Registry` and datastores. 

87 butlerRoot : `str`, optional 

88 New datastore root to use to override the configuration value. This 

89 root is sent to each child datastore. 

90 

91 Notes 

92 ----- 

93 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer 

94 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` 

95 and `"hardlink"` if and only if all its child datastores do. 

96 """ 

97 

98 defaultConfigFile = "datastores/chainedDatastore.yaml" 

99 """Path to configuration defaults. Accessed within the ``configs`` resource 

100 or relative to a search path. Can be None if no defaults specified. 

101 """ 

102 

103 containerKey = "datastores" 

104 """Key to specify where child datastores are configured.""" 

105 

106 datastores: list[Datastore] 

107 """All the child datastores known to this datastore.""" 

108 

109 datastoreConstraints: Sequence[Constraints | None] 

110 """Constraints to be applied to each of the child datastores.""" 

111 

112 @classmethod 

113 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

114 """Set any filesystem-dependent config options for child Datastores to 

115 be appropriate for a new empty repository with the given root. 

116 

117 Parameters 

118 ---------- 

119 root : `str` 

120 Filesystem path to the root of the data repository. 

121 config : `Config` 

122 A `Config` to update. Only the subset understood by 

123 this component will be updated. Will not expand 

124 defaults. 

125 full : `Config` 

126 A complete config with all defaults expanded that can be 

127 converted to a `DatastoreConfig`. Read-only and will not be 

128 modified by this method. 

129 Repository-specific options that should not be obtained 

130 from defaults when Butler instances are constructed 

131 should be copied from ``full`` to ``config``. 

132 overwrite : `bool`, optional 

133 If `False`, do not modify a value in ``config`` if the value 

134 already exists. Default is always to overwrite with the provided 

135 ``root``. 

136 

137 Notes 

138 ----- 

139 If a keyword is explicitly defined in the supplied ``config`` it 

140 will not be overridden by this method if ``overwrite`` is `False`. 

141 This allows explicit values set in external configs to be retained. 

142 """ 

143 # Extract the part of the config we care about updating 

144 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

145 

146 # And the subset of the full config that we can use for reference. 

147 # Do not bother with defaults because we are told this already has 

148 # them. 

149 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

150 

151 # Loop over each datastore config and pass the subsets to the 

152 # child datastores to process. 

153 

154 containerKey = cls.containerKey 

155 for idx, (child, fullChild) in enumerate( 

156 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey]) 

157 ): 

158 childConfig = DatastoreConfig(child, mergeDefaults=False) 

159 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

160 datastoreClass = doImportType(fullChildConfig["cls"]) 

161 if not issubclass(datastoreClass, Datastore):  [161 ↛ 162: condition on line 161 was never true]

162 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 
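# Each child is handed its own subdirectory of the new root, named after
# the child class and its position in the chain, so siblings never share
# a filesystem root.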

163 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

164 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

165 

166 # Reattach to parent 

167 datastoreConfig[containerKey, idx] = childConfig 

168 

169 # Reattach modified datastore config to parent 

170 # If this has a datastore key we attach there, otherwise we assume 

171 # this information goes at the top of the config hierarchy. 

172 if DatastoreConfig.component in config: 

173 config[DatastoreConfig.component] = datastoreConfig 

174 else: 

175 config.update(datastoreConfig) 

176 

177 return 

178 

179 def __init__( 

180 self, 

181 config: Config | ResourcePathExpression, 

182 bridgeManager: DatastoreRegistryBridgeManager, 

183 butlerRoot: str | None = None, 

184 ): 

185 super().__init__(config, bridgeManager) 

186 

187 # Scan for child datastores and instantiate them with the same registry 

188 self.datastores = [] 

189 for c in self.config["datastores"]: 

190 c = DatastoreConfig(c) 

191 datastoreType = doImportType(c["cls"]) 

192 if not issubclass(datastoreType, Datastore):  [192 ↛ 193: condition on line 192 was never true]

193 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

194 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

195 log.debug("Creating child datastore %s", datastore.name) 

196 self.datastores.append(datastore) 

197 

198 # Name ourselves based on our children 

199 if self.datastores:  [199 ↛ 204: condition on line 199 was never false]

200 # We must set the names explicitly 

201 self._names = [d.name for d in self.datastores] 

202 childNames = ",".join(self.names) 

203 else: 

204 childNames = f"(empty@{time.time()})" 

205 self._names = [childNames] 

206 self.name = f"{type(self).__qualname__}[{childNames}]" 

207 

208 # We declare we are ephemeral if all our child datastores declare 

209 # they are ephemeral 

210 isEphemeral = True 

211 for d in self.datastores: 

212 if not d.isEphemeral: 

213 isEphemeral = False 

214 break 

215 self.isEphemeral = isEphemeral 

216 

217 # per-datastore override constraints 

218 if "datastore_constraints" in self.config: 

219 overrides = self.config["datastore_constraints"] 

220 

221 if len(overrides) != len(self.datastores):  [221 ↛ 222: condition on line 221 was never true]

222 raise DatastoreValidationError( 

223 f"Number of registered datastores ({len(self.datastores)})" 

224 " differs from number of constraints overrides" 

225 f" {len(overrides)}" 

226 ) 

227 

228 self.datastoreConstraints = [ 

229 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

230 ] 

231 

232 else: 

233 self.datastoreConstraints = (None,) * len(self.datastores) 

234 

235 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

236 

237 @property 

238 def names(self) -> tuple[str, ...]: 

239 return tuple(self._names) 

240 

241 def __str__(self) -> str: 

242 chainName = ", ".join(str(ds) for ds in self.datastores) 

243 return chainName 

244 

245 def knows(self, ref: DatasetRef) -> bool: 

246 """Check if the dataset is known to any of the datastores. 

247 

248 Does not check for existence of any artifact. 

249 

250 Parameters 

251 ---------- 

252 ref : `DatasetRef` 

253 Reference to the required dataset. 

254 

255 Returns 

256 ------- 

257 exists : `bool` 

258 `True` if the dataset is known to the datastore. 

259 """ 

260 for datastore in self.datastores: 

261 if datastore.knows(ref): 

262 log.debug("%s known to datastore %s", ref, datastore.name) 

263 return True 

264 return False 

265 

266 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

267 # Docstring inherited from the base class. 

268 refs_known: dict[DatasetRef, bool] = {} 

269 for datastore in self.datastores: 

270 refs_known.update(datastore.knows_these(refs)) 

271 

272 # No need to check in next datastore for refs that are known. 

273 # We only update entries that were initially False. 

274 refs = [ref for ref, known in refs_known.items() if not known] 

275 

276 return refs_known 

277 

278 def mexists( 

279 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

280 ) -> dict[DatasetRef, bool]: 

281 """Check the existence of multiple datasets at once. 

282 

283 Parameters 

284 ---------- 

285 refs : iterable of `DatasetRef` 

286 The datasets to be checked. 

287 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

288 Optional mapping of datastore artifact to existence. Updated by 

289 this method with details of all artifacts tested. Can be `None` 

290 if the caller is not interested. 

291 

292 Returns 

293 ------- 

294 existence : `dict` of [`DatasetRef`, `bool`] 

295 Mapping from dataset to boolean indicating existence in any 

296 of the child datastores. 

297 """ 

298 dataset_existence: dict[DatasetRef, bool] = {} 

299 for datastore in self.datastores: 

300 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

301 

302 # For the next datastore there is no point asking about ones we 

303 # already know exist. No special exemption for ephemeral datastores. 

304 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

305 

306 return dataset_existence 

307 

308 def exists(self, ref: DatasetRef) -> bool: 

309 """Check if the dataset exists in one of the datastores. 

310 

311 Parameters 

312 ---------- 

313 ref : `DatasetRef` 

314 Reference to the required dataset. 

315 

316 Returns 

317 ------- 

318 exists : `bool` 

319 `True` if the entity exists in one of the child datastores. 

320 """ 

321 for datastore in self.datastores: 

322 if datastore.exists(ref): 

323 log.debug("Found %s in datastore %s", ref, datastore.name) 

324 return True 

325 return False 

326 

327 def get( 

328 self, 

329 ref: DatasetRef, 

330 parameters: Mapping[str, Any] | None = None, 

331 storageClass: StorageClass | str | None = None, 

332 ) -> Any: 

333 """Load an InMemoryDataset from the store. 

334 

335 The dataset is returned from the first datastore that has 

336 the dataset. 

337 

338 Parameters 

339 ---------- 

340 ref : `DatasetRef` 

341 Reference to the required Dataset. 

342 parameters : `dict` 

343 `StorageClass`-specific parameters that specify, for example, 

344 a slice of the dataset to be loaded. 

345 storageClass : `StorageClass` or `str`, optional 

346 The storage class to be used to override the Python type 

347 returned by this method. By default the returned type matches 

348 the dataset type definition for this dataset. Specifying a 

349 read `StorageClass` can force a different type to be returned. 

350 This type must be compatible with the original type. 

351 

352 Returns 

353 ------- 

354 inMemoryDataset : `object` 

355 Requested dataset or slice thereof as an InMemoryDataset. 

356 

357 Raises 

358 ------ 

359 FileNotFoundError 

360 Requested dataset can not be retrieved. 

361 TypeError 

362 Return value from formatter has unexpected type. 

363 ValueError 

364 Formatter failed to process the dataset. 

365 """ 

366 for datastore in self.datastores: 

367 try: 

368 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

369 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

370 return inMemoryObject 

371 except FileNotFoundError: 

372 pass 

373 

374 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

375 

376 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

377 """Write a InMemoryDataset with a given `DatasetRef` to each 

378 datastore. 

379 

380 The put() to child datastores can fail with 

381 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

382 deemed to have succeeded so long as at least one child datastore 

383 accepted the inMemoryDataset. 

384 

385 Parameters 

386 ---------- 

387 inMemoryDataset : `object` 

388 The dataset to store. 

389 ref : `DatasetRef` 

390 Reference to the associated Dataset. 

391 

392 Raises 

393 ------ 

394 TypeError 

395 Supplied object and storage class are inconsistent. 

396 DatasetTypeNotSupportedError 

397 All datastores reported `DatasetTypeNotSupportedError`. 

398 """ 

399 log.debug("Put %s", ref) 

400 

401 # Confirm that we can accept this dataset 

402 if not self.constraints.isAcceptable(ref): 

403 # Raise rather than use boolean return value. 

404 raise DatasetTypeNotSupportedError( 

405 f"Dataset {ref} has been rejected by this datastore via configuration." 

406 ) 

407 

408 isPermanent = False 

409 nsuccess = 0 

410 npermanent = 0 

411 nephemeral = 0 
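# Attempt the put in every child whose constraints accept the ref,
# counting successes and whether any permanent (non-ephemeral) datastore
# stored it, so we can warn below if only ephemeral copies were made.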

412 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

413 if ( 

414 constraints is not None and not constraints.isAcceptable(ref) 

415 ) or not datastore.constraints.isAcceptable(ref): 

416 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

417 continue 

418 

419 if datastore.isEphemeral: 

420 nephemeral += 1 

421 else: 

422 npermanent += 1 

423 try: 

424 datastore.put(inMemoryDataset, ref) 

425 nsuccess += 1 

426 if not datastore.isEphemeral: 

427 isPermanent = True 

428 except DatasetTypeNotSupportedError: 

429 pass 

430 

431 if nsuccess == 0: 

432 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

433 

434 if not isPermanent and npermanent > 0:  [434 ↛ 435: condition on line 434 was never true]

435 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

436 

437 if self._transaction is not None: 

438 self._transaction.registerUndo("put", self.remove, ref) 

439 

440 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

441 # Docstring inherited from base class. 

442 if transfer != "auto": 

443 return transfer 

444 # Ask each datastore what they think auto means 

445 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

446 

447 # Remove any untranslated "auto" values 

448 transfers.discard(transfer) 

449 

450 if len(transfers) == 1:  [450 ↛ 451: condition on line 450 was never true]

451 return transfers.pop() 

452 if not transfers:  [452 ↛ 456: condition on line 452 was never false]

453 # Everything reported "auto" 

454 return transfer 

455 

456 raise RuntimeError( 

457 "Chained datastore does not yet support different transfer modes" 

458 f" from 'auto' in each child datastore (wanted {transfers})" 

459 ) 

460 

461 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

462 # Docstring inherited from Datastore._prepIngest. 

463 if transfer is None: 

464 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

465 

466 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

467 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

468 if not acceptable: 

469 log.debug( 

470 "Datastore %s skipping ingest via configuration for refs %s", 

471 name, 

472 ", ".join(str(ref) for ref in dataset.refs), 

473 ) 

474 return False 

475 else: 

476 return True 

477 

478 # Filter down to just datasets the chained datastore's own 

479 # configuration accepts. 

480 okForParent: list[FileDataset] = [ 

481 dataset 

482 for dataset in datasets 

483 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

484 ] 

485 

486 # Iterate over nested datastores and call _prepIngest on each. 

487 # Save the results to a list: 

488 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

489 # ...and remember whether all of the failures are due to 

490 # NotImplementedError being raised. 

491 allFailuresAreNotImplementedError = True 

492 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

493 okForChild: list[FileDataset] 

494 if constraints is not None: 

495 okForChild = [ 

496 dataset 

497 for dataset in okForParent 

498 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

499 ] 

500 else: 

501 okForChild = okForParent 

502 try: 

503 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

504 except NotImplementedError: 

505 log.debug( 

506 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

507 datastore.name, 

508 transfer, 

509 ) 

510 continue 

511 allFailuresAreNotImplementedError = False 

512 if okForChild: 

513 # Do not store for later if a datastore has rejected 

514 # everything. 

515 # Include the source paths if this is a "move". It's clearer 

516 # to find the paths now rather than try to infer how 

517 # each datastore has stored them in the internal prep class. 

518 paths = ( 

519 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set() 

520 ) 

521 children.append((datastore, prepDataForChild, paths)) 

522 if allFailuresAreNotImplementedError: 

523 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

524 return _IngestPrepData(children=children) 

525 

526 def _finishIngest( 

527 self, 

528 prepData: _IngestPrepData, 

529 *, 

530 transfer: str | None = None, 

531 record_validation_info: bool = True, 

532 ) -> None: 

533 # Docstring inherited from Datastore._finishIngest. 

534 # For "move" we must use "copy" and then delete the input 

535 # data at the end. This has no rollback option if the ingest 

536 # subsequently fails. If there is only one active datastore 

537 # accepting any files we can leave it as "move" 

538 actual_transfer: str | None 

539 if transfer == "move" and len(prepData.children) > 1: 

540 actual_transfer = "copy" 

541 else: 

542 actual_transfer = transfer 

543 to_be_deleted: set[ResourcePath] = set() 

544 for datastore, prepDataForChild, paths in prepData.children: 

545 datastore._finishIngest( 

546 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

547 ) 

548 to_be_deleted.update(paths) 

549 if actual_transfer != transfer: 

550 # These datasets were copied but now need to be deleted. 

551 # This can not be rolled back. 

552 for uri in to_be_deleted: 

553 uri.remove() 

554 

555 def getManyURIs( 

556 self, 

557 refs: Iterable[DatasetRef], 

558 predict: bool = False, 

559 allow_missing: bool = False, 

560 ) -> dict[DatasetRef, DatasetRefURIs]: 

561 # Docstring inherited 

562 

563 uris: dict[DatasetRef, DatasetRefURIs] = {} 

564 missing_refs = set(refs) 

565 

566 # If predict is True we don't want to predict a dataset in the first 

567 # datastore if it actually exists in a later datastore, so in that 

568 # case check all datastores with predict=False first, and then try 

569 # again with predict=True. 

570 for p in (False, True) if predict else (False,): 

571 if not missing_refs: 

572 break 

573 for datastore in self.datastores: 

574 try: 

575 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

576 except NotImplementedError: 

577 # some datastores may not implement generating URIs 

578 continue 

579 missing_refs -= got_uris.keys() 

580 uris.update(got_uris) 

581 if not missing_refs: 

582 break 

583 

584 if missing_refs and not allow_missing: 

585 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

586 

587 return uris 

588 

589 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

590 """Return URIs associated with dataset. 

591 

592 Parameters 

593 ---------- 

594 ref : `DatasetRef` 

595 Reference to the required dataset. 

596 predict : `bool`, optional 

597 If the datastore does not know about the dataset, should it 

598 return a predicted URI or not? 

599 

600 Returns 

601 ------- 

602 uris : `DatasetRefURIs` 

603 The URI to the primary artifact associated with this dataset (if 

604 the dataset was disassembled within the datastore this may be 

605 `None`), and the URIs to any components associated with the dataset 

606 artifact. (can be empty if there are no components). 

607 

608 Notes 

609 ----- 

610 The returned URI is from the first datastore in the list that has 

611 the dataset with preference given to the first dataset coming from 

612 a permanent datastore. If no datastores have the dataset and prediction 

613 is allowed, the predicted URI for the first datastore in the list will 

614 be returned. 

615 """ 

616 log.debug("Requesting URIs for %s", ref) 

617 predictedUri: DatasetRefURIs | None = None 

618 predictedEphemeralUri: DatasetRefURIs | None = None 

619 firstEphemeralUri: DatasetRefURIs | None = None 
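# Preference order implemented below: an existing copy in a permanent
# datastore wins outright; otherwise the first existing ephemeral copy,
# then the first predicted permanent URI, then the first predicted
# ephemeral URI.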

620 for datastore in self.datastores: 

621 if datastore.exists(ref): 

622 if not datastore.isEphemeral: 

623 uri = datastore.getURIs(ref) 

624 log.debug("Retrieved non-ephemeral URI: %s", uri) 

625 return uri 

626 elif not firstEphemeralUri: 

627 firstEphemeralUri = datastore.getURIs(ref) 

628 elif predict: 

629 if not predictedUri and not datastore.isEphemeral: 

630 predictedUri = datastore.getURIs(ref, predict) 

631 elif not predictedEphemeralUri and datastore.isEphemeral: 

632 predictedEphemeralUri = datastore.getURIs(ref, predict) 

633 

634 if firstEphemeralUri: 

635 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

636 return firstEphemeralUri 

637 

638 if predictedUri: 

639 log.debug("Retrieved predicted URI: %s", predictedUri) 

640 return predictedUri 

641 

642 if predictedEphemeralUri: 

643 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

644 return predictedEphemeralUri 

645 

646 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 

647 

648 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

649 """URI to the Dataset. 

650 

651 The returned URI is from the first datastore in the list that has 

652 the dataset with preference given to the first dataset coming from 

653 a permanent datastore. If no datastores have the dataset and prediction 

654 is allowed, the predicted URI for the first datastore in the list will 

655 be returned. 

656 

657 Parameters 

658 ---------- 

659 ref : `DatasetRef` 

660 Reference to the required Dataset. 

661 predict : `bool` 

662 If `True`, allow URIs to be returned of datasets that have not 

663 been written. 

664 

665 Returns 

666 ------- 

667 uri : `lsst.resources.ResourcePath` 

668 URI pointing to the dataset within the datastore. If the 

669 dataset does not exist in the datastore, and if ``predict`` is 

670 `True`, the URI will be a prediction and will include a URI 

671 fragment "#predicted". 

672 

673 Notes 

674 ----- 

675 If the datastore does not have entities that relate well 

676 to the concept of a URI the returned URI string will be 

677 descriptive. The returned URI is not guaranteed to be obtainable. 

678 

679 Raises 

680 ------ 

681 FileNotFoundError 

682 A URI has been requested for a dataset that does not exist and 

683 guessing is not allowed. 

684 RuntimeError 

685 Raised if a request is made for a single URI but multiple URIs 

686 are associated with this dataset. 

687 """ 

688 log.debug("Requesting URI for %s", ref) 

689 primary, components = self.getURIs(ref, predict) 

690 if primary is None or components:  [690 ↛ 691: condition on line 690 was never true]

691 raise RuntimeError( 

692 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

693 ) 

694 return primary 

695 

696 def retrieveArtifacts( 

697 self, 

698 refs: Iterable[DatasetRef], 

699 destination: ResourcePath, 

700 transfer: str = "auto", 

701 preserve_path: bool = True, 

702 overwrite: bool = False, 

703 ) -> list[ResourcePath]: 

704 """Retrieve the file artifacts associated with the supplied refs. 

705 

706 Parameters 

707 ---------- 

708 refs : iterable of `DatasetRef` 

709 The datasets for which file artifacts are to be retrieved. 

710 A single ref can result in multiple files. The refs must 

711 be resolved. 

712 destination : `lsst.resources.ResourcePath` 

713 Location to write the file artifacts. 

714 transfer : `str`, optional 

715 Method to use to transfer the artifacts. Must be one of the options 

716 supported by `lsst.resources.ResourcePath.transfer_from()`. 

717 "move" is not allowed. 

718 preserve_path : `bool`, optional 

719 If `True` the full path of the file artifact within the datastore 

720 is preserved. If `False` the final file component of the path 

721 is used. 

722 overwrite : `bool`, optional 

723 If `True` allow transfers to overwrite existing files at the 

724 destination. 

725 

726 Returns 

727 ------- 

728 targets : `list` of `lsst.resources.ResourcePath` 

729 URIs of file artifacts in destination location. Order is not 

730 preserved. 

731 """ 

732 if not destination.isdir():  [732 ↛ 733: condition on line 732 was never true]

733 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

734 

735 # Using getURIs is not feasible since it becomes difficult to 

736 # determine the path within the datastore later on. For now 

737 # follow getURIs implementation approach. 

738 

739 pending = set(refs) 

740 

741 # There is a question as to whether an exception should be raised 

742 # early if some of the refs are missing, or whether files should be 

743 # transferred until a problem is hit. Prefer to complain up front. 

744 # Use the datastore's position in the chain as the dictionary key. 

745 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

746 

747 for number, datastore in enumerate(self.datastores): 

748 if datastore.isEphemeral: 

749 # In the future we will want to distinguish in-memory from 

750 # caching datastore since using an on-disk local 

751 # cache is exactly what we should be doing. 

752 continue 

753 try: 

754 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

755 except NotImplementedError: 

756 # Some datastores may not support retrieving artifacts 

757 continue 

758 

759 if datastore_refs: 

760 grouped_by_datastore[number] = datastore_refs 

761 

762 # Remove these from the pending list so that we do not bother 

763 # looking for them any more. 

764 pending = pending - datastore_refs 

765 

766 if pending:  [766 ↛ 767: condition on line 766 was never true]

767 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

768 

769 # Now do the transfer. 

770 targets: list[ResourcePath] = [] 

771 for number, datastore_refs in grouped_by_datastore.items(): 

772 targets.extend( 

773 self.datastores[number].retrieveArtifacts( 

774 datastore_refs, 

775 destination, 

776 transfer=transfer, 

777 preserve_path=preserve_path, 

778 overwrite=overwrite, 

779 ) 

780 ) 

781 

782 return targets 

783 

784 def remove(self, ref: DatasetRef) -> None: 

785 """Indicate to the datastore that a dataset can be removed. 

786 

787 The dataset will be removed from each datastore. The dataset is 

788 not required to exist in every child datastore. 

789 

790 Parameters 

791 ---------- 

792 ref : `DatasetRef` 

793 Reference to the required dataset. 

794 

795 Raises 

796 ------ 

797 FileNotFoundError 

798 Attempt to remove a dataset that does not exist. Raised if none 

799 of the child datastores removed the dataset. 

800 """ 

801 log.debug("Removing %s", ref) 

802 self.trash(ref, ignore_errors=False) 

803 self.emptyTrash(ignore_errors=False) 

804 

805 def forget(self, refs: Iterable[DatasetRef]) -> None: 

806 for datastore in tuple(self.datastores): 

807 datastore.forget(refs) 

808 

809 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 
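# ``ref`` may be a single DatasetRef or an iterable of them; build a
# label for logging either way, then ask every child to trash what it
# has, tolerating children that do not hold the dataset at all.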

810 if isinstance(ref, DatasetRef): 

811 ref_label = str(ref) 

812 else: 

813 ref_label = "bulk datasets" 

814 

815 log.debug("Trashing %s", ref_label) 

816 

817 counter = 0 

818 for datastore in self.datastores: 

819 try: 

820 datastore.trash(ref, ignore_errors=ignore_errors) 

821 counter += 1 

822 except FileNotFoundError: 

823 pass 

824 

825 if counter == 0: 

826 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

827 if ignore_errors:  [827 ↛ 828: condition on line 827 was never true]

828 log.warning(err_msg) 

829 else: 

830 raise FileNotFoundError(err_msg) 

831 

832 def emptyTrash(self, ignore_errors: bool = True) -> None: 

833 for datastore in self.datastores: 

834 datastore.emptyTrash(ignore_errors=ignore_errors) 

835 

836 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

837 """Retrieve a dataset from an input `Datastore`, 

838 and store the result in this `Datastore`. 

839 

840 Parameters 

841 ---------- 

842 inputDatastore : `Datastore` 

843 The external `Datastore` from which to retrieve the Dataset. 

844 ref : `DatasetRef` 

845 Reference to the required dataset in the input data store. 

846 

847 Notes 

848 ----- 

849 The dataset is retrieved from ``inputDatastore`` and then stored 

850 in every accepting child datastore via `put()`; nothing is 

851 returned. 

852 """ 

853 assert inputDatastore is not self # unless we want it for renames? 

854 inMemoryDataset = inputDatastore.get(ref) 

855 self.put(inMemoryDataset, ref) 

856 

857 def validateConfiguration( 

858 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

859 ) -> None: 

860 """Validate some of the configuration for this datastore. 

861 

862 Parameters 

863 ---------- 

864 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

865 Entities to test against this configuration. Can be differing 

866 types. 

867 logFailures : `bool`, optional 

868 If `True`, output a log message for every validation error 

869 detected. 

870 

871 Raises 

872 ------ 

873 DatastoreValidationError 

874 Raised if there is a validation problem with a configuration. 

875 All the problems are reported in a single exception. 

876 

877 Notes 

878 ----- 

879 This method checks each datastore in turn. 

880 """ 

881 # Need to catch each of the datastore outputs and ensure that 

882 # all are tested. 

883 failures = [] 

884 for datastore in self.datastores: 

885 try: 

886 datastore.validateConfiguration(entities, logFailures=logFailures) 

887 except DatastoreValidationError as e: 

888 if logFailures:  [888 ↛ 890: condition on line 888 was never false]

889 log.critical("Datastore %s failed validation", datastore.name) 

890 failures.append(f"Datastore {self.name}: {e}") 

891 

892 if failures: 

893 msg = ";\n".join(failures) 

894 raise DatastoreValidationError(msg) 

895 

896 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

897 # Docstring is inherited from base class 

898 failures = [] 

899 for datastore in self.datastores: 

900 try: 

901 datastore.validateKey(lookupKey, entity) 

902 except DatastoreValidationError as e: 

903 failures.append(f"Datastore {self.name}: {e}") 

904 

905 if failures: 

906 msg = ";\n".join(failures) 

907 raise DatastoreValidationError(msg) 

908 

909 def getLookupKeys(self) -> set[LookupKey]: 

910 # Docstring is inherited from base class 

911 keys = set() 

912 for datastore in self.datastores: 

913 keys.update(datastore.getLookupKeys()) 

914 

915 keys.update(self.constraints.getLookupKeys()) 

916 for p in self.datastoreConstraints: 

917 if p is not None:  [917 ↛ 916: condition on line 917 was never false]

918 keys.update(p.getLookupKeys()) 

919 

920 return keys 

921 

922 def needs_expanded_data_ids( 

923 self, 

924 transfer: str | None, 

925 entity: DatasetRef | DatasetType | StorageClass | None = None, 

926 ) -> bool: 

927 # Docstring inherited. 

928 # We can't safely use `self.datastoreConstraints` with `entity` to 

929 # check whether a child datastore would even want to ingest this 

930 # dataset, because we don't want to filter out datastores that might 

931 need an expanded data ID based on incomplete information (e.g. we 

932 # pass a StorageClass, but the constraint dispatches on DatasetType). 

933 # So we pessimistically check if any datastore would need an expanded 

934 # data ID for this transfer mode. 

935 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

936 

937 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

938 # Docstring inherited from the base class. 

939 

940 for datastore in self.datastores: 

941 datastore.import_records(data) 

942 

943 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

944 # Docstring inherited from the base class. 

945 

946 all_records: dict[str, DatastoreRecordData] = {} 

947 

948 # Merge all sub-datastore records into one structure 

949 for datastore in self.datastores: 

950 sub_records = datastore.export_records(refs) 

951 for name, record_data in sub_records.items(): 

952 # All datastore names must be unique in a chain. 

953 if name in all_records:  [953 ↛ 954: condition on line 953 was never true]

954 raise ValueError(f"Non-unique datastore name found in datastore {datastore}") 

955 all_records[name] = record_data 

956 

957 return all_records 

958 

959 def export( 

960 self, 

961 refs: Iterable[DatasetRef], 

962 *, 

963 directory: ResourcePathExpression | None = None, 

964 transfer: str | None = "auto", 

965 ) -> Iterable[FileDataset]: 

966 # Docstring inherited from Datastore.export. 

967 if transfer == "auto" and directory is None: 

968 transfer = None 

969 

970 if transfer is not None and directory is None: 

971 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

972 

973 if transfer == "move": 

974 raise TypeError("Can not export by moving files out of datastore.") 

975 

976 # Exporting from a chain has the potential for a dataset to be 

977 # in one or more of the datastores in the chain. We only need one 

978 # of them since we assume the datasets are the same in all (but 

979 # the file format could be different of course since that is a 

980 # per-datastore configuration). 

981 # We also do not know whether any of the datastores in the chain 

982 # support file export. 

983 

984 # Ensure we have an ordered sequence that is not an iterator or set. 

985 if not isinstance(refs, Sequence): 

986 refs = list(refs) 

987 

988 # If any of the datasets are missing entirely we need to raise early 

989 before we try to run the export. This can be a little messy but is 

990 better than exporting files from the first datastore and then finding 

991 that a dataset missing from it is not in any later datastore either. 

992 known = [datastore.knows_these(refs) for datastore in self.datastores] 

993 refs_known: set[DatasetRef] = set() 

994 for known_to_this in known: 

995 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

996 missing_count = len(refs) - len(refs_known) 

997 if missing_count: 

998 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

999 

1000 # To allow us to slot each result into the right place after 

1001 # asking each datastore, create a dict with the index. 

1002 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1003 

1004 # Presize the final export list. 

1005 exported: list[FileDataset | None] = [None] * len(refs) 

1006 

1007 # The order of the returned dataset has to match the order of the 

1008 # given refs, even if they are all from different datastores. 

1009 for i, datastore in enumerate(self.datastores): 

1010 known_to_this = known[i] 

1011 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1012 

1013 try: 

1014 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1015 except NotImplementedError: 

1016 # Try the next datastore. 

1017 continue 

1018 

1019 for ref, export in zip(filtered, this_export): 

1020 # Get the position and also delete it from the list. 

1021 exported[ref_positions.pop(ref)] = export 

1022 

1023 # Every dataset should be accounted for because of the earlier checks 

1024 # but make sure that we did fill all the slots to appease mypy. 

1025 for i, dataset in enumerate(exported): 

1026 if dataset is None:  [1026 ↛ 1027: condition on line 1026 was never true]

1027 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1028 yield dataset 

1029 

1030 def transfer_from( 

1031 self, 

1032 source_datastore: Datastore, 

1033 refs: Iterable[DatasetRef], 

1034 transfer: str = "auto", 

1035 artifact_existence: dict[ResourcePath, bool] | None = None, 

1036 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1037 # Docstring inherited 

1038 # mypy does not understand "type(self) is not type(source)" 

1039 if isinstance(source_datastore, ChainedDatastore): 

1040 # Both the source and destination are chained datastores. 

1041 source_datastores = tuple(source_datastore.datastores) 

1042 else: 

1043 # The source datastore is different, forward everything to the 

1044 # child datastores. 

1045 source_datastores = tuple([source_datastore]) 

1046 

1047 # Need to know the set of all possible refs that could be transferred. 

1048 remaining_refs = set(refs) 

1049 

1050 missing_from_source: set[DatasetRef] | None = None 

1051 all_accepted = set() 

1052 nsuccess = 0 
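# nsuccess counts child-datastore transfers that completed without
# raising; if it is still zero at the end, no child could accept
# anything from this source and a TypeError is raised below.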

1053 for source_child in source_datastores: 

1054 # If we are reading from a chained datastore, it's possible that 

1055 # only a subset of the datastores know about the dataset. We can't 

1056 # ask the receiving datastore to copy it when it doesn't exist 

1057 # so we have to filter again based on what the source datastore 

1058 # understands. 

1059 known_to_source = source_child.knows_these([ref for ref in refs]) 

1060 

1061 # Need to know that there is a possibility that some of these 

1062 # datasets exist but are unknown to the source datastore if 

1063 # trust is enabled. 

1064 if getattr(source_child, "trustGetRequest", False): 

1065 unknown = [ref for ref, known in known_to_source.items() if not known] 

1066 existence = source_child.mexists(unknown, artifact_existence) 

1067 for ref, exists in existence.items(): 

1068 known_to_source[ref] = exists 

1069 

1070 missing = {ref for ref, known in known_to_source.items() if not known} 

1071 if missing: 

1072 if missing_from_source is None: 

1073 missing_from_source = missing 

1074 else: 

1075 missing_from_source &= missing 

1076 

1077 # Try to transfer from each source datastore to each child 

1078 # datastore. Have to make sure we don't transfer something 

1079 # we've already transferred to this destination on later passes. 

1080 

1081 # Filter the initial list based on the datasets we have 

1082 # not yet transferred. 

1083 these_refs = [] 

1084 for ref in refs: 

1085 if ref in remaining_refs and known_to_source[ref]: 

1086 these_refs.append(ref) 

1087 

1088 if not these_refs: 

1089 # Already transferred all datasets known to this datastore. 

1090 continue 

1091 

1092 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

1093 if constraints is not None:  [1093 ↛ 1101: condition on line 1093 was never false]

1094 filtered_refs = [] 

1095 for ref in these_refs: 

1096 if constraints.isAcceptable(ref): 

1097 filtered_refs.append(ref) 

1098 else: 

1099 log.debug("Rejecting ref by constraints: %s", ref) 

1100 else: 

1101 filtered_refs = [ref for ref in these_refs] 

1102 try: 

1103 accepted, _ = datastore.transfer_from( 

1104 source_child, filtered_refs, transfer, artifact_existence 

1105 ) 

1106 except (TypeError, NotImplementedError): 

1107 # The datastores were incompatible. 

1108 continue 

1109 else: 

1110 nsuccess += 1 

1111 

1112 # Remove the accepted datasets from those remaining. 

1113 remaining_refs = remaining_refs - accepted 

1114 

1115 # Keep track of everything we have accepted. 

1116 all_accepted.update(accepted) 

1117 

1118 if missing_from_source: 

1119 for ref in missing_from_source: 

1120 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1121 

1122 if nsuccess == 0:  [1122 ↛ 1123: condition on line 1122 was never true]

1123 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1124 

1125 return all_accepted, remaining_refs