Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%

421 statements  

coverage.py v7.2.7, created at 2023-08-05 01:25 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Chained datastore.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("ChainedDatastore",) 

27 

28import itertools 

29import logging 

30import time 

31import warnings 

32from collections.abc import Iterable, Mapping, Sequence 

33from typing import TYPE_CHECKING, Any 

34 

35from lsst.daf.butler import ( 

36 Constraints, 

37 DatasetRef, 

38 DatasetRefURIs, 

39 DatasetTypeNotSupportedError, 

40 Datastore, 

41 DatastoreConfig, 

42 DatastoreRecordData, 

43 DatastoreValidationError, 

44 FileDataset, 

45) 

46from lsst.resources import ResourcePath 

47from lsst.utils import doImportType 

48 

49if TYPE_CHECKING: 

50 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

51 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

52 from lsst.resources import ResourcePathExpression 

53 

54log = logging.getLogger(__name__) 

55 

56 

57class _IngestPrepData(Datastore.IngestPrepData): 

58 """Helper class for ChainedDatastore ingest implementation. 

59 

60 Parameters 

61 ---------- 

62 children : `list` of `tuple` 

63 Triples of `Datastore`, `IngestPrepData`, and the set of source paths for all child datastores. 

64 """ 

65 

66 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

67 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

68 self.children = children 

69 

70 

71class ChainedDatastore(Datastore): 

72 """Chained Datastores to allow read and writes from multiple datastores. 

73 

74 A ChainedDatastore is configured with multiple datastore configurations. 

75 A ``put()`` is always sent to each datastore. A ``get()`` 

76 operation is sent to each datastore in turn and the first datastore 

77 to return a valid dataset is used. 

78 

79 Parameters 

80 ---------- 

81 config : `DatastoreConfig` or `str` 

82 Configuration. This configuration must include a ``datastores`` field 

83 as a sequence of datastore configurations. The order in this sequence 

84 indicates the order to use for read operations. 

85 bridgeManager : `DatastoreRegistryBridgeManager` 

86 Object that manages the interface between `Registry` and datastores. 

87 butlerRoot : `str`, optional 

88 New datastore root to use to override the configuration value. This 

89 root is sent to each child datastore. 

90 

91 Notes 

92 ----- 

93 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer 

94 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` 

95 and `"hardlink"` if and only if all its child datastores do. 

96 """ 

97 

98 defaultConfigFile = "datastores/chainedDatastore.yaml" 

99 """Path to configuration defaults. Accessed within the ``configs`` resource 

100 or relative to a search path. Can be None if no defaults specified. 

101 """ 

102 

103 containerKey = "datastores" 

104 """Key to specify where child datastores are configured.""" 

105 

106 datastores: list[Datastore] 

107 """All the child datastores known to this datastore.""" 

108 

109 datastoreConstraints: Sequence[Constraints | None] 

110 """Constraints to be applied to each of the child datastores.""" 

111 
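To illustrate the chaining semantics described in the class docstring, here is a minimal self-contained sketch using toy stand-in classes (not the daf_butler API): a put() fans out to every child, and a get() returns the first child's result that does not raise.

class ToyStore:
    def __init__(self, name):
        self.name = name
        self._data = {}

    def put(self, obj, ref):
        self._data[ref] = obj

    def get(self, ref):
        if ref not in self._data:
            raise FileNotFoundError(ref)
        return self._data[ref]

class ToyChain:
    def __init__(self, children):
        self.children = children

    def put(self, obj, ref):
        for child in self.children:      # write goes to every child
            child.put(obj, ref)

    def get(self, ref):
        for child in self.children:      # read stops at the first success
            try:
                return child.get(ref)
            except FileNotFoundError:
                pass
        raise FileNotFoundError(ref)

chain = ToyChain([ToyStore("a"), ToyStore("b")])
chain.put({"x": 1}, "ref1")
assert chain.get("ref1") == {"x": 1}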

112 @classmethod 

113 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

114 """Set any filesystem-dependent config options for child Datastores to 

115 be appropriate for a new empty repository with the given root. 

116 

117 Parameters 

118 ---------- 

119 root : `str` 

120 Filesystem path to the root of the data repository. 

121 config : `Config` 

122 A `Config` to update. Only the subset understood by 

123 this component will be updated. Will not expand 

124 defaults. 

125 full : `Config` 

126 A complete config with all defaults expanded that can be 

127 converted to a `DatastoreConfig`. Read-only and will not be 

128 modified by this method. 

129 Repository-specific options that should not be obtained 

130 from defaults when Butler instances are constructed 

131 should be copied from ``full`` to ``config``. 

132 overwrite : `bool`, optional 

133 If `False`, do not modify a value in ``config`` if the value 

134 already exists. Default is always to overwrite with the provided 

135 ``root``. 

136 

137 Notes 

138 ----- 

139 If a keyword is explicitly defined in the supplied ``config`` it 

140 will not be overridden by this method if ``overwrite`` is `False`. 

141 This allows explicit values set in external configs to be retained. 

142 """ 

143 # Extract the part of the config we care about updating 

144 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

145 

146 # And the subset of the full config that we can use for reference. 

147 # Do not bother with defaults because we are told this already has 

148 # them. 

149 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

150 

151 # Loop over each datastore config and pass the subsets to the 

152 # child datastores to process. 

153 

154 containerKey = cls.containerKey 

155 for idx, (child, fullChild) in enumerate( 

156 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True) 

157 ): 

158 childConfig = DatastoreConfig(child, mergeDefaults=False) 

159 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

160 datastoreClass = doImportType(fullChildConfig["cls"]) 

161 if not issubclass(datastoreClass, Datastore):    [161 ↛ 162: condition on line 161 was never true]

162 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

163 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

164 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

165 

166 # Reattach to parent 

167 datastoreConfig[containerKey, idx] = childConfig 

168 

169 # Reattach modified datastore config to parent 

170 # If this has a datastore key we attach there, otherwise we assume 

171 # this information goes at the top of the config hierarchy. 

172 if DatastoreConfig.component in config: 

173 config[DatastoreConfig.component] = datastoreConfig 

174 else: 

175 config.update(datastoreConfig) 

176 

177 return 

178 
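The per-child root naming used by setConfigRoot() (line 163 above) gives each child "<root>/<class qualname>_<index>" so sibling datastores never share a root. A small sketch, with local stand-in classes rather than real datastore classes:

class FileDatastore:
    pass

class InMemoryDatastore:
    pass

root = "/repo"
child_classes = [FileDatastore, InMemoryDatastore]
child_roots = [f"{root}/{cls.__qualname__}_{idx}" for idx, cls in enumerate(child_classes)]
assert child_roots == ["/repo/FileDatastore_0", "/repo/InMemoryDatastore_1"]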

179 def __init__( 

180 self, 

181 config: Config | ResourcePathExpression, 

182 bridgeManager: DatastoreRegistryBridgeManager, 

183 butlerRoot: str | None = None, 

184 ): 

185 super().__init__(config, bridgeManager) 

186 

187 # Scan for child datastores and instantiate them with the same registry 

188 self.datastores = [] 

189 for c in self.config["datastores"]: 

190 c = DatastoreConfig(c) 

191 datastoreType = doImportType(c["cls"]) 

192 if not issubclass(datastoreType, Datastore):    [192 ↛ 193: condition on line 192 was never true]

193 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

194 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

195 log.debug("Creating child datastore %s", datastore.name) 

196 self.datastores.append(datastore) 

197 

198 # Name ourself based on our children 

199 if self.datastores:    [199 ↛ 204: condition on line 199 was never false]

200 # We must set the names explicitly 

201 self._names = [d.name for d in self.datastores] 

202 childNames = ",".join(self.names) 

203 else: 

204 childNames = f"(empty@{time.time()})" 

205 self._names = [childNames] 

206 self.name = f"{type(self).__qualname__}[{childNames}]" 

207 

208 # We declare we are ephemeral if all our child datastores declare 

209 # they are ephemeral 

210 isEphemeral = True 

211 for d in self.datastores: 

212 if not d.isEphemeral: 

213 isEphemeral = False 

214 break 

215 self.isEphemeral = isEphemeral 

216 

217 # per-datastore override constraints 

218 if "datastore_constraints" in self.config: 

219 overrides = self.config["datastore_constraints"] 

220 

221 if len(overrides) != len(self.datastores):    [221 ↛ 222: condition on line 221 was never true]

222 raise DatastoreValidationError( 

223 f"Number of registered datastores ({len(self.datastores)})" 

224 " differs from number of constraints overrides" 

225 f" {len(overrides)}" 

226 ) 

227 

228 self.datastoreConstraints = [ 

229 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

230 ] 

231 

232 else: 

233 self.datastoreConstraints = (None,) * len(self.datastores) 

234 

235 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

236 
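A sketch of the configuration shape that __init__ reads, written as a plain Python dict (the real configuration is YAML-backed). Only the keys used by the code above ("cls", "datastores", "datastore_constraints", "constraints") come from the source; the child class paths are illustrative.

chained_config = {
    "cls": "lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore",
    "datastores": [
        {"cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore"},
        {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"},
    ],
    # Optional; must have exactly one entry per child datastore, in order.
    "datastore_constraints": [
        {"constraints": None},   # an absent/None value is passed straight to Constraints()
        {"constraints": None},
    ],
}
assert len(chained_config["datastore_constraints"]) == len(chained_config["datastores"])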

237 @property 

238 def names(self) -> tuple[str, ...]: 

239 return tuple(self._names) 

240 

241 @property 

242 def roots(self) -> dict[str, ResourcePath | None]: 

243 # Docstring inherited. 

244 roots = {} 

245 for datastore in self.datastores: 

246 roots.update(datastore.roots) 

247 return roots 

248 

249 def __str__(self) -> str: 

250 chainName = ", ".join(str(ds) for ds in self.datastores) 

251 return chainName 

252 

253 def knows(self, ref: DatasetRef) -> bool: 

254 """Check if the dataset is known to any of the datastores. 

255 

256 Does not check for existence of any artifact. 

257 

258 Parameters 

259 ---------- 

260 ref : `DatasetRef` 

261 Reference to the required dataset. 

262 

263 Returns 

264 ------- 

265 exists : `bool` 

266 `True` if the dataset is known to the datastore. 

267 """ 

268 for datastore in self.datastores: 

269 if datastore.knows(ref): 

270 log.debug("%s known to datastore %s", ref, datastore.name) 

271 return True 

272 return False 

273 

274 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

275 # Docstring inherited from the base class. 

276 refs_known: dict[DatasetRef, bool] = {} 

277 for datastore in self.datastores: 

278 refs_known.update(datastore.knows_these(refs)) 

279 

280 # No need to check in next datastore for refs that are known. 

281 # We only update entries that were initially False. 

282 refs = [ref for ref, known in refs_known.items() if not known] 

283 

284 return refs_known 

285 
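The narrowing loop shared by knows_these() and mexists() only re-queries the refs that are still unknown, so a later child can never overwrite an earlier positive answer. A minimal sketch with plain dicts standing in for child datastores:

def chained_lookup(children, refs):
    known = {}
    for child in children:                    # child: mapping of ref -> bool
        known.update({r: child.get(r, False) for r in refs})
        refs = [r for r in refs if not known[r]]   # only ask about the rest
    return known

stores = [{"a": True}, {"b": True}]
assert chained_lookup(stores, ["a", "b", "c"]) == {"a": True, "b": True, "c": False}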

286 def mexists( 

287 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

288 ) -> dict[DatasetRef, bool]: 

289 """Check the existence of multiple datasets at once. 

290 

291 Parameters 

292 ---------- 

293 refs : iterable of `DatasetRef` 

294 The datasets to be checked. 

295 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

296 Optional mapping of datastore artifact to existence. Updated by 

297 this method with details of all artifacts tested. Can be `None` 

298 if the caller is not interested. 

299 

300 Returns 

301 ------- 

302 existence : `dict` of [`DatasetRef`, `bool`] 

303 Mapping from dataset to boolean indicating existence in any 

304 of the child datastores. 

305 """ 

306 dataset_existence: dict[DatasetRef, bool] = {} 

307 for datastore in self.datastores: 

308 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

309 

310 # For next datastore no point asking about ones we know 

311 # exist already. No special exemption for ephemeral datastores. 

312 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

313 

314 return dataset_existence 

315 

316 def exists(self, ref: DatasetRef) -> bool: 

317 """Check if the dataset exists in one of the datastores. 

318 

319 Parameters 

320 ---------- 

321 ref : `DatasetRef` 

322 Reference to the required dataset. 

323 

324 Returns 

325 ------- 

326 exists : `bool` 

327 `True` if the entity exists in one of the child datastores. 

328 """ 

329 for datastore in self.datastores: 

330 if datastore.exists(ref): 

331 log.debug("Found %s in datastore %s", ref, datastore.name) 

332 return True 

333 return False 

334 

335 def get( 

336 self, 

337 ref: DatasetRef, 

338 parameters: Mapping[str, Any] | None = None, 

339 storageClass: StorageClass | str | None = None, 

340 ) -> Any: 

341 """Load an InMemoryDataset from the store. 

342 

343 The dataset is returned from the first datastore that has 

344 the dataset. 

345 

346 Parameters 

347 ---------- 

348 ref : `DatasetRef` 

349 Reference to the required Dataset. 

350 parameters : `dict` 

351 `StorageClass`-specific parameters that specify, for example, 

352 a slice of the dataset to be loaded. 

353 storageClass : `StorageClass` or `str`, optional 

354 The storage class to be used to override the Python type 

355 returned by this method. By default the returned type matches 

356 the dataset type definition for this dataset. Specifying a 

357 read `StorageClass` can force a different type to be returned. 

358 This type must be compatible with the original type. 

359 

360 Returns 

361 ------- 

362 inMemoryDataset : `object` 

363 Requested dataset or slice thereof as an InMemoryDataset. 

364 

365 Raises 

366 ------ 

367 FileNotFoundError 

368 Requested dataset can not be retrieved. 

369 TypeError 

370 Return value from formatter has unexpected type. 

371 ValueError 

372 Formatter failed to process the dataset. 

373 """ 

374 for datastore in self.datastores: 

375 try: 

376 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

377 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

378 return inMemoryObject 

379 except FileNotFoundError: 

380 pass 

381 

382 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

383 

384 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

385 """Write a InMemoryDataset with a given `DatasetRef` to each 

386 datastore. 

387 

388 The put() to child datastores can fail with 

389 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

390 deemed to have succeeded so long as at least one child datastore 

391 accepted the inMemoryDataset. 

392 

393 Parameters 

394 ---------- 

395 inMemoryDataset : `object` 

396 The dataset to store. 

397 ref : `DatasetRef` 

398 Reference to the associated Dataset. 

399 

400 Raises 

401 ------ 

402 TypeError 

403 Supplied object and storage class are inconsistent. 

404 DatasetTypeNotSupportedError 

405 All datastores reported `DatasetTypeNotSupportedError`. 

406 """ 

407 log.debug("Put %s", ref) 

408 

409 # Confirm that we can accept this dataset 

410 if not self.constraints.isAcceptable(ref): 

411 # Raise rather than use boolean return value. 

412 raise DatasetTypeNotSupportedError( 

413 f"Dataset {ref} has been rejected by this datastore via configuration." 

414 ) 

415 

416 isPermanent = False 

417 nsuccess = 0 

418 npermanent = 0 

419 nephemeral = 0 

420 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

421 if ( 

422 constraints is not None and not constraints.isAcceptable(ref) 

423 ) or not datastore.constraints.isAcceptable(ref): 

424 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

425 continue 

426 

427 if datastore.isEphemeral: 

428 nephemeral += 1 

429 else: 

430 npermanent += 1 

431 try: 

432 datastore.put(inMemoryDataset, ref) 

433 nsuccess += 1 

434 if not datastore.isEphemeral: 

435 isPermanent = True 

436 except DatasetTypeNotSupportedError: 

437 pass 

438 

439 if nsuccess == 0: 

440 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

441 

442 if not isPermanent and npermanent > 0:    [442 ↛ 443: condition on line 442 was never true]

443 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

444 

445 if self._transaction is not None: 

446 self._transaction.registerUndo("put", self.remove, ref) 

447 
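A sketch of the accounting performed by put(), with a plain callable standing in for the real Constraints API: children whose constraints reject the ref are skipped, and the put only counts as successful if at least one child accepted the dataset.

def chained_put(children, ref):
    nsuccess = 0
    for accepts, store in children:           # accepts: callable(ref) -> bool
        if not accepts(ref):
            continue                          # skipped via configuration
        store.append(ref)
        nsuccess += 1
    if nsuccess == 0:
        # LookupError used here in place of DatasetTypeNotSupportedError.
        raise LookupError(f"No child datastore accepted {ref!r}")
    return nsuccess

raw, calexp = [], []
children = [(lambda r: r.endswith("_raw"), raw), (lambda r: True, calexp)]
assert chained_put(children, "exposure_raw") == 2
assert chained_put(children, "coadd") == 1 and raw == ["exposure_raw"]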

448 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

449 # Docstring inherited from base class. 

450 if transfer != "auto": 

451 return transfer 

452 # Ask each datastore what they think auto means 

453 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

454 

455 # Remove any untranslated "auto" values 

456 transfers.discard(transfer) 

457 

458 if len(transfers) == 1:    [458 ↛ 459: condition on line 458 was never true]

459 return transfers.pop() 

460 if not transfers:    [460 ↛ 464: condition on line 460 was never false]

461 # Everything reported "auto" 

462 return transfer 

463 

464 raise RuntimeError( 

465 "Chained datastore does not yet support different transfer modes" 

466 f" from 'auto' in each child datastore (wanted {transfers})" 

467 ) 

468 
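A sketch of the "auto" resolution rule in _overrideTransferMode(): collect each child's interpretation, discard answers that are still "auto", and require whatever remains to agree.

def resolve_auto(child_answers, transfer="auto"):
    modes = set(child_answers)
    modes.discard(transfer)                  # drop untranslated "auto" answers
    if len(modes) == 1:
        return modes.pop()
    if not modes:
        return transfer                      # every child reported "auto"
    raise RuntimeError(f"Children disagree on transfer mode: {modes}")

assert resolve_auto(["auto", "copy"]) == "copy"
assert resolve_auto(["auto", "auto"]) == "auto"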

469 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

470 # Docstring inherited from Datastore._prepIngest. 

471 if transfer is None: 

472 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

473 

474 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

475 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

476 if not acceptable: 

477 log.debug( 

478 "Datastore %s skipping ingest via configuration for refs %s", 

479 name, 

480 ", ".join(str(ref) for ref in dataset.refs), 

481 ) 

482 return False 

483 else: 

484 return True 

485 

486 # Filter down to just datasets the chained datastore's own 

487 # configuration accepts. 

488 okForParent: list[FileDataset] = [ 

489 dataset 

490 for dataset in datasets 

491 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

492 ] 

493 

494 # Iterate over nested datastores and call _prepIngest on each. 

495 # Save the results to a list: 

496 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

497 # ...and remember whether all of the failures are due to 

498 # NotImplementedError being raised. 

499 allFailuresAreNotImplementedError = True 

500 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

501 okForChild: list[FileDataset] 

502 if constraints is not None: 

503 okForChild = [ 

504 dataset 

505 for dataset in okForParent 

506 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

507 ] 

508 else: 

509 okForChild = okForParent 

510 try: 

511 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

512 except NotImplementedError: 

513 log.debug( 

514 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

515 datastore.name, 

516 transfer, 

517 ) 

518 continue 

519 allFailuresAreNotImplementedError = False 

520 if okForChild: 

521 # Do not store for later if a datastore has rejected 

522 # everything. 

523 # Include the source paths if this is a "move". It's clearer 

524 # to find the paths now rather than try to infer how 

525 # each datastore has stored them in the internal prep class. 

526 paths = ( 

527 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set() 

528 ) 

529 children.append((datastore, prepDataForChild, paths)) 

530 if allFailuresAreNotImplementedError: 

531 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

532 return _IngestPrepData(children=children) 

533 

534 def _finishIngest( 

535 self, 

536 prepData: _IngestPrepData, 

537 *, 

538 transfer: str | None = None, 

539 record_validation_info: bool = True, 

540 ) -> None: 

541 # Docstring inherited from Datastore._finishIngest. 

542 # For "move" we must use "copy" and then delete the input 

543 # data at the end. This has no rollback option if the ingest 

544 # subsequently fails. If there is only one active datastore 

545 # accepting any files we can leave it as "move" 

546 actual_transfer: str | None 

547 if transfer == "move" and len(prepData.children) > 1: 

548 actual_transfer = "copy" 

549 else: 

550 actual_transfer = transfer 

551 to_be_deleted: set[ResourcePath] = set() 

552 for datastore, prepDataForChild, paths in prepData.children: 

553 datastore._finishIngest( 

554 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

555 ) 

556 to_be_deleted.update(paths) 

557 if actual_transfer != transfer: 

558 # These datasets were copied but now need to be deleted. 

559 # This can not be rolled back. 

560 for uri in to_be_deleted: 

561 uri.remove() 

562 
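A sketch of the "move" handling in _finishIngest(): with more than one accepting child the transfer is downgraded to "copy", and the original source files are removed only after every child has ingested them (the paths below are illustrative).

def plan_move(n_children, transfer, source_paths):
    actual = "copy" if transfer == "move" and n_children > 1 else transfer
    # Source files are deleted afterwards only when the mode was downgraded.
    delete_after = set(source_paths) if actual != transfer else set()
    return actual, delete_after

assert plan_move(2, "move", ["/tmp/a.fits"]) == ("copy", {"/tmp/a.fits"})
assert plan_move(1, "move", ["/tmp/a.fits"]) == ("move", set())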

563 def getManyURIs( 

564 self, 

565 refs: Iterable[DatasetRef], 

566 predict: bool = False, 

567 allow_missing: bool = False, 

568 ) -> dict[DatasetRef, DatasetRefURIs]: 

569 # Docstring inherited 

570 

571 uris: dict[DatasetRef, DatasetRefURIs] = {} 

572 missing_refs = set(refs) 

573 

574 # If predict is True we don't want to predict a dataset in the first 

575 # datastore if it actually exists in a later datastore, so in that 

576 # case check all datastores with predict=False first, and then try 

577 # again with predict=True. 

578 for p in (False, True) if predict else (False,): 

579 if not missing_refs: 

580 break 

581 for datastore in self.datastores: 

582 try: 

583 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

584 except NotImplementedError: 

585 # some datastores may not implement generating URIs 

586 continue 

587 missing_refs -= got_uris.keys() 

588 uris.update(got_uris) 

589 if not missing_refs: 

590 break 

591 

592 if missing_refs and not allow_missing: 

593 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

594 

595 return uris 

596 

597 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

598 """Return URIs associated with dataset. 

599 

600 Parameters 

601 ---------- 

602 ref : `DatasetRef` 

603 Reference to the required dataset. 

604 predict : `bool`, optional 

605 If the datastore does not know about the dataset, should it 

606 return a predicted URI or not? 

607 

608 Returns 

609 ------- 

610 uris : `DatasetRefURIs` 

611 The URI to the primary artifact associated with this dataset (if 

612 the dataset was disassembled within the datastore this may be 

613 `None`), and the URIs to any components associated with the dataset 

614 artifact (can be empty if there are no components). 

615 

616 Notes 

617 ----- 

618 The returned URI is from the first datastore in the list that has 

619 the dataset, with preference given to permanent datastores over 

620 ephemeral ones. If no datastore has the dataset and prediction is 

621 allowed, the predicted URI from the first datastore in the list 

622 will be returned. 

623 """ 

624 log.debug("Requesting URIs for %s", ref) 

625 predictedUri: DatasetRefURIs | None = None 

626 predictedEphemeralUri: DatasetRefURIs | None = None 

627 firstEphemeralUri: DatasetRefURIs | None = None 

628 for datastore in self.datastores: 

629 if datastore.exists(ref): 

630 if not datastore.isEphemeral: 

631 uri = datastore.getURIs(ref) 

632 log.debug("Retrieved non-ephemeral URI: %s", uri) 

633 return uri 

634 elif not firstEphemeralUri: 

635 firstEphemeralUri = datastore.getURIs(ref) 

636 elif predict: 

637 if not predictedUri and not datastore.isEphemeral: 

638 predictedUri = datastore.getURIs(ref, predict) 

639 elif not predictedEphemeralUri and datastore.isEphemeral: 

640 predictedEphemeralUri = datastore.getURIs(ref, predict) 

641 

642 if firstEphemeralUri: 

643 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

644 return firstEphemeralUri 

645 

646 if predictedUri: 

647 log.debug("Retrieved predicted URI: %s", predictedUri) 

648 return predictedUri 

649 

650 if predictedEphemeralUri: 

651 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

652 return predictedEphemeralUri 

653 

654 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 

655 
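A sketch of the preference order implemented by getURIs(): an existing artifact in a permanent child wins, then an existing ephemeral one, then a predicted permanent URI, then a predicted ephemeral URI. The (exists, is_ephemeral, uri) tuples below are a simplification of the real DatasetRefURIs handling.

def pick_uri(candidates):
    # candidates: list of (exists, is_ephemeral, uri) in chain order
    ranked = {"existing-permanent": None, "existing-ephemeral": None,
              "predicted-permanent": None, "predicted-ephemeral": None}
    for exists, ephemeral, uri in candidates:
        key = f"{'existing' if exists else 'predicted'}-{'ephemeral' if ephemeral else 'permanent'}"
        if ranked[key] is None:              # first datastore in a category wins
            ranked[key] = uri
    for key in ranked:                       # categories checked in preference order
        if ranked[key] is not None:
            return ranked[key]
    raise FileNotFoundError("not in any datastore")

assert pick_uri([(False, False, "file://predicted"), (True, True, "mem://x")]) == "mem://x"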

656 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

657 """URI to the Dataset. 

658 

659 The returned URI is from the first datastore in the list that has 

660 the dataset, with preference given to permanent datastores over 

661 ephemeral ones. If no datastore has the dataset and prediction is 

662 allowed, the predicted URI from the first datastore in the list 

663 will be returned. 

664 

665 Parameters 

666 ---------- 

667 ref : `DatasetRef` 

668 Reference to the required Dataset. 

669 predict : `bool` 

670 If `True`, allow URIs to be returned of datasets that have not 

671 been written. 

672 

673 Returns 

674 ------- 

675 uri : `lsst.resources.ResourcePath` 

676 URI pointing to the dataset within the datastore. If the 

677 dataset does not exist in the datastore, and if ``predict`` is 

678 `True`, the URI will be a prediction and will include a URI 

679 fragment "#predicted". 

680 

681 Notes 

682 ----- 

683 If the datastore does not have entities that relate well 

684 to the concept of a URI the returned URI string will be 

685 descriptive. The returned URI is not guaranteed to be obtainable. 

686 

687 Raises 

688 ------ 

689 FileNotFoundError 

690 A URI has been requested for a dataset that does not exist and 

691 guessing is not allowed. 

692 RuntimeError 

693 Raised if a request is made for a single URI but multiple URIs 

694 are associated with this dataset. 

695 """ 

696 log.debug("Requesting URI for %s", ref) 

697 primary, components = self.getURIs(ref, predict) 

698 if primary is None or components:    [698 ↛ 699: condition on line 698 was never true]

699 raise RuntimeError( 

700 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

701 ) 

702 return primary 

703 

704 def retrieveArtifacts( 

705 self, 

706 refs: Iterable[DatasetRef], 

707 destination: ResourcePath, 

708 transfer: str = "auto", 

709 preserve_path: bool = True, 

710 overwrite: bool = False, 

711 ) -> list[ResourcePath]: 

712 """Retrieve the file artifacts associated with the supplied refs. 

713 

714 Parameters 

715 ---------- 

716 refs : iterable of `DatasetRef` 

717 The datasets for which file artifacts are to be retrieved. 

718 A single ref can result in multiple files. The refs must 

719 be resolved. 

720 destination : `lsst.resources.ResourcePath` 

721 Location to write the file artifacts. 

722 transfer : `str`, optional 

723 Method to use to transfer the artifacts. Must be one of the options 

724 supported by `lsst.resources.ResourcePath.transfer_from()`. 

725 "move" is not allowed. 

726 preserve_path : `bool`, optional 

727 If `True` the full path of the file artifact within the datastore 

728 is preserved. If `False` the final file component of the path 

729 is used. 

730 overwrite : `bool`, optional 

731 If `True` allow transfers to overwrite existing files at the 

732 destination. 

733 

734 Returns 

735 ------- 

736 targets : `list` of `lsst.resources.ResourcePath` 

737 URIs of file artifacts in destination location. Order is not 

738 preserved. 

739 """ 

740 if not destination.isdir():    [740 ↛ 741: condition on line 740 was never true]

741 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

742 

743 # Using getURIs is not feasible since it becomes difficult to 

744 # determine the path within the datastore later on. For now 

745 # follow getURIs implementation approach. 

746 

747 pending = set(refs) 

748 

749 # There is a question as to whether an exception should be raised 

750 # early if some of the refs are missing, or whether files should be 

751 # transferred until a problem is hit. Prefer to complain up front. 

752 # Use the datastore integer as primary key. 

753 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

754 

755 for number, datastore in enumerate(self.datastores): 

756 if datastore.isEphemeral: 

757 # In the future we will want to distinguish in-memory from 

758 # caching datastore since using an on-disk local 

759 # cache is exactly what we should be doing. 

760 continue 

761 try: 

762 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

763 except NotImplementedError: 

764 # Some datastores may not support retrieving artifacts 

765 continue 

766 

767 if datastore_refs: 

768 grouped_by_datastore[number] = datastore_refs 

769 

770 # Remove these from the pending list so that we do not bother 

771 # looking for them any more. 

772 pending = pending - datastore_refs 

773 

774 if pending:    [774 ↛ 775: condition on line 774 was never true]

775 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

776 

777 # Now do the transfer. 

778 targets: list[ResourcePath] = [] 

779 for number, datastore_refs in grouped_by_datastore.items(): 

780 targets.extend( 

781 self.datastores[number].retrieveArtifacts( 

782 datastore_refs, 

783 destination, 

784 transfer=transfer, 

785 preserve_path=preserve_path, 

786 overwrite=overwrite, 

787 ) 

788 ) 

789 

790 return targets 

791 
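A sketch of the grouping pass in retrieveArtifacts(): each non-ephemeral child claims the pending refs it holds, claimed refs leave the pending set, and anything still pending afterwards raises before any transfer begins.

def group_by_datastore(children, refs):
    pending = set(refs)
    grouped = {}
    for number, holds in enumerate(children):     # holds: set of refs the child has
        claimed = pending & holds
        if claimed:
            grouped[number] = claimed
            pending -= claimed                    # do not look for these again
    if pending:
        raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
    return grouped

assert group_by_datastore([{"a"}, {"b", "a"}], ["a", "b"]) == {0: {"a"}, 1: {"b"}}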

792 def remove(self, ref: DatasetRef) -> None: 

793 """Indicate to the datastore that a dataset can be removed. 

794 

795 The dataset will be removed from each datastore. The dataset is 

796 not required to exist in every child datastore. 

797 

798 Parameters 

799 ---------- 

800 ref : `DatasetRef` 

801 Reference to the required dataset. 

802 

803 Raises 

804 ------ 

805 FileNotFoundError 

806 Attempt to remove a dataset that does not exist. Raised if none 

807 of the child datastores removed the dataset. 

808 """ 

809 log.debug("Removing %s", ref) 

810 self.trash(ref, ignore_errors=False) 

811 self.emptyTrash(ignore_errors=False) 

812 

813 def forget(self, refs: Iterable[DatasetRef]) -> None: 

814 for datastore in tuple(self.datastores): 

815 datastore.forget(refs) 

816 

817 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

818 if isinstance(ref, DatasetRef): 

819 ref_label = str(ref) 

820 else: 

821 ref_label = "bulk datasets" 

822 

823 log.debug("Trashing %s", ref_label) 

824 

825 counter = 0 

826 for datastore in self.datastores: 

827 try: 

828 datastore.trash(ref, ignore_errors=ignore_errors) 

829 counter += 1 

830 except FileNotFoundError: 

831 pass 

832 

833 if counter == 0: 

834 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

835 if ignore_errors:    [835 ↛ 836: condition on line 835 was never true]

836 log.warning(err_msg) 

837 else: 

838 raise FileNotFoundError(err_msg) 

839 

840 def emptyTrash(self, ignore_errors: bool = True) -> None: 

841 for datastore in self.datastores: 

842 datastore.emptyTrash(ignore_errors=ignore_errors) 

843 

844 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

845 """Retrieve a dataset from an input `Datastore`, 

846 and store the result in this `Datastore`. 

847 

848 Parameters 

849 ---------- 

850 inputDatastore : `Datastore` 

851 The external `Datastore` from which to retrieve the Dataset. 

852 ref : `DatasetRef` 

853 Reference to the required dataset in the input data store. 

854 

855 Returns 

856 ------- 

857 results : `list` 

858 List containing the return value from the ``put()`` to each 

859 child datastore. 

860 """ 

861 assert inputDatastore is not self # unless we want it for renames? 

862 inMemoryDataset = inputDatastore.get(ref) 

863 self.put(inMemoryDataset, ref) 

864 

865 def validateConfiguration( 

866 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

867 ) -> None: 

868 """Validate some of the configuration for this datastore. 

869 

870 Parameters 

871 ---------- 

872 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

873 Entities to test against this configuration. Can be differing 

874 types. 

875 logFailures : `bool`, optional 

876 If `True`, output a log message for every validation error 

877 detected. 

878 

879 Raises 

880 ------ 

881 DatastoreValidationError 

882 Raised if there is a validation problem with a configuration. 

883 All the problems are reported in a single exception. 

884 

885 Notes 

886 ----- 

887 This method checks each datastore in turn. 

888 """ 

889 # Need to catch each of the datastore outputs and ensure that 

890 # all are tested. 

891 failures = [] 

892 for datastore in self.datastores: 

893 try: 

894 datastore.validateConfiguration(entities, logFailures=logFailures) 

895 except DatastoreValidationError as e: 

896 if logFailures:    [896 ↛ 898: condition on line 896 was never false]

897 log.critical("Datastore %s failed validation", datastore.name) 

898 failures.append(f"Datastore {self.name}: {e}") 

899 

900 if failures: 

901 msg = ";\n".join(failures) 

902 raise DatastoreValidationError(msg) 

903 

904 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

905 # Docstring is inherited from base class 

906 failures = [] 

907 for datastore in self.datastores: 

908 try: 

909 datastore.validateKey(lookupKey, entity) 

910 except DatastoreValidationError as e: 

911 failures.append(f"Datastore {self.name}: {e}") 

912 

913 if failures: 

914 msg = ";\n".join(failures) 

915 raise DatastoreValidationError(msg) 

916 

917 def getLookupKeys(self) -> set[LookupKey]: 

918 # Docstring is inherited from base class 

919 keys = set() 

920 for datastore in self.datastores: 

921 keys.update(datastore.getLookupKeys()) 

922 

923 keys.update(self.constraints.getLookupKeys()) 

924 for p in self.datastoreConstraints: 

925 if p is not None:    [925 ↛ 924: condition on line 925 was never false]

926 keys.update(p.getLookupKeys()) 

927 

928 return keys 

929 

930 def needs_expanded_data_ids( 

931 self, 

932 transfer: str | None, 

933 entity: DatasetRef | DatasetType | StorageClass | None = None, 

934 ) -> bool: 

935 # Docstring inherited. 

936 # We can't safely use `self.datastoreConstraints` with `entity` to 

937 # check whether a child datastore would even want to ingest this 

938 # dataset, because we don't want to filter out datastores that might 

939 # need an expanded data ID based in incomplete information (e.g. we 

940 # pass a StorageClass, but the constraint dispatches on DatasetType). 

941 # So we pessimistically check if any datastore would need an expanded 

942 # data ID for this transfer mode. 

943 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

944 

945 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

946 # Docstring inherited from the base class. 

947 

948 for datastore in self.datastores: 

949 datastore.import_records(data) 

950 

951 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

952 # Docstring inherited from the base class. 

953 

954 all_records: dict[str, DatastoreRecordData] = {} 

955 

956 # Merge all sub-datastore records into one structure 

957 for datastore in self.datastores: 

958 sub_records = datastore.export_records(refs) 

959 for name, record_data in sub_records.items(): 

960 # All datastore names must be unique in a chain. 

961 if name in all_records:    [961 ↛ 962: condition on line 961 was never true]

962 raise ValueError(f"Non-unique datastore name found in datastore {datastore}") 

963 all_records[name] = record_data 

964 

965 return all_records 

966 

967 def export( 

968 self, 

969 refs: Iterable[DatasetRef], 

970 *, 

971 directory: ResourcePathExpression | None = None, 

972 transfer: str | None = "auto", 

973 ) -> Iterable[FileDataset]: 

974 # Docstring inherited from Datastore.export. 

975 if transfer == "auto" and directory is None: 

976 transfer = None 

977 

978 if transfer is not None and directory is None: 

979 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

980 

981 if transfer == "move": 

982 raise TypeError("Can not export by moving files out of datastore.") 

983 

984 # Exporting from a chain has the potential for a dataset to be 

985 # in one or more of the datastores in the chain. We only need one 

986 # of them since we assume the datasets are the same in all (but 

987 # the file format could be different of course since that is a 

988 # per-datastore configuration). 

989 # We also do not know whether any of the datastores in the chain 

990 # support file export. 

991 

992 # Ensure we have an ordered sequence that is not an iterator or set. 

993 if not isinstance(refs, Sequence): 

994 refs = list(refs) 

995 

996 # If any of the datasets are missing entirely we need to raise early 

997 # before we try to run the export. This can be a little messy but is 

998 # better than exporting files from the first datastore and then finding 

999 # that one is missing but is not in the second datastore either. 

1000 known = [datastore.knows_these(refs) for datastore in self.datastores] 

1001 refs_known: set[DatasetRef] = set() 

1002 for known_to_this in known: 

1003 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

1004 missing_count = len(refs) - len(refs_known) 

1005 if missing_count: 

1006 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1007 

1008 # To allow us to slot each result into the right place after 

1009 # asking each datastore, create a dict with the index. 

1010 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1011 

1012 # Presize the final export list. 

1013 exported: list[FileDataset | None] = [None] * len(refs) 

1014 

1015 # The order of the returned dataset has to match the order of the 

1016 # given refs, even if they are all from different datastores. 

1017 for i, datastore in enumerate(self.datastores): 

1018 known_to_this = known[i] 

1019 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1020 

1021 try: 

1022 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1023 except NotImplementedError: 

1024 # Try the next datastore. 

1025 continue 

1026 

1027 for ref, export in zip(filtered, this_export, strict=True): 

1028 # Get the position and also delete it from the list. 

1029 exported[ref_positions.pop(ref)] = export 

1030 

1031 # Every dataset should be accounted for because of the earlier checks 

1032 # but make sure that we did fill all the slots to appease mypy. 

1033 for i, dataset in enumerate(exported): 

1034 if dataset is None:    [1034 ↛ 1035: condition on line 1034 was never true]

1035 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1036 yield dataset 

1037 
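A sketch of the bookkeeping export() uses to return results in the caller's order even when different children export different refs: record each ref's position up front, fill a presized list as children answer, and pop positions so each ref is exported only once.

refs = ["r1", "r2", "r3"]
positions = {ref: i for i, ref in enumerate(refs)}
exported = [None] * len(refs)

# Children may answer in any order; positions keep the output aligned to refs.
child_answers = [("r2", "export-of-r2"), ("r1", "export-of-r1"), ("r3", "export-of-r3")]
for ref, result in child_answers:
    exported[positions.pop(ref)] = result

assert exported == ["export-of-r1", "export-of-r2", "export-of-r3"]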

1038 def transfer_from( 

1039 self, 

1040 source_datastore: Datastore, 

1041 refs: Iterable[DatasetRef], 

1042 transfer: str = "auto", 

1043 artifact_existence: dict[ResourcePath, bool] | None = None, 

1044 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1045 # Docstring inherited 

1046 # mypy does not understand "type(self) is not type(source)" 

1047 if isinstance(source_datastore, ChainedDatastore): 

1048 # Both the source and destination are chained datastores. 

1049 source_datastores = tuple(source_datastore.datastores) 

1050 else: 

1051 # The source datastore is different, forward everything to the 

1052 # child datastores. 

1053 source_datastores = tuple([source_datastore]) 

1054 

1055 # Need to know the set of all possible refs that could be transferred. 

1056 remaining_refs = set(refs) 

1057 

1058 missing_from_source: set[DatasetRef] | None = None 

1059 all_accepted = set() 

1060 nsuccess = 0 

1061 for source_child in source_datastores: 

1062 # If we are reading from a chained datastore, it's possible that 

1063 # only a subset of the datastores know about the dataset. We can't 

1064 # ask the receiving datastore to copy it when it doesn't exist 

1065 # so we have to filter again based on what the source datastore 

1066 # understands. 

1067 known_to_source = source_child.knows_these([ref for ref in refs]) 

1068 

1069 # Need to know that there is a possibility that some of these 

1070 # datasets exist but are unknown to the source datastore if 

1071 # trust is enabled. 

1072 if getattr(source_child, "trustGetRequest", False): 

1073 unknown = [ref for ref, known in known_to_source.items() if not known] 

1074 existence = source_child.mexists(unknown, artifact_existence) 

1075 for ref, exists in existence.items(): 

1076 known_to_source[ref] = exists 

1077 

1078 missing = {ref for ref, known in known_to_source.items() if not known} 

1079 if missing: 

1080 if missing_from_source is None: 

1081 missing_from_source = missing 

1082 else: 

1083 missing_from_source &= missing 

1084 

1085 # Try to transfer from each source datastore to each child 

1086 # datastore. Have to make sure we don't transfer something 

1087 # we've already transferred to this destination on later passes. 

1088 

1089 # Filter the initial list based on the datasets we have 

1090 # not yet transferred. 

1091 these_refs = [] 

1092 for ref in refs: 

1093 if ref in remaining_refs and known_to_source[ref]: 

1094 these_refs.append(ref) 

1095 

1096 if not these_refs: 

1097 # Already transferred all datasets known to this datastore. 

1098 continue 

1099 

1100 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

1101 if constraints is not None:    [1101 ↛ 1109: condition on line 1101 was never false]

1102 filtered_refs = [] 

1103 for ref in these_refs: 

1104 if constraints.isAcceptable(ref): 

1105 filtered_refs.append(ref) 

1106 else: 

1107 log.debug("Rejecting ref by constraints: %s", ref) 

1108 else: 

1109 filtered_refs = [ref for ref in these_refs] 

1110 try: 

1111 accepted, _ = datastore.transfer_from( 

1112 source_child, filtered_refs, transfer, artifact_existence 

1113 ) 

1114 except (TypeError, NotImplementedError): 

1115 # The datastores were incompatible. 

1116 continue 

1117 else: 

1118 nsuccess += 1 

1119 

1120 # Remove the accepted datasets from those remaining. 

1121 remaining_refs = remaining_refs - accepted 

1122 

1123 # Keep track of everything we have accepted. 

1124 all_accepted.update(accepted) 

1125 

1126 if missing_from_source: 

1127 for ref in missing_from_source: 

1128 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1129 

1130 if nsuccess == 0:    [1130 ↛ 1131: condition on line 1130 was never true]

1131 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1132 

1133 return all_accepted, remaining_refs
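
A sketch of the set accounting in transfer_from(): each child removes the refs it accepts from the remaining set, the union of acceptances accumulates, and both sets are returned to the caller (the callables below stand in for child datastores).

def chained_transfer(children, refs):
    remaining = set(refs)
    all_accepted = set()
    for accept in children:                  # accept: callable(refs) -> set of accepted refs
        accepted = accept(remaining)
        remaining -= accepted                # do not transfer these again
        all_accepted |= accepted
    return all_accepted, remaining

children = [lambda rs: {r for r in rs if r.startswith("a")}, lambda rs: set(rs)]
assert chained_transfer(children, ["a1", "b1"]) == ({"a1", "b1"}, set())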