Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%

421 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-10-02 07:59 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Chained datastore.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("ChainedDatastore",) 

33 

34import itertools 

35import logging 

36import time 

37import warnings 

38from collections.abc import Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any 

40 

41from lsst.daf.butler import ( 

42 Constraints, 

43 DatasetRef, 

44 DatasetRefURIs, 

45 DatasetTypeNotSupportedError, 

46 Datastore, 

47 DatastoreConfig, 

48 DatastoreRecordData, 

49 DatastoreValidationError, 

50 FileDataset, 

51) 

52from lsst.resources import ResourcePath 

53from lsst.utils import doImportType 

54 

55if TYPE_CHECKING: 

56 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

57 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

58 from lsst.resources import ResourcePathExpression 

59 

60log = logging.getLogger(__name__) 

61 

62 

63class _IngestPrepData(Datastore.IngestPrepData): 

64 """Helper class for ChainedDatastore ingest implementation. 

65 

66 Parameters 

67 ---------- 

68 children : `list` of `tuple` 

69 Pairs of `Datastore`, `IngestPrepData` for all child datastores. 

70 """ 

71 

72 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

73 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

74 self.children = children 

75 

76 

77class ChainedDatastore(Datastore): 

78 """Chained Datastores to allow read and writes from multiple datastores. 

79 

80 A ChainedDatastore is configured with multiple datastore configurations. 

81 A ``put()`` is always sent to each datastore. A ``get()`` 

82 operation is sent to each datastore in turn and the first datastore 

83 to return a valid dataset is used. 

84 

85 Parameters 

86 ---------- 

87 config : `DatastoreConfig` or `str` 

88 Configuration. This configuration must include a ``datastores`` field 

89 as a sequence of datastore configurations. The order in this sequence 

90 indicates the order to use for read operations. 

91 bridgeManager : `DatastoreRegistryBridgeManager` 

92 Object that manages the interface between `Registry` and datastores. 

93 butlerRoot : `str`, optional 

94 New datastore root to use to override the configuration value. This 

95 root is sent to each child datastore. 

96 

97 Notes 

98 ----- 

99 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer 

100 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` 

101 and `"hardlink"` if and only if all its child datastores do. 

102 """ 

103 

104 defaultConfigFile = "datastores/chainedDatastore.yaml" 

105 """Path to configuration defaults. Accessed within the ``configs`` resource 

106 or relative to a search path. Can be None if no defaults specified. 

107 """ 

108 

109 containerKey = "datastores" 

110 """Key to specify where child datastores are configured.""" 

111 

112 datastores: list[Datastore] 

113 """All the child datastores known to this datastore.""" 

114 

115 datastoreConstraints: Sequence[Constraints | None] 

116 """Constraints to be applied to each of the child datastores.""" 

117 

118 @classmethod 

119 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

120 """Set any filesystem-dependent config options for child Datastores to 

121 be appropriate for a new empty repository with the given root. 

122 

123 Parameters 

124 ---------- 

125 root : `str` 

126 Filesystem path to the root of the data repository. 

127 config : `Config` 

128 A `Config` to update. Only the subset understood by 

129 this component will be updated. Will not expand 

130 defaults. 

131 full : `Config` 

132 A complete config with all defaults expanded that can be 

133 converted to a `DatastoreConfig`. Read-only and will not be 

134 modified by this method. 

135 Repository-specific options that should not be obtained 

136 from defaults when Butler instances are constructed 

137 should be copied from ``full`` to ``config``. 

138 overwrite : `bool`, optional 

139 If `False`, do not modify a value in ``config`` if the value 

140 already exists. Default is always to overwrite with the provided 

141 ``root``. 

142 

143 Notes 

144 ----- 

145 If a keyword is explicitly defined in the supplied ``config`` it 

146 will not be overridden by this method if ``overwrite`` is `False`. 

147 This allows explicit values set in external configs to be retained. 

148 """ 

149 # Extract the part of the config we care about updating 

150 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

151 

152 # And the subset of the full config that we can use for reference. 

153 # Do not bother with defaults because we are told this already has 

154 # them. 

155 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

156 

157 # Loop over each datastore config and pass the subsets to the 

158 # child datastores to process. 

159 

160 containerKey = cls.containerKey 

161 for idx, (child, fullChild) in enumerate( 

162 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True) 

163 ): 

164 childConfig = DatastoreConfig(child, mergeDefaults=False) 

165 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

166 datastoreClass = doImportType(fullChildConfig["cls"]) 

167 if not issubclass(datastoreClass, Datastore): 167 ↛ 168line 167 didn't jump to line 168, because the condition on line 167 was never true

168 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

169 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

170 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

171 

172 # Reattach to parent 

173 datastoreConfig[containerKey, idx] = childConfig 

174 

175 # Reattach modified datastore config to parent 

176 # If this has a datastore key we attach there, otherwise we assume 

177 # this information goes at the top of the config hierarchy. 

178 if DatastoreConfig.component in config: 

179 config[DatastoreConfig.component] = datastoreConfig 

180 else: 

181 config.update(datastoreConfig) 

182 

183 return 

184 

185 def __init__( 

186 self, 

187 config: Config | ResourcePathExpression, 

188 bridgeManager: DatastoreRegistryBridgeManager, 

189 butlerRoot: str | None = None, 

190 ): 

191 super().__init__(config, bridgeManager) 

192 

193 # Scan for child datastores and instantiate them with the same registry 

194 self.datastores = [] 

195 for c in self.config["datastores"]: 

196 c = DatastoreConfig(c) 

197 datastoreType = doImportType(c["cls"]) 

198 if not issubclass(datastoreType, Datastore): 198 ↛ 199line 198 didn't jump to line 199, because the condition on line 198 was never true

199 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

200 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

201 log.debug("Creating child datastore %s", datastore.name) 

202 self.datastores.append(datastore) 

203 

204 # Name ourself based on our children 

205 if self.datastores: 205 ↛ 210line 205 didn't jump to line 210, because the condition on line 205 was never false

206 # We must set the names explicitly 

207 self._names = [d.name for d in self.datastores] 

208 childNames = ",".join(self.names) 

209 else: 

210 childNames = f"(empty@{time.time()})" 

211 self._names = [childNames] 

212 self.name = f"{type(self).__qualname__}[{childNames}]" 

213 

214 # We declare we are ephemeral if all our child datastores declare 

215 # they are ephemeral 

216 isEphemeral = True 

217 for d in self.datastores: 

218 if not d.isEphemeral: 

219 isEphemeral = False 

220 break 

221 self.isEphemeral = isEphemeral 

222 

223 # per-datastore override constraints 

224 if "datastore_constraints" in self.config: 

225 overrides = self.config["datastore_constraints"] 

226 

227 if len(overrides) != len(self.datastores): 227 ↛ 228line 227 didn't jump to line 228, because the condition on line 227 was never true

228 raise DatastoreValidationError( 

229 f"Number of registered datastores ({len(self.datastores)})" 

230 " differs from number of constraints overrides" 

231 f" {len(overrides)}" 

232 ) 

233 

234 self.datastoreConstraints = [ 

235 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

236 ] 

237 

238 else: 

239 self.datastoreConstraints = (None,) * len(self.datastores) 

240 

241 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

242 

243 @property 

244 def names(self) -> tuple[str, ...]: 

245 return tuple(self._names) 

246 

247 @property 

248 def roots(self) -> dict[str, ResourcePath | None]: 

249 # Docstring inherited. 

250 roots = {} 

251 for datastore in self.datastores: 

252 roots.update(datastore.roots) 

253 return roots 

254 

255 def __str__(self) -> str: 

256 chainName = ", ".join(str(ds) for ds in self.datastores) 

257 return chainName 

258 

259 def knows(self, ref: DatasetRef) -> bool: 

260 """Check if the dataset is known to any of the datastores. 

261 

262 Does not check for existence of any artifact. 

263 

264 Parameters 

265 ---------- 

266 ref : `DatasetRef` 

267 Reference to the required dataset. 

268 

269 Returns 

270 ------- 

271 exists : `bool` 

272 `True` if the dataset is known to the datastore. 

273 """ 

274 for datastore in self.datastores: 

275 if datastore.knows(ref): 

276 log.debug("%s known to datastore %s", ref, datastore.name) 

277 return True 

278 return False 

279 

280 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

281 # Docstring inherited from the base class. 

282 refs_known: dict[DatasetRef, bool] = {} 

283 for datastore in self.datastores: 

284 refs_known.update(datastore.knows_these(refs)) 

285 

286 # No need to check in next datastore for refs that are known. 

287 # We only update entries that were initially False. 

288 refs = [ref for ref, known in refs_known.items() if not known] 

289 

290 return refs_known 

291 

292 def mexists( 

293 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

294 ) -> dict[DatasetRef, bool]: 

295 """Check the existence of multiple datasets at once. 

296 

297 Parameters 

298 ---------- 

299 refs : iterable of `DatasetRef` 

300 The datasets to be checked. 

301 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

302 Optional mapping of datastore artifact to existence. Updated by 

303 this method with details of all artifacts tested. Can be `None` 

304 if the caller is not interested. 

305 

306 Returns 

307 ------- 

308 existence : `dict` of [`DatasetRef`, `bool`] 

309 Mapping from dataset to boolean indicating existence in any 

310 of the child datastores. 

311 """ 

312 dataset_existence: dict[DatasetRef, bool] = {} 

313 for datastore in self.datastores: 

314 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

315 

316 # For next datastore no point asking about ones we know 

317 # exist already. No special exemption for ephemeral datastores. 

318 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

319 

320 return dataset_existence 

321 

322 def exists(self, ref: DatasetRef) -> bool: 

323 """Check if the dataset exists in one of the datastores. 

324 

325 Parameters 

326 ---------- 

327 ref : `DatasetRef` 

328 Reference to the required dataset. 

329 

330 Returns 

331 ------- 

332 exists : `bool` 

333 `True` if the entity exists in one of the child datastores. 

334 """ 

335 for datastore in self.datastores: 

336 if datastore.exists(ref): 

337 log.debug("Found %s in datastore %s", ref, datastore.name) 

338 return True 

339 return False 

340 

341 def get( 

342 self, 

343 ref: DatasetRef, 

344 parameters: Mapping[str, Any] | None = None, 

345 storageClass: StorageClass | str | None = None, 

346 ) -> Any: 

347 """Load an InMemoryDataset from the store. 

348 

349 The dataset is returned from the first datastore that has 

350 the dataset. 

351 

352 Parameters 

353 ---------- 

354 ref : `DatasetRef` 

355 Reference to the required Dataset. 

356 parameters : `dict` 

357 `StorageClass`-specific parameters that specify, for example, 

358 a slice of the dataset to be loaded. 

359 storageClass : `StorageClass` or `str`, optional 

360 The storage class to be used to override the Python type 

361 returned by this method. By default the returned type matches 

362 the dataset type definition for this dataset. Specifying a 

363 read `StorageClass` can force a different type to be returned. 

364 This type must be compatible with the original type. 

365 

366 Returns 

367 ------- 

368 inMemoryDataset : `object` 

369 Requested dataset or slice thereof as an InMemoryDataset. 

370 

371 Raises 

372 ------ 

373 FileNotFoundError 

374 Requested dataset can not be retrieved. 

375 TypeError 

376 Return value from formatter has unexpected type. 

377 ValueError 

378 Formatter failed to process the dataset. 

379 """ 

380 for datastore in self.datastores: 

381 try: 

382 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

383 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

384 return inMemoryObject 

385 except FileNotFoundError: 

386 pass 

387 

388 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

389 

390 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

391 """Write a InMemoryDataset with a given `DatasetRef` to each 

392 datastore. 

393 

394 The put() to child datastores can fail with 

395 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

396 deemed to have succeeded so long as at least one child datastore 

397 accepted the inMemoryDataset. 

398 

399 Parameters 

400 ---------- 

401 inMemoryDataset : `object` 

402 The dataset to store. 

403 ref : `DatasetRef` 

404 Reference to the associated Dataset. 

405 

406 Raises 

407 ------ 

408 TypeError 

409 Supplied object and storage class are inconsistent. 

410 DatasetTypeNotSupportedError 

411 All datastores reported `DatasetTypeNotSupportedError`. 

412 """ 

413 log.debug("Put %s", ref) 

414 

415 # Confirm that we can accept this dataset 

416 if not self.constraints.isAcceptable(ref): 

417 # Raise rather than use boolean return value. 

418 raise DatasetTypeNotSupportedError( 

419 f"Dataset {ref} has been rejected by this datastore via configuration." 

420 ) 

421 

422 isPermanent = False 

423 nsuccess = 0 

424 npermanent = 0 

425 nephemeral = 0 

426 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

427 if ( 

428 constraints is not None and not constraints.isAcceptable(ref) 

429 ) or not datastore.constraints.isAcceptable(ref): 

430 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

431 continue 

432 

433 if datastore.isEphemeral: 

434 nephemeral += 1 

435 else: 

436 npermanent += 1 

437 try: 

438 datastore.put(inMemoryDataset, ref) 

439 nsuccess += 1 

440 if not datastore.isEphemeral: 

441 isPermanent = True 

442 except DatasetTypeNotSupportedError: 

443 pass 

444 

445 if nsuccess == 0: 

446 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

447 

448 if not isPermanent and npermanent > 0: 448 ↛ 449line 448 didn't jump to line 449, because the condition on line 448 was never true

449 warnings.warn(f"Put of {ref} only succeeded in ephemeral databases", stacklevel=2) 

450 

451 if self._transaction is not None: 

452 self._transaction.registerUndo("put", self.remove, ref) 

453 

454 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

455 # Docstring inherited from base class. 

456 if transfer != "auto": 

457 return transfer 

458 # Ask each datastore what they think auto means 

459 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

460 

461 # Remove any untranslated "auto" values 

462 transfers.discard(transfer) 

463 

464 if len(transfers) == 1: 464 ↛ 465line 464 didn't jump to line 465, because the condition on line 464 was never true

465 return transfers.pop() 

466 if not transfers: 466 ↛ 470line 466 didn't jump to line 470, because the condition on line 466 was never false

467 # Everything reported "auto" 

468 return transfer 

469 

470 raise RuntimeError( 

471 "Chained datastore does not yet support different transfer modes" 

472 f" from 'auto' in each child datastore (wanted {transfers})" 

473 ) 

474 

475 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

476 # Docstring inherited from Datastore._prepIngest. 

477 if transfer is None: 

478 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

479 

480 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

481 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

482 if not acceptable: 

483 log.debug( 

484 "Datastore %s skipping ingest via configuration for refs %s", 

485 name, 

486 ", ".join(str(ref) for ref in dataset.refs), 

487 ) 

488 return False 

489 else: 

490 return True 

491 

492 # Filter down to just datasets the chained datastore's own 

493 # configuration accepts. 

494 okForParent: list[FileDataset] = [ 

495 dataset 

496 for dataset in datasets 

497 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

498 ] 

499 

500 # Iterate over nested datastores and call _prepIngest on each. 

501 # Save the results to a list: 

502 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

503 # ...and remember whether all of the failures are due to 

504 # NotImplementedError being raised. 

505 allFailuresAreNotImplementedError = True 

506 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

507 okForChild: list[FileDataset] 

508 if constraints is not None: 

509 okForChild = [ 

510 dataset 

511 for dataset in okForParent 

512 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

513 ] 

514 else: 

515 okForChild = okForParent 

516 try: 

517 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

518 except NotImplementedError: 

519 log.debug( 

520 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

521 datastore.name, 

522 transfer, 

523 ) 

524 continue 

525 allFailuresAreNotImplementedError = False 

526 if okForChild: 

527 # Do not store for later if a datastore has rejected 

528 # everything. 

529 # Include the source paths if this is a "move". It's clearer 

530 # to find the paths now rather than try to infer how 

531 # each datastore has stored them in the internal prep class. 

532 paths = ( 

533 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set() 

534 ) 

535 children.append((datastore, prepDataForChild, paths)) 

536 if allFailuresAreNotImplementedError: 

537 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

538 return _IngestPrepData(children=children) 

539 

540 def _finishIngest( 

541 self, 

542 prepData: _IngestPrepData, 

543 *, 

544 transfer: str | None = None, 

545 record_validation_info: bool = True, 

546 ) -> None: 

547 # Docstring inherited from Datastore._finishIngest. 

548 # For "move" we must use "copy" and then delete the input 

549 # data at the end. This has no rollback option if the ingest 

550 # subsequently fails. If there is only one active datastore 

551 # accepting any files we can leave it as "move" 

552 actual_transfer: str | None 

553 if transfer == "move" and len(prepData.children) > 1: 

554 actual_transfer = "copy" 

555 else: 

556 actual_transfer = transfer 

557 to_be_deleted: set[ResourcePath] = set() 

558 for datastore, prepDataForChild, paths in prepData.children: 

559 datastore._finishIngest( 

560 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

561 ) 

562 to_be_deleted.update(paths) 

563 if actual_transfer != transfer: 

564 # These datasets were copied but now need to be deleted. 

565 # This can not be rolled back. 

566 for uri in to_be_deleted: 

567 uri.remove() 

568 

569 def getManyURIs( 

570 self, 

571 refs: Iterable[DatasetRef], 

572 predict: bool = False, 

573 allow_missing: bool = False, 

574 ) -> dict[DatasetRef, DatasetRefURIs]: 

575 # Docstring inherited 

576 

577 uris: dict[DatasetRef, DatasetRefURIs] = {} 

578 missing_refs = set(refs) 

579 

580 # If predict is True we don't want to predict a dataset in the first 

581 # datastore if it actually exists in a later datastore, so in that 

582 # case check all datastores with predict=False first, and then try 

583 # again with predict=True. 

584 for p in (False, True) if predict else (False,): 

585 if not missing_refs: 

586 break 

587 for datastore in self.datastores: 

588 try: 

589 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

590 except NotImplementedError: 

591 # some datastores may not implement generating URIs 

592 continue 

593 missing_refs -= got_uris.keys() 

594 uris.update(got_uris) 

595 if not missing_refs: 

596 break 

597 

598 if missing_refs and not allow_missing: 

599 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

600 

601 return uris 

602 

603 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

604 """Return URIs associated with dataset. 

605 

606 Parameters 

607 ---------- 

608 ref : `DatasetRef` 

609 Reference to the required dataset. 

610 predict : `bool`, optional 

611 If the datastore does not know about the dataset, should it 

612 return a predicted URI or not? 

613 

614 Returns 

615 ------- 

616 uris : `DatasetRefURIs` 

617 The URI to the primary artifact associated with this dataset (if 

618 the dataset was disassembled within the datastore this may be 

619 `None`), and the URIs to any components associated with the dataset 

620 artifact. (can be empty if there are no components). 

621 

622 Notes 

623 ----- 

624 The returned URI is from the first datastore in the list that has 

625 the dataset with preference given to the first dataset coming from 

626 a permanent datastore. If no datastores have the dataset and prediction 

627 is allowed, the predicted URI for the first datastore in the list will 

628 be returned. 

629 """ 

630 log.debug("Requesting URIs for %s", ref) 

631 predictedUri: DatasetRefURIs | None = None 

632 predictedEphemeralUri: DatasetRefURIs | None = None 

633 firstEphemeralUri: DatasetRefURIs | None = None 

634 for datastore in self.datastores: 

635 if datastore.exists(ref): 

636 if not datastore.isEphemeral: 

637 uri = datastore.getURIs(ref) 

638 log.debug("Retrieved non-ephemeral URI: %s", uri) 

639 return uri 

640 elif not firstEphemeralUri: 

641 firstEphemeralUri = datastore.getURIs(ref) 

642 elif predict: 

643 if not predictedUri and not datastore.isEphemeral: 

644 predictedUri = datastore.getURIs(ref, predict) 

645 elif not predictedEphemeralUri and datastore.isEphemeral: 

646 predictedEphemeralUri = datastore.getURIs(ref, predict) 

647 

648 if firstEphemeralUri: 

649 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

650 return firstEphemeralUri 

651 

652 if predictedUri: 

653 log.debug("Retrieved predicted URI: %s", predictedUri) 

654 return predictedUri 

655 

656 if predictedEphemeralUri: 

657 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

658 return predictedEphemeralUri 

659 

660 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 

661 

662 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

663 """URI to the Dataset. 

664 

665 The returned URI is from the first datastore in the list that has 

666 the dataset with preference given to the first dataset coming from 

667 a permanent datastore. If no datastores have the dataset and prediction 

668 is allowed, the predicted URI for the first datastore in the list will 

669 be returned. 

670 

671 Parameters 

672 ---------- 

673 ref : `DatasetRef` 

674 Reference to the required Dataset. 

675 predict : `bool` 

676 If `True`, allow URIs to be returned of datasets that have not 

677 been written. 

678 

679 Returns 

680 ------- 

681 uri : `lsst.resources.ResourcePath` 

682 URI pointing to the dataset within the datastore. If the 

683 dataset does not exist in the datastore, and if ``predict`` is 

684 `True`, the URI will be a prediction and will include a URI 

685 fragment "#predicted". 

686 

687 Notes 

688 ----- 

689 If the datastore does not have entities that relate well 

690 to the concept of a URI the returned URI string will be 

691 descriptive. The returned URI is not guaranteed to be obtainable. 

692 

693 Raises 

694 ------ 

695 FileNotFoundError 

696 A URI has been requested for a dataset that does not exist and 

697 guessing is not allowed. 

698 RuntimeError 

699 Raised if a request is made for a single URI but multiple URIs 

700 are associated with this dataset. 

701 """ 

702 log.debug("Requesting URI for %s", ref) 

703 primary, components = self.getURIs(ref, predict) 

704 if primary is None or components: 704 ↛ 705line 704 didn't jump to line 705, because the condition on line 704 was never true

705 raise RuntimeError( 

706 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

707 ) 

708 return primary 

709 

710 def retrieveArtifacts( 

711 self, 

712 refs: Iterable[DatasetRef], 

713 destination: ResourcePath, 

714 transfer: str = "auto", 

715 preserve_path: bool = True, 

716 overwrite: bool = False, 

717 ) -> list[ResourcePath]: 

718 """Retrieve the file artifacts associated with the supplied refs. 

719 

720 Parameters 

721 ---------- 

722 refs : iterable of `DatasetRef` 

723 The datasets for which file artifacts are to be retrieved. 

724 A single ref can result in multiple files. The refs must 

725 be resolved. 

726 destination : `lsst.resources.ResourcePath` 

727 Location to write the file artifacts. 

728 transfer : `str`, optional 

729 Method to use to transfer the artifacts. Must be one of the options 

730 supported by `lsst.resources.ResourcePath.transfer_from()`. 

731 "move" is not allowed. 

732 preserve_path : `bool`, optional 

733 If `True` the full path of the file artifact within the datastore 

734 is preserved. If `False` the final file component of the path 

735 is used. 

736 overwrite : `bool`, optional 

737 If `True` allow transfers to overwrite existing files at the 

738 destination. 

739 

740 Returns 

741 ------- 

742 targets : `list` of `lsst.resources.ResourcePath` 

743 URIs of file artifacts in destination location. Order is not 

744 preserved. 

745 """ 

746 if not destination.isdir(): 746 ↛ 747line 746 didn't jump to line 747, because the condition on line 746 was never true

747 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

748 

749 # Using getURIs is not feasible since it becomes difficult to 

750 # determine the path within the datastore later on. For now 

751 # follow getURIs implementation approach. 

752 

753 pending = set(refs) 

754 

755 # There is a question as to whether an exception should be raised 

756 # early if some of the refs are missing, or whether files should be 

757 # transferred until a problem is hit. Prefer to complain up front. 

758 # Use the datastore integer as primary key. 

759 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

760 

761 for number, datastore in enumerate(self.datastores): 

762 if datastore.isEphemeral: 

763 # In the future we will want to distinguish in-memory from 

764 # caching datastore since using an on-disk local 

765 # cache is exactly what we should be doing. 

766 continue 

767 try: 

768 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

769 except NotImplementedError: 

770 # Some datastores may not support retrieving artifacts 

771 continue 

772 

773 if datastore_refs: 

774 grouped_by_datastore[number] = datastore_refs 

775 

776 # Remove these from the pending list so that we do not bother 

777 # looking for them any more. 

778 pending = pending - datastore_refs 

779 

780 if pending: 780 ↛ 781line 780 didn't jump to line 781, because the condition on line 780 was never true

781 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

782 

783 # Now do the transfer. 

784 targets: list[ResourcePath] = [] 

785 for number, datastore_refs in grouped_by_datastore.items(): 

786 targets.extend( 

787 self.datastores[number].retrieveArtifacts( 

788 datastore_refs, 

789 destination, 

790 transfer=transfer, 

791 preserve_path=preserve_path, 

792 overwrite=overwrite, 

793 ) 

794 ) 

795 

796 return targets 

797 

798 def remove(self, ref: DatasetRef) -> None: 

799 """Indicate to the datastore that a dataset can be removed. 

800 

801 The dataset will be removed from each datastore. The dataset is 

802 not required to exist in every child datastore. 

803 

804 Parameters 

805 ---------- 

806 ref : `DatasetRef` 

807 Reference to the required dataset. 

808 

809 Raises 

810 ------ 

811 FileNotFoundError 

812 Attempt to remove a dataset that does not exist. Raised if none 

813 of the child datastores removed the dataset. 

814 """ 

815 log.debug("Removing %s", ref) 

816 self.trash(ref, ignore_errors=False) 

817 self.emptyTrash(ignore_errors=False) 

818 

819 def forget(self, refs: Iterable[DatasetRef]) -> None: 

820 for datastore in tuple(self.datastores): 

821 datastore.forget(refs) 

822 

823 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

824 if isinstance(ref, DatasetRef): 

825 ref_label = str(ref) 

826 else: 

827 ref_label = "bulk datasets" 

828 

829 log.debug("Trashing %s", ref_label) 

830 

831 counter = 0 

832 for datastore in self.datastores: 

833 try: 

834 datastore.trash(ref, ignore_errors=ignore_errors) 

835 counter += 1 

836 except FileNotFoundError: 

837 pass 

838 

839 if counter == 0: 

840 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

841 if ignore_errors: 841 ↛ 842line 841 didn't jump to line 842, because the condition on line 841 was never true

842 log.warning(err_msg) 

843 else: 

844 raise FileNotFoundError(err_msg) 

845 

846 def emptyTrash(self, ignore_errors: bool = True) -> None: 

847 for datastore in self.datastores: 

848 datastore.emptyTrash(ignore_errors=ignore_errors) 

849 

850 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

851 """Retrieve a dataset from an input `Datastore`, 

852 and store the result in this `Datastore`. 

853 

854 Parameters 

855 ---------- 

856 inputDatastore : `Datastore` 

857 The external `Datastore` from which to retreive the Dataset. 

858 ref : `DatasetRef` 

859 Reference to the required dataset in the input data store. 

860 

861 Returns 

862 ------- 

863 results : `list` 

864 List containing the return value from the ``put()`` to each 

865 child datastore. 

866 """ 

867 assert inputDatastore is not self # unless we want it for renames? 

868 inMemoryDataset = inputDatastore.get(ref) 

869 self.put(inMemoryDataset, ref) 

870 

871 def validateConfiguration( 

872 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

873 ) -> None: 

874 """Validate some of the configuration for this datastore. 

875 

876 Parameters 

877 ---------- 

878 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

879 Entities to test against this configuration. Can be differing 

880 types. 

881 logFailures : `bool`, optional 

882 If `True`, output a log message for every validation error 

883 detected. 

884 

885 Raises 

886 ------ 

887 DatastoreValidationError 

888 Raised if there is a validation problem with a configuration. 

889 All the problems are reported in a single exception. 

890 

891 Notes 

892 ----- 

893 This method checks each datastore in turn. 

894 """ 

895 # Need to catch each of the datastore outputs and ensure that 

896 # all are tested. 

897 failures = [] 

898 for datastore in self.datastores: 

899 try: 

900 datastore.validateConfiguration(entities, logFailures=logFailures) 

901 except DatastoreValidationError as e: 

902 if logFailures: 902 ↛ 904line 902 didn't jump to line 904, because the condition on line 902 was never false

903 log.critical("Datastore %s failed validation", datastore.name) 

904 failures.append(f"Datastore {self.name}: {e}") 

905 

906 if failures: 

907 msg = ";\n".join(failures) 

908 raise DatastoreValidationError(msg) 

909 

910 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

911 # Docstring is inherited from base class 

912 failures = [] 

913 for datastore in self.datastores: 

914 try: 

915 datastore.validateKey(lookupKey, entity) 

916 except DatastoreValidationError as e: 

917 failures.append(f"Datastore {self.name}: {e}") 

918 

919 if failures: 

920 msg = ";\n".join(failures) 

921 raise DatastoreValidationError(msg) 

922 

923 def getLookupKeys(self) -> set[LookupKey]: 

924 # Docstring is inherited from base class 

925 keys = set() 

926 for datastore in self.datastores: 

927 keys.update(datastore.getLookupKeys()) 

928 

929 keys.update(self.constraints.getLookupKeys()) 

930 for p in self.datastoreConstraints: 

931 if p is not None: 931 ↛ 930line 931 didn't jump to line 930, because the condition on line 931 was never false

932 keys.update(p.getLookupKeys()) 

933 

934 return keys 

935 

936 def needs_expanded_data_ids( 

937 self, 

938 transfer: str | None, 

939 entity: DatasetRef | DatasetType | StorageClass | None = None, 

940 ) -> bool: 

941 # Docstring inherited. 

942 # We can't safely use `self.datastoreConstraints` with `entity` to 

943 # check whether a child datastore would even want to ingest this 

944 # dataset, because we don't want to filter out datastores that might 

945 # need an expanded data ID based in incomplete information (e.g. we 

946 # pass a StorageClass, but the constraint dispatches on DatasetType). 

947 # So we pessimistically check if any datastore would need an expanded 

948 # data ID for this transfer mode. 

949 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

950 

951 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

952 # Docstring inherited from the base class. 

953 

954 for datastore in self.datastores: 

955 datastore.import_records(data) 

956 

957 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

958 # Docstring inherited from the base class. 

959 

960 all_records: dict[str, DatastoreRecordData] = {} 

961 

962 # Merge all sub-datastore records into one structure 

963 for datastore in self.datastores: 

964 sub_records = datastore.export_records(refs) 

965 for name, record_data in sub_records.items(): 

966 # All datastore names must be unique in a chain. 

967 if name in all_records: 967 ↛ 968line 967 didn't jump to line 968, because the condition on line 967 was never true

968 raise ValueError("Non-unique datastore name found in datastore {datastore}") 

969 all_records[name] = record_data 

970 

971 return all_records 

972 

973 def export( 

974 self, 

975 refs: Iterable[DatasetRef], 

976 *, 

977 directory: ResourcePathExpression | None = None, 

978 transfer: str | None = "auto", 

979 ) -> Iterable[FileDataset]: 

980 # Docstring inherited from Datastore.export. 

981 if transfer == "auto" and directory is None: 

982 transfer = None 

983 

984 if transfer is not None and directory is None: 

985 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

986 

987 if transfer == "move": 

988 raise TypeError("Can not export by moving files out of datastore.") 

989 

990 # Exporting from a chain has the potential for a dataset to be 

991 # in one or more of the datastores in the chain. We only need one 

992 # of them since we assume the datasets are the same in all (but 

993 # the file format could be different of course since that is a 

994 # per-datastore configuration). 

995 # We also do not know whether any of the datastores in the chain 

996 # support file export. 

997 

998 # Ensure we have an ordered sequence that is not an iterator or set. 

999 if not isinstance(refs, Sequence): 

1000 refs = list(refs) 

1001 

1002 # If any of the datasets are missing entirely we need to raise early 

1003 # before we try to run the export. This can be a little messy but is 

1004 # better than exporting files from the first datastore and then finding 

1005 # that one is missing but is not in the second datastore either. 

1006 known = [datastore.knows_these(refs) for datastore in self.datastores] 

1007 refs_known: set[DatasetRef] = set() 

1008 for known_to_this in known: 

1009 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

1010 missing_count = len(refs) - len(refs_known) 

1011 if missing_count: 

1012 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1013 

1014 # To allow us to slot each result into the right place after 

1015 # asking each datastore, create a dict with the index. 

1016 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1017 

1018 # Presize the final export list. 

1019 exported: list[FileDataset | None] = [None] * len(refs) 

1020 

1021 # The order of the returned dataset has to match the order of the 

1022 # given refs, even if they are all from different datastores. 

1023 for i, datastore in enumerate(self.datastores): 

1024 known_to_this = known[i] 

1025 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1026 

1027 try: 

1028 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1029 except NotImplementedError: 

1030 # Try the next datastore. 

1031 continue 

1032 

1033 for ref, export in zip(filtered, this_export, strict=True): 

1034 # Get the position and also delete it from the list. 

1035 exported[ref_positions.pop(ref)] = export 

1036 

1037 # Every dataset should be accounted for because of the earlier checks 

1038 # but make sure that we did fill all the slots to appease mypy. 

1039 for i, dataset in enumerate(exported): 

1040 if dataset is None: 1040 ↛ 1041line 1040 didn't jump to line 1041, because the condition on line 1040 was never true

1041 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1042 yield dataset 

1043 

1044 def transfer_from( 

1045 self, 

1046 source_datastore: Datastore, 

1047 refs: Iterable[DatasetRef], 

1048 transfer: str = "auto", 

1049 artifact_existence: dict[ResourcePath, bool] | None = None, 

1050 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1051 # Docstring inherited 

1052 # mypy does not understand "type(self) is not type(source)" 

1053 if isinstance(source_datastore, ChainedDatastore): 

1054 # Both the source and destination are chained datastores. 

1055 source_datastores = tuple(source_datastore.datastores) 

1056 else: 

1057 # The source datastore is different, forward everything to the 

1058 # child datastores. 

1059 source_datastores = (source_datastore,) 

1060 

1061 # Need to know the set of all possible refs that could be transferred. 

1062 remaining_refs = set(refs) 

1063 

1064 missing_from_source: set[DatasetRef] | None = None 

1065 all_accepted = set() 

1066 nsuccess = 0 

1067 for source_child in source_datastores: 

1068 # If we are reading from a chained datastore, it's possible that 

1069 # only a subset of the datastores know about the dataset. We can't 

1070 # ask the receiving datastore to copy it when it doesn't exist 

1071 # so we have to filter again based on what the source datastore 

1072 # understands. 

1073 known_to_source = source_child.knows_these(list(refs)) 

1074 

1075 # Need to know that there is a possibility that some of these 

1076 # datasets exist but are unknown to the source datastore if 

1077 # trust is enabled. 

1078 if getattr(source_child, "trustGetRequest", False): 

1079 unknown = [ref for ref, known in known_to_source.items() if not known] 

1080 existence = source_child.mexists(unknown, artifact_existence) 

1081 for ref, exists in existence.items(): 

1082 known_to_source[ref] = exists 

1083 

1084 missing = {ref for ref, known in known_to_source.items() if not known} 

1085 if missing: 

1086 if missing_from_source is None: 

1087 missing_from_source = missing 

1088 else: 

1089 missing_from_source &= missing 

1090 

1091 # Try to transfer from each source datastore to each child 

1092 # datastore. Have to make sure we don't transfer something 

1093 # we've already transferred to this destination on later passes. 

1094 

1095 # Filter the initial list based on the datasets we have 

1096 # not yet transferred. 

1097 these_refs = [] 

1098 for ref in refs: 

1099 if ref in remaining_refs and known_to_source[ref]: 

1100 these_refs.append(ref) 

1101 

1102 if not these_refs: 

1103 # Already transferred all datasets known to this datastore. 

1104 continue 

1105 

1106 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

1107 if constraints is not None: 1107 ↛ 1115line 1107 didn't jump to line 1115, because the condition on line 1107 was never false

1108 filtered_refs = [] 

1109 for ref in these_refs: 

1110 if constraints.isAcceptable(ref): 

1111 filtered_refs.append(ref) 

1112 else: 

1113 log.debug("Rejecting ref by constraints: %s", ref) 

1114 else: 

1115 filtered_refs = list(these_refs) 

1116 try: 

1117 accepted, _ = datastore.transfer_from( 

1118 source_child, filtered_refs, transfer, artifact_existence 

1119 ) 

1120 except (TypeError, NotImplementedError): 

1121 # The datastores were incompatible. 

1122 continue 

1123 else: 

1124 nsuccess += 1 

1125 

1126 # Remove the accepted datasets from those remaining. 

1127 remaining_refs = remaining_refs - accepted 

1128 

1129 # Keep track of everything we have accepted. 

1130 all_accepted.update(accepted) 

1131 

1132 if missing_from_source: 

1133 for ref in missing_from_source: 

1134 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1135 

1136 if nsuccess == 0: 1136 ↛ 1137line 1136 didn't jump to line 1137, because the condition on line 1136 was never true

1137 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1138 

1139 return all_accepted, remaining_refs