Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%

424 statements  

coverage.py v7.3.2, created at 2023-10-12 09:43 +0000

1 # This file is part of daf_butler.

2 #

3 # Developed for the LSST Data Management System.

4 # This product includes software developed by the LSST Project

5 # (http://www.lsst.org).

6 # See the COPYRIGHT file at the top-level directory of this distribution

7 # for details of code ownership.

8 #

9 # This software is dual licensed under the GNU General Public License and also

10 # under a 3-clause BSD license. Recipients may choose which of these licenses

11 # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,

12 # respectively. If you choose the GPL option then the following text applies

13 # (but note that there is still no warranty even if you opt for BSD instead):

14 #

15 # This program is free software: you can redistribute it and/or modify

16 # it under the terms of the GNU General Public License as published by

17 # the Free Software Foundation, either version 3 of the License, or

18 # (at your option) any later version.

19 #

20 # This program is distributed in the hope that it will be useful,

21 # but WITHOUT ANY WARRANTY; without even the implied warranty of

22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

23 # GNU General Public License for more details.

24 #

25 # You should have received a copy of the GNU General Public License

26 # along with this program. If not, see <http://www.gnu.org/licenses/>.

27 

28"""Chained datastore.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("ChainedDatastore",) 

33 

34import itertools 

35import logging 

36import time 

37import warnings 

38from collections.abc import Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any 

40 

41from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, Datastore, FileDataset 

42from lsst.daf.butler.datastore import DatasetRefURIs, DatastoreConfig, DatastoreValidationError 

43from lsst.daf.butler.datastore.constraints import Constraints 

44from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

45from lsst.resources import ResourcePath 

46from lsst.utils import doImportType 

47 

48if TYPE_CHECKING: 

49 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

51 from lsst.resources import ResourcePathExpression 

52 

53 log = logging.getLogger(__name__)

54 

55 

56 class _IngestPrepData(Datastore.IngestPrepData):

57 """Helper class for ChainedDatastore ingest implementation. 

58 

59 Parameters 

60 ---------- 

61 children : `list` of `tuple` 

62 Triples of `Datastore`, `IngestPrepData`, and the set of source `ResourcePath`s for each child datastore.

63 """ 

64 

65 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

66 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

67 self.children = children 

68 

69 

70 class ChainedDatastore(Datastore):

71 """Chained Datastores to allow read and writes from multiple datastores. 

72 

73 A ChainedDatastore is configured with multiple datastore configurations. 

74 A ``put()`` is always sent to each datastore. A ``get()`` 

75 operation is sent to each datastore in turn and the first datastore 

76 to return a valid dataset is used. 

77 

78 Parameters 

79 ---------- 

80 config : `DatastoreConfig` or `str` 

81 Configuration. This configuration must include a ``datastores`` field 

82 as a sequence of datastore configurations. The order in this sequence 

83 indicates the order to use for read operations. 

84 bridgeManager : `DatastoreRegistryBridgeManager` 

85 Object that manages the interface between `Registry` and datastores. 

86 butlerRoot : `str`, optional 

87 New datastore root to use to override the configuration value. This 

88 root is sent to each child datastore. 

89 

90 Notes 

91 ----- 

92 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer 

93 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` 

94 and `"hardlink"` if and only if all its child datastores do. 

95 """ 

96 
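# --- Editorial sketch (not part of this module) -----------------------------
# A minimal toy model of the semantics described in the class docstring:
# ``put()`` is attempted on every child and succeeds if at least one child
# accepts the dataset, while ``get()`` walks the children in read order and
# returns the first hit. ``ToyStore`` and the helper functions are
# hypothetical stand-ins, not the real lsst.daf.butler Datastore API.
class ToyStore:
    def __init__(self, name, accepts=True):
        self.name, self.accepts, self._data = name, accepts, {}

    def put(self, obj, ref):
        if not self.accepts:
            raise KeyError(f"{self.name} rejects {ref}")
        self._data[ref] = obj

    def get(self, ref):
        return self._data[ref]  # KeyError if this child never stored it

def chained_put(children, obj, ref):
    accepted = 0
    for child in children:
        try:
            child.put(obj, ref)
            accepted += 1
        except KeyError:
            pass  # a single child rejecting the dataset is not fatal
    if accepted == 0:
        raise RuntimeError(f"no child datastore accepted {ref}")

def chained_get(children, ref):
    for child in children:  # read order is the configuration order
        try:
            return child.get(ref)
        except KeyError:
            continue
    raise FileNotFoundError(f"{ref} not found in any child datastore")

stores = [ToyStore("picky", accepts=False), ToyStore("bulk")]
chained_put(stores, b"payload", "ref-1")
assert chained_get(stores, "ref-1") == b"payload"
# -----------------------------------------------------------------------------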

97 defaultConfigFile = "datastores/chainedDatastore.yaml" 

98 """Path to configuration defaults. Accessed within the ``configs`` resource 

99 or relative to a search path. Can be None if no defaults specified. 

100 """ 

101 

102 containerKey = "datastores" 

103 """Key to specify where child datastores are configured.""" 

104 

105 datastores: list[Datastore] 

106 """All the child datastores known to this datastore.""" 

107 

108 datastoreConstraints: Sequence[Constraints | None] 

109 """Constraints to be applied to each of the child datastores.""" 

110 

111 @classmethod 

112 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

113 """Set any filesystem-dependent config options for child Datastores to 

114 be appropriate for a new empty repository with the given root. 

115 

116 Parameters 

117 ---------- 

118 root : `str` 

119 Filesystem path to the root of the data repository. 

120 config : `Config` 

121 A `Config` to update. Only the subset understood by 

122 this component will be updated. Will not expand 

123 defaults. 

124 full : `Config` 

125 A complete config with all defaults expanded that can be 

126 converted to a `DatastoreConfig`. Read-only and will not be 

127 modified by this method. 

128 Repository-specific options that should not be obtained 

129 from defaults when Butler instances are constructed 

130 should be copied from ``full`` to ``config``. 

131 overwrite : `bool`, optional 

132 If `False`, do not modify a value in ``config`` if the value 

133 already exists. Default is always to overwrite with the provided 

134 ``root``. 

135 

136 Notes 

137 ----- 

138 If a keyword is explicitly defined in the supplied ``config`` it 

139 will not be overridden by this method if ``overwrite`` is `False`. 

140 This allows explicit values set in external configs to be retained. 

141 """ 

142 # Extract the part of the config we care about updating 

143 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

144 

145 # And the subset of the full config that we can use for reference. 

146 # Do not bother with defaults because we are told this already has 

147 # them. 

148 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

149 

150 # Loop over each datastore config and pass the subsets to the 

151 # child datastores to process. 

152 

153 containerKey = cls.containerKey 

154 for idx, (child, fullChild) in enumerate( 

155 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True) 

156 ): 

157 childConfig = DatastoreConfig(child, mergeDefaults=False) 

158 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

159 datastoreClass = doImportType(fullChildConfig["cls"]) 

160 if not issubclass(datastoreClass, Datastore): 160 ↛ 161: line 160 didn't jump to line 161, because the condition on line 160 was never true

161 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

162 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

163 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

164 

165 # Reattach to parent 

166 datastoreConfig[containerKey, idx] = childConfig 

167 

168 # Reattach modified datastore config to parent 

169 # If this has a datastore key we attach there, otherwise we assume 

170 # this information goes at the top of the config hierarchy. 

171 if DatastoreConfig.component in config: 

172 config[DatastoreConfig.component] = datastoreConfig 

173 else: 

174 config.update(datastoreConfig) 

175 

176 return 

177 
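# --- Editorial sketch (not part of this module) -----------------------------
# Illustration of the per-child root naming used by ``setConfigRoot`` above:
# each child datastore is rooted in a subdirectory named after its class and
# its position in the chain. The class names below are only examples.
def child_roots(root, child_class_names):
    return [f"{root}/{name}_{idx}" for idx, name in enumerate(child_class_names)]

assert child_roots("/repo", ["FileDatastore", "InMemoryDatastore"]) == [
    "/repo/FileDatastore_0",
    "/repo/InMemoryDatastore_1",
]
# -----------------------------------------------------------------------------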

178 def __init__( 

179 self, 

180 config: Config | ResourcePathExpression, 

181 bridgeManager: DatastoreRegistryBridgeManager, 

182 butlerRoot: str | None = None, 

183 ): 

184 super().__init__(config, bridgeManager) 

185 

186 # Scan for child datastores and instantiate them with the same registry 

187 self.datastores = [] 

188 for c in self.config["datastores"]: 

189 c = DatastoreConfig(c) 

190 datastoreType = doImportType(c["cls"]) 

191 if not issubclass(datastoreType, Datastore): 191 ↛ 192: line 191 didn't jump to line 192, because the condition on line 191 was never true

192 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

193 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

194 log.debug("Creating child datastore %s", datastore.name) 

195 self.datastores.append(datastore) 

196 

197 # Name ourself based on our children 

198 if self.datastores: 198 ↛ 203: line 198 didn't jump to line 203, because the condition on line 198 was never false

199 # We must set the names explicitly 

200 self._names = [d.name for d in self.datastores] 

201 childNames = ",".join(self.names) 

202 else: 

203 childNames = f"(empty@{time.time()})" 

204 self._names = [childNames] 

205 self.name = f"{type(self).__qualname__}[{childNames}]" 

206 

207 # We declare we are ephemeral if all our child datastores declare 

208 # they are ephemeral 

209 isEphemeral = True 

210 for d in self.datastores: 

211 if not d.isEphemeral: 

212 isEphemeral = False 

213 break 

214 self.isEphemeral = isEphemeral 

215 

216 # per-datastore override constraints 

217 if "datastore_constraints" in self.config: 

218 overrides = self.config["datastore_constraints"] 

219 

220 if len(overrides) != len(self.datastores): 220 ↛ 221: line 220 didn't jump to line 221, because the condition on line 220 was never true

221 raise DatastoreValidationError( 

222 f"Number of registered datastores ({len(self.datastores)})" 

223 " differs from number of constraints overrides" 

224 f" {len(overrides)}" 

225 ) 

226 

227 self.datastoreConstraints = [ 

228 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

229 ] 

230 

231 else: 

232 self.datastoreConstraints = (None,) * len(self.datastores) 

233 

234 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

235 

236 @property 

237 def names(self) -> tuple[str, ...]: 

238 return tuple(self._names) 

239 

240 @property 

241 def roots(self) -> dict[str, ResourcePath | None]: 

242 # Docstring inherited. 

243 roots = {} 

244 for datastore in self.datastores: 

245 roots.update(datastore.roots) 

246 return roots 

247 

248 def __str__(self) -> str: 

249 chainName = ", ".join(str(ds) for ds in self.datastores) 

250 return chainName 

251 

252 def knows(self, ref: DatasetRef) -> bool: 

253 """Check if the dataset is known to any of the datastores. 

254 

255 Does not check for existence of any artifact. 

256 

257 Parameters 

258 ---------- 

259 ref : `DatasetRef` 

260 Reference to the required dataset. 

261 

262 Returns 

263 ------- 

264 exists : `bool` 

265 `True` if the dataset is known to the datastore. 

266 """ 

267 for datastore in self.datastores: 

268 if datastore.knows(ref): 

269 log.debug("%s known to datastore %s", ref, datastore.name) 

270 return True 

271 return False 

272 

273 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

274 # Docstring inherited from the base class. 

275 refs_known: dict[DatasetRef, bool] = {} 

276 for datastore in self.datastores: 

277 refs_known.update(datastore.knows_these(refs)) 

278 

279 # No need to check in next datastore for refs that are known. 

280 # We only update entries that were initially False. 

281 refs = [ref for ref, known in refs_known.items() if not known] 

282 

283 return refs_known 

284 

285 def mexists( 

286 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

287 ) -> dict[DatasetRef, bool]: 

288 """Check the existence of multiple datasets at once. 

289 

290 Parameters 

291 ---------- 

292 refs : iterable of `DatasetRef` 

293 The datasets to be checked. 

294 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

295 Optional mapping of datastore artifact to existence. Updated by 

296 this method with details of all artifacts tested. Can be `None` 

297 if the caller is not interested. 

298 

299 Returns 

300 ------- 

301 existence : `dict` [`DatasetRef`, `bool`]

302 Mapping from dataset to boolean indicating existence in any 

303 of the child datastores. 

304 """ 

305 dataset_existence: dict[DatasetRef, bool] = {} 

306 for datastore in self.datastores: 

307 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

308 

309 # For next datastore no point asking about ones we know 

310 # exist already. No special exemption for ephemeral datastores. 

311 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

312 

313 return dataset_existence 

314 
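# --- Editorial sketch (not part of this module) -----------------------------
# The pattern shared by ``knows_these`` and ``mexists`` above: every child is
# asked in turn, but only about the refs that no earlier child answered "yes"
# for. ``query`` is a hypothetical stand-in for the per-child bulk check and
# must return a mapping of ref -> bool for the refs it is given.
def chained_bulk_check(children, refs, query):
    answers = {}
    remaining = list(refs)
    for child in children:
        answers.update(query(child, remaining))
        remaining = [ref for ref, known in answers.items() if not known]
    return answers

children = [{"a"}, {"b"}]
query = lambda child, refs: {ref: ref in child for ref in refs}
assert chained_bulk_check(children, ["a", "b", "c"], query) == {"a": True, "b": True, "c": False}
# -----------------------------------------------------------------------------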

315 def exists(self, ref: DatasetRef) -> bool: 

316 """Check if the dataset exists in one of the datastores. 

317 

318 Parameters 

319 ---------- 

320 ref : `DatasetRef` 

321 Reference to the required dataset. 

322 

323 Returns 

324 ------- 

325 exists : `bool` 

326 `True` if the entity exists in one of the child datastores. 

327 """ 

328 for datastore in self.datastores: 

329 if datastore.exists(ref): 

330 log.debug("Found %s in datastore %s", ref, datastore.name) 

331 return True 

332 return False 

333 

334 def get( 

335 self, 

336 ref: DatasetRef, 

337 parameters: Mapping[str, Any] | None = None, 

338 storageClass: StorageClass | str | None = None, 

339 ) -> Any: 

340 """Load an InMemoryDataset from the store. 

341 

342 The dataset is returned from the first datastore that has 

343 the dataset. 

344 

345 Parameters 

346 ---------- 

347 ref : `DatasetRef` 

348 Reference to the required Dataset. 

349 parameters : `dict` 

350 `StorageClass`-specific parameters that specify, for example, 

351 a slice of the dataset to be loaded. 

352 storageClass : `StorageClass` or `str`, optional 

353 The storage class to be used to override the Python type 

354 returned by this method. By default the returned type matches 

355 the dataset type definition for this dataset. Specifying a 

356 read `StorageClass` can force a different type to be returned. 

357 This type must be compatible with the original type. 

358 

359 Returns 

360 ------- 

361 inMemoryDataset : `object` 

362 Requested dataset or slice thereof as an InMemoryDataset. 

363 

364 Raises 

365 ------ 

366 FileNotFoundError 

367 Requested dataset can not be retrieved. 

368 TypeError 

369 Return value from formatter has unexpected type. 

370 ValueError 

371 Formatter failed to process the dataset. 

372 """ 

373 for datastore in self.datastores: 

374 try: 

375 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

376 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

377 return inMemoryObject 

378 except FileNotFoundError: 

379 pass 

380 

381 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

382 

383 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

384 """Write a InMemoryDataset with a given `DatasetRef` to each 

385 datastore. 

386 

387 The put() to child datastores can fail with 

388 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

389 deemed to have succeeded so long as at least one child datastore 

390 accepted the inMemoryDataset. 

391 

392 Parameters 

393 ---------- 

394 inMemoryDataset : `object` 

395 The dataset to store. 

396 ref : `DatasetRef` 

397 Reference to the associated Dataset. 

398 

399 Raises 

400 ------ 

401 TypeError 

402 Supplied object and storage class are inconsistent. 

403 DatasetTypeNotSupportedError 

404 All datastores reported `DatasetTypeNotSupportedError`. 

405 """ 

406 log.debug("Put %s", ref) 

407 

408 # Confirm that we can accept this dataset 

409 if not self.constraints.isAcceptable(ref): 

410 # Raise rather than use boolean return value. 

411 raise DatasetTypeNotSupportedError( 

412 f"Dataset {ref} has been rejected by this datastore via configuration." 

413 ) 

414 

415 isPermanent = False 

416 nsuccess = 0 

417 npermanent = 0 

418 nephemeral = 0 

419 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

420 if ( 

421 constraints is not None and not constraints.isAcceptable(ref) 

422 ) or not datastore.constraints.isAcceptable(ref): 

423 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

424 continue 

425 

426 if datastore.isEphemeral: 

427 nephemeral += 1 

428 else: 

429 npermanent += 1 

430 try: 

431 datastore.put(inMemoryDataset, ref) 

432 nsuccess += 1 

433 if not datastore.isEphemeral: 

434 isPermanent = True 

435 except DatasetTypeNotSupportedError: 

436 pass 

437 

438 if nsuccess == 0: 

439 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

440 

441 if not isPermanent and npermanent > 0: 441 ↛ 442: line 441 didn't jump to line 442, because the condition on line 441 was never true

442 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

443 

444 if self._transaction is not None: 

445 self._transaction.registerUndo("put", self.remove, ref) 

446 

447 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

448 # Docstring inherited from base class. 

449 if transfer != "auto": 

450 return transfer 

451 # Ask each datastore what they think auto means 

452 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

453 

454 # Remove any untranslated "auto" values 

455 transfers.discard(transfer) 

456 

457 if len(transfers) == 1: 457 ↛ 458: line 457 didn't jump to line 458, because the condition on line 457 was never true

458 return transfers.pop() 

459 if not transfers: 459 ↛ 463: line 459 didn't jump to line 463, because the condition on line 459 was never false

460 # Everything reported "auto" 

461 return transfer 

462 

463 raise RuntimeError( 

464 "Chained datastore does not yet support different transfer modes" 

465 f" from 'auto' in each child datastore (wanted {transfers})" 

466 ) 

467 
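# --- Editorial sketch (not part of this module) -----------------------------
# How ``_overrideTransferMode`` above reconciles "auto": gather each child's
# interpretation, discard the ones that left it as "auto", and accept the
# result only if the remaining interpretations agree.
def resolve_auto(child_answers):
    transfers = set(child_answers)
    transfers.discard("auto")
    if len(transfers) == 1:
        return transfers.pop()
    if not transfers:
        return "auto"  # every child reported "auto"
    raise RuntimeError(f"children disagree on what 'auto' means: {transfers}")

assert resolve_auto(["auto", "copy", "copy"]) == "copy"
assert resolve_auto(["auto", "auto"]) == "auto"
# -----------------------------------------------------------------------------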

468 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

469 # Docstring inherited from Datastore._prepIngest. 

470 if transfer is None: 

471 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

472 

473 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

474 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

475 if not acceptable: 

476 log.debug( 

477 "Datastore %s skipping ingest via configuration for refs %s", 

478 name, 

479 ", ".join(str(ref) for ref in dataset.refs), 

480 ) 

481 return False 

482 else: 

483 return True 

484 

485 # Filter down to just datasets the chained datastore's own 

486 # configuration accepts. 

487 okForParent: list[FileDataset] = [ 

488 dataset 

489 for dataset in datasets 

490 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

491 ] 

492 

493 # Iterate over nested datastores and call _prepIngest on each. 

494 # Save the results to a list: 

495 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

496 # ...and remember whether all of the failures are due to 

497 # NotImplementedError being raised. 

498 allFailuresAreNotImplementedError = True 

499 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

500 okForChild: list[FileDataset] 

501 if constraints is not None: 

502 okForChild = [ 

503 dataset 

504 for dataset in okForParent 

505 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

506 ] 

507 else: 

508 okForChild = okForParent 

509 try: 

510 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

511 except NotImplementedError: 

512 log.debug( 

513 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

514 datastore.name, 

515 transfer, 

516 ) 

517 continue 

518 allFailuresAreNotImplementedError = False 

519 if okForChild: 

520 # Do not store for later if a datastore has rejected 

521 # everything. 

522 # Include the source paths if this is a "move". It's clearer 

523 # to find the paths now rather than try to infer how 

524 # each datastore has stored them in the internal prep class. 

525 paths = ( 

526 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set() 

527 ) 

528 children.append((datastore, prepDataForChild, paths)) 

529 if allFailuresAreNotImplementedError: 

530 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

531 return _IngestPrepData(children=children) 

532 

533 def _finishIngest( 

534 self, 

535 prepData: _IngestPrepData, 

536 *, 

537 transfer: str | None = None, 

538 record_validation_info: bool = True, 

539 ) -> None: 

540 # Docstring inherited from Datastore._finishIngest. 

541 # For "move" we must use "copy" and then delete the input 

542 # data at the end. This has no rollback option if the ingest 

543 # subsequently fails. If there is only one active datastore 

544 # accepting any files we can leave it as "move" 

545 actual_transfer: str | None 

546 if transfer == "move" and len(prepData.children) > 1: 

547 actual_transfer = "copy" 

548 else: 

549 actual_transfer = transfer 

550 to_be_deleted: set[ResourcePath] = set() 

551 for datastore, prepDataForChild, paths in prepData.children: 

552 datastore._finishIngest( 

553 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

554 ) 

555 to_be_deleted.update(paths) 

556 if actual_transfer != transfer: 

557 # These datasets were copied but now need to be deleted. 

558 # This can not be rolled back. 

559 for uri in to_be_deleted: 

560 uri.remove() 

561 
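# --- Editorial sketch (not part of this module) -----------------------------
# Why ``_finishIngest`` above downgrades "move" to "copy": with more than one
# child accepting files, a true move into the first child would leave nothing
# for the later children, so every child copies and the source files are
# deleted only after all children have finished. ``plan_ingest_transfer`` is
# a hypothetical helper for illustration.
def plan_ingest_transfer(requested, n_children_accepting):
    if requested == "move" and n_children_accepting > 1:
        return "copy", True  # copy into each child, then delete the sources
    return requested, False

assert plan_ingest_transfer("move", 3) == ("copy", True)
assert plan_ingest_transfer("move", 1) == ("move", False)
assert plan_ingest_transfer("copy", 3) == ("copy", False)
# -----------------------------------------------------------------------------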

562 def getManyURIs( 

563 self, 

564 refs: Iterable[DatasetRef], 

565 predict: bool = False, 

566 allow_missing: bool = False, 

567 ) -> dict[DatasetRef, DatasetRefURIs]: 

568 # Docstring inherited 

569 

570 uris: dict[DatasetRef, DatasetRefURIs] = {} 

571 missing_refs = set(refs) 

572 

573 # If predict is True we don't want to predict a dataset in the first 

574 # datastore if it actually exists in a later datastore, so in that 

575 # case check all datastores with predict=False first, and then try 

576 # again with predict=True. 

577 for p in (False, True) if predict else (False,): 

578 if not missing_refs: 

579 break 

580 for datastore in self.datastores: 

581 try: 

582 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

583 except NotImplementedError: 

584 # some datastores may not implement generating URIs 

585 continue 

586 missing_refs -= got_uris.keys() 

587 uris.update(got_uris) 

588 if not missing_refs: 

589 break 

590 

591 if missing_refs and not allow_missing: 

592 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

593 

594 return uris 

595 
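# --- Editorial sketch (not part of this module) -----------------------------
# The two-pass lookup used by ``getManyURIs`` above: when prediction is
# requested, all children are first consulted without prediction so that a
# real URI in a later child beats a predicted URI from an earlier one.
# ``lookup`` is a hypothetical stand-in returning {ref: uri} for the refs a
# child can answer under the given predict flag.
def chained_many_uris(children, refs, predict, lookup):
    uris, missing = {}, set(refs)
    for use_predict in (False, True) if predict else (False,):
        for child in children:
            found = lookup(child, missing, use_predict)
            uris.update(found)
            missing -= found.keys()
            if not missing:
                return uris
    return uris

children = [{"a": "store1/a"}, {"b": "store2/b"}]
lookup = lambda child, refs, predict: {r: child[r] for r in refs if r in child}
assert chained_many_uris(children, ["a", "b"], False, lookup) == {"a": "store1/a", "b": "store2/b"}
# -----------------------------------------------------------------------------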

596 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

597 """Return URIs associated with dataset. 

598 

599 Parameters 

600 ---------- 

601 ref : `DatasetRef` 

602 Reference to the required dataset. 

603 predict : `bool`, optional 

604 If the datastore does not know about the dataset, should it 

605 return a predicted URI or not? 

606 

607 Returns 

608 ------- 

609 uris : `DatasetRefURIs` 

610 The URI to the primary artifact associated with this dataset (if 

611 the dataset was disassembled within the datastore this may be 

612 `None`), and the URIs to any components associated with the dataset 

613 artifact (can be empty if there are no components).

614 

615 Notes 

616 ----- 

617 The returned URI is from the first datastore in the list that has 

618 the dataset with preference given to the first dataset coming from 

619 a permanent datastore. If no datastores have the dataset and prediction 

620 is allowed, the predicted URI for the first datastore in the list will 

621 be returned. 

622 """ 

623 log.debug("Requesting URIs for %s", ref) 

624 predictedUri: DatasetRefURIs | None = None 

625 predictedEphemeralUri: DatasetRefURIs | None = None 

626 firstEphemeralUri: DatasetRefURIs | None = None 

627 for datastore in self.datastores: 

628 if datastore.exists(ref): 

629 if not datastore.isEphemeral: 

630 uri = datastore.getURIs(ref) 

631 log.debug("Retrieved non-ephemeral URI: %s", uri) 

632 return uri 

633 elif not firstEphemeralUri: 

634 firstEphemeralUri = datastore.getURIs(ref) 

635 elif predict: 

636 if not predictedUri and not datastore.isEphemeral: 

637 predictedUri = datastore.getURIs(ref, predict) 

638 elif not predictedEphemeralUri and datastore.isEphemeral: 

639 predictedEphemeralUri = datastore.getURIs(ref, predict) 

640 

641 if firstEphemeralUri: 

642 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

643 return firstEphemeralUri 

644 

645 if predictedUri: 

646 log.debug("Retrieved predicted URI: %s", predictedUri) 

647 return predictedUri 

648 

649 if predictedEphemeralUri: 

650 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

651 return predictedEphemeralUri 

652 

653 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 

654 
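# --- Editorial sketch (not part of this module) -----------------------------
# The preference order implemented by ``getURIs`` above: an existing artifact
# in a permanent child wins, then an existing artifact in an ephemeral child,
# then (only if prediction is allowed) a predicted permanent URI, and finally
# a predicted ephemeral URI. The four arguments are the first candidate found
# in each category, or None; ``pick_uri`` is a hypothetical helper.
def pick_uri(existing_permanent, existing_ephemeral, predicted_permanent, predicted_ephemeral):
    for candidate in (existing_permanent, existing_ephemeral, predicted_permanent, predicted_ephemeral):
        if candidate is not None:
            return candidate
    raise FileNotFoundError("dataset not in any datastore")

assert pick_uri(None, "mem://a", "file://a#predicted", None) == "mem://a"
# -----------------------------------------------------------------------------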

655 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

656 """URI to the Dataset. 

657 

658 The returned URI is from the first datastore in the list that has 

659 the dataset with preference given to the first dataset coming from 

660 a permanent datastore. If no datastores have the dataset and prediction 

661 is allowed, the predicted URI for the first datastore in the list will 

662 be returned. 

663 

664 Parameters 

665 ---------- 

666 ref : `DatasetRef` 

667 Reference to the required Dataset. 

668 predict : `bool` 

669 If `True`, allow URIs to be returned of datasets that have not 

670 been written. 

671 

672 Returns 

673 ------- 

674 uri : `lsst.resources.ResourcePath` 

675 URI pointing to the dataset within the datastore. If the 

676 dataset does not exist in the datastore, and if ``predict`` is 

677 `True`, the URI will be a prediction and will include a URI 

678 fragment "#predicted". 

679 

680 Notes 

681 ----- 

682 If the datastore does not have entities that relate well 

683 to the concept of a URI the returned URI string will be 

684 descriptive. The returned URI is not guaranteed to be obtainable. 

685 

686 Raises 

687 ------ 

688 FileNotFoundError 

689 A URI has been requested for a dataset that does not exist and 

690 guessing is not allowed. 

691 RuntimeError 

692 Raised if a request is made for a single URI but multiple URIs 

693 are associated with this dataset. 

694 """ 

695 log.debug("Requesting URI for %s", ref) 

696 primary, components = self.getURIs(ref, predict) 

697 if primary is None or components: 697 ↛ 698: line 697 didn't jump to line 698, because the condition on line 697 was never true

698 raise RuntimeError( 

699 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

700 ) 

701 return primary 

702 

703 def retrieveArtifacts( 

704 self, 

705 refs: Iterable[DatasetRef], 

706 destination: ResourcePath, 

707 transfer: str = "auto", 

708 preserve_path: bool = True, 

709 overwrite: bool = False, 

710 ) -> list[ResourcePath]: 

711 """Retrieve the file artifacts associated with the supplied refs. 

712 

713 Parameters 

714 ---------- 

715 refs : iterable of `DatasetRef` 

716 The datasets for which file artifacts are to be retrieved. 

717 A single ref can result in multiple files. The refs must 

718 be resolved. 

719 destination : `lsst.resources.ResourcePath` 

720 Location to write the file artifacts. 

721 transfer : `str`, optional 

722 Method to use to transfer the artifacts. Must be one of the options 

723 supported by `lsst.resources.ResourcePath.transfer_from()`. 

724 "move" is not allowed. 

725 preserve_path : `bool`, optional 

726 If `True` the full path of the file artifact within the datastore 

727 is preserved. If `False` the final file component of the path 

728 is used. 

729 overwrite : `bool`, optional 

730 If `True` allow transfers to overwrite existing files at the 

731 destination. 

732 

733 Returns 

734 ------- 

735 targets : `list` of `lsst.resources.ResourcePath` 

736 URIs of file artifacts in destination location. Order is not 

737 preserved. 

738 """ 

739 if not destination.isdir(): 739 ↛ 740: line 739 didn't jump to line 740, because the condition on line 739 was never true

740 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

741 

742 # Using getURIs is not feasible since it becomes difficult to 

743 # determine the path within the datastore later on. For now 

744 # follow getURIs implementation approach. 

745 

746 pending = set(refs) 

747 

748 # There is a question as to whether an exception should be raised 

749 # early if some of the refs are missing, or whether files should be 

750 # transferred until a problem is hit. Prefer to complain up front. 

751 # Use the datastore integer as primary key. 

752 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

753 

754 for number, datastore in enumerate(self.datastores): 

755 if datastore.isEphemeral: 

756 # In the future we will want to distinguish in-memory from 

757 # caching datastore since using an on-disk local 

758 # cache is exactly what we should be doing. 

759 continue 

760 try: 

761 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

762 except NotImplementedError: 

763 # Some datastores may not support retrieving artifacts 

764 continue 

765 

766 if datastore_refs: 

767 grouped_by_datastore[number] = datastore_refs 

768 

769 # Remove these from the pending list so that we do not bother 

770 # looking for them any more. 

771 pending = pending - datastore_refs 

772 

773 if pending: 773 ↛ 774: line 773 didn't jump to line 774, because the condition on line 773 was never true

774 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

775 

776 # Now do the transfer. 

777 targets: list[ResourcePath] = [] 

778 for number, datastore_refs in grouped_by_datastore.items(): 

779 targets.extend( 

780 self.datastores[number].retrieveArtifacts( 

781 datastore_refs, 

782 destination, 

783 transfer=transfer, 

784 preserve_path=preserve_path, 

785 overwrite=overwrite, 

786 ) 

787 ) 

788 

789 return targets 

790 

791 def remove(self, ref: DatasetRef) -> None: 

792 """Indicate to the datastore that a dataset can be removed. 

793 

794 The dataset will be removed from each datastore. The dataset is 

795 not required to exist in every child datastore. 

796 

797 Parameters 

798 ---------- 

799 ref : `DatasetRef` 

800 Reference to the required dataset. 

801 

802 Raises 

803 ------ 

804 FileNotFoundError 

805 Attempt to remove a dataset that does not exist. Raised if none 

806 of the child datastores removed the dataset. 

807 """ 

808 log.debug("Removing %s", ref) 

809 self.trash(ref, ignore_errors=False) 

810 self.emptyTrash(ignore_errors=False) 

811 

812 def forget(self, refs: Iterable[DatasetRef]) -> None: 

813 for datastore in tuple(self.datastores): 

814 datastore.forget(refs) 

815 

816 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

817 if isinstance(ref, DatasetRef): 

818 ref_label = str(ref) 

819 else: 

820 ref_label = "bulk datasets" 

821 

822 log.debug("Trashing %s", ref_label) 

823 

824 counter = 0 

825 for datastore in self.datastores: 

826 try: 

827 datastore.trash(ref, ignore_errors=ignore_errors) 

828 counter += 1 

829 except FileNotFoundError: 

830 pass 

831 

832 if counter == 0: 

833 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

834 if ignore_errors: 834 ↛ 835: line 834 didn't jump to line 835, because the condition on line 834 was never true

835 log.warning(err_msg) 

836 else: 

837 raise FileNotFoundError(err_msg) 

838 

839 def emptyTrash(self, ignore_errors: bool = True) -> None: 

840 for datastore in self.datastores: 

841 datastore.emptyTrash(ignore_errors=ignore_errors) 

842 

843 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

844 """Retrieve a dataset from an input `Datastore`, 

845 and store the result in this `Datastore`. 

846 

847 Parameters 

848 ---------- 

849 inputDatastore : `Datastore` 

850 The external `Datastore` from which to retrieve the Dataset.

851 ref : `DatasetRef` 

852 Reference to the required dataset in the input data store. 

853 

854 Notes

855 -----

856 The dataset is read from the input datastore with ``get()`` and then

857 stored in this chained datastore with ``put()``. Nothing is

858 returned.

859 """ 

860 assert inputDatastore is not self # unless we want it for renames? 

861 inMemoryDataset = inputDatastore.get(ref) 

862 self.put(inMemoryDataset, ref) 

863 

864 def validateConfiguration( 

865 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

866 ) -> None: 

867 """Validate some of the configuration for this datastore. 

868 

869 Parameters 

870 ---------- 

871 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

872 Entities to test against this configuration. Can be differing 

873 types. 

874 logFailures : `bool`, optional 

875 If `True`, output a log message for every validation error 

876 detected. 

877 

878 Raises 

879 ------ 

880 DatastoreValidationError 

881 Raised if there is a validation problem with a configuration. 

882 All the problems are reported in a single exception. 

883 

884 Notes 

885 ----- 

886 This method checks each datastore in turn. 

887 """ 

888 # Need to catch each of the datastore outputs and ensure that 

889 # all are tested. 

890 failures = [] 

891 for datastore in self.datastores: 

892 try: 

893 datastore.validateConfiguration(entities, logFailures=logFailures) 

894 except DatastoreValidationError as e: 

895 if logFailures: 895 ↛ 897: line 895 didn't jump to line 897, because the condition on line 895 was never false

896 log.critical("Datastore %s failed validation", datastore.name) 

897 failures.append(f"Datastore {self.name}: {e}") 

898 

899 if failures: 

900 msg = ";\n".join(failures) 

901 raise DatastoreValidationError(msg) 

902 

903 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

904 # Docstring is inherited from base class 

905 failures = [] 

906 for datastore in self.datastores: 

907 try: 

908 datastore.validateKey(lookupKey, entity) 

909 except DatastoreValidationError as e: 

910 failures.append(f"Datastore {self.name}: {e}") 

911 

912 if failures: 

913 msg = ";\n".join(failures) 

914 raise DatastoreValidationError(msg) 

915 

916 def getLookupKeys(self) -> set[LookupKey]: 

917 # Docstring is inherited from base class 

918 keys = set() 

919 for datastore in self.datastores: 

920 keys.update(datastore.getLookupKeys()) 

921 

922 keys.update(self.constraints.getLookupKeys()) 

923 for p in self.datastoreConstraints: 

924 if p is not None: 924 ↛ 923: line 924 didn't jump to line 923, because the condition on line 924 was never false

925 keys.update(p.getLookupKeys()) 

926 

927 return keys 

928 

929 def needs_expanded_data_ids( 

930 self, 

931 transfer: str | None, 

932 entity: DatasetRef | DatasetType | StorageClass | None = None, 

933 ) -> bool: 

934 # Docstring inherited. 

935 # We can't safely use `self.datastoreConstraints` with `entity` to 

936 # check whether a child datastore would even want to ingest this 

937 # dataset, because we don't want to filter out datastores that might 

938 # need an expanded data ID based on incomplete information (e.g. we

939 # pass a StorageClass, but the constraint dispatches on DatasetType). 

940 # So we pessimistically check if any datastore would need an expanded 

941 # data ID for this transfer mode. 

942 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

943 

944 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

945 # Docstring inherited from the base class. 

946 

947 for datastore in self.datastores: 

948 datastore.import_records(data) 

949 

950 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

951 # Docstring inherited from the base class. 

952 

953 all_records: dict[str, DatastoreRecordData] = {} 

954 

955 # Merge all sub-datastore records into one structure 

956 for datastore in self.datastores: 

957 sub_records = datastore.export_records(refs) 

958 for name, record_data in sub_records.items(): 

959 # All datastore names must be unique in a chain. 

960 if name in all_records: 960 ↛ 961: line 960 didn't jump to line 961, because the condition on line 960 was never true

961 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")

962 all_records[name] = record_data 

963 

964 return all_records 

965 

966 def export( 

967 self, 

968 refs: Iterable[DatasetRef], 

969 *, 

970 directory: ResourcePathExpression | None = None, 

971 transfer: str | None = "auto", 

972 ) -> Iterable[FileDataset]: 

973 # Docstring inherited from Datastore.export. 

974 if transfer == "auto" and directory is None: 

975 transfer = None 

976 

977 if transfer is not None and directory is None: 

978 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

979 

980 if transfer == "move": 

981 raise TypeError("Can not export by moving files out of datastore.") 

982 

983 # Exporting from a chain has the potential for a dataset to be 

984 # in one or more of the datastores in the chain. We only need one 

985 # of them since we assume the datasets are the same in all (but 

986 # the file format could be different of course since that is a 

987 # per-datastore configuration). 

988 # We also do not know whether any of the datastores in the chain 

989 # support file export. 

990 

991 # Ensure we have an ordered sequence that is not an iterator or set. 

992 if not isinstance(refs, Sequence): 

993 refs = list(refs) 

994 

995 # If any of the datasets are missing entirely we need to raise early 

996 # before we try to run the export. This can be a little messy but is 

997 # better than exporting files from the first datastore and then finding 

998 # that one is missing but is not in the second datastore either. 

999 known = [datastore.knows_these(refs) for datastore in self.datastores] 

1000 refs_known: set[DatasetRef] = set() 

1001 for known_to_this in known: 

1002 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

1003 missing_count = len(refs) - len(refs_known) 

1004 if missing_count: 

1005 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1006 

1007 # To allow us to slot each result into the right place after 

1008 # asking each datastore, create a dict with the index. 

1009 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1010 

1011 # Presize the final export list. 

1012 exported: list[FileDataset | None] = [None] * len(refs) 

1013 

1014 # The order of the returned dataset has to match the order of the 

1015 # given refs, even if they are all from different datastores. 

1016 for i, datastore in enumerate(self.datastores): 

1017 known_to_this = known[i] 

1018 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1019 

1020 try: 

1021 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1022 except NotImplementedError: 

1023 # Try the next datastore. 

1024 continue 

1025 

1026 for ref, export in zip(filtered, this_export, strict=True): 

1027 # Get the position and also delete it from the list. 

1028 exported[ref_positions.pop(ref)] = export 

1029 

1030 # Every dataset should be accounted for because of the earlier checks 

1031 # but make sure that we did fill all the slots to appease mypy. 

1032 for i, dataset in enumerate(exported): 

1033 if dataset is None: 1033 ↛ 1034: line 1033 didn't jump to line 1034, because the condition on line 1033 was never true

1034 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1035 yield dataset 

1036 
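# --- Editorial sketch (not part of this module) -----------------------------
# The order-preserving merge used by ``export`` above: results come back one
# child datastore at a time, but the output must follow the order of the
# input refs, so each ref remembers its slot and results are dropped into
# place as they arrive. ``per_child_results`` is a hypothetical stand-in for
# the per-child export() calls.
def merge_in_order(refs, per_child_results):
    positions = {ref: i for i, ref in enumerate(refs)}
    merged = [None] * len(refs)
    for results in per_child_results:
        for ref, value in results.items():
            merged[positions.pop(ref)] = value
    return merged

assert merge_in_order(["a", "b", "c"], [{"b": 2}, {"a": 1, "c": 3}]) == [1, 2, 3]
# -----------------------------------------------------------------------------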

1037 def transfer_from( 

1038 self, 

1039 source_datastore: Datastore, 

1040 refs: Iterable[DatasetRef], 

1041 transfer: str = "auto", 

1042 artifact_existence: dict[ResourcePath, bool] | None = None, 

1043 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1044 # Docstring inherited 

1045 # mypy does not understand "type(self) is not type(source)" 

1046 if isinstance(source_datastore, ChainedDatastore): 

1047 # Both the source and destination are chained datastores. 

1048 source_datastores = tuple(source_datastore.datastores) 

1049 else: 

1050 # The source datastore is different, forward everything to the 

1051 # child datastores. 

1052 source_datastores = (source_datastore,) 

1053 

1054 # Need to know the set of all possible refs that could be transferred. 

1055 remaining_refs = set(refs) 

1056 

1057 missing_from_source: set[DatasetRef] | None = None 

1058 all_accepted = set() 

1059 nsuccess = 0 

1060 for source_child in source_datastores: 

1061 # If we are reading from a chained datastore, it's possible that 

1062 # only a subset of the datastores know about the dataset. We can't 

1063 # ask the receiving datastore to copy it when it doesn't exist 

1064 # so we have to filter again based on what the source datastore 

1065 # understands. 

1066 known_to_source = source_child.knows_these(list(refs)) 

1067 

1068 # Need to know that there is a possibility that some of these 

1069 # datasets exist but are unknown to the source datastore if 

1070 # trust is enabled. 

1071 if getattr(source_child, "trustGetRequest", False): 

1072 unknown = [ref for ref, known in known_to_source.items() if not known] 

1073 existence = source_child.mexists(unknown, artifact_existence) 

1074 for ref, exists in existence.items(): 

1075 known_to_source[ref] = exists 

1076 

1077 missing = {ref for ref, known in known_to_source.items() if not known} 

1078 if missing: 

1079 if missing_from_source is None: 

1080 missing_from_source = missing 

1081 else: 

1082 missing_from_source &= missing 

1083 

1084 # Try to transfer from each source datastore to each child 

1085 # datastore. Have to make sure we don't transfer something 

1086 # we've already transferred to this destination on later passes. 

1087 

1088 # Filter the initial list based on the datasets we have 

1089 # not yet transferred. 

1090 these_refs = [] 

1091 for ref in refs: 

1092 if ref in remaining_refs and known_to_source[ref]: 

1093 these_refs.append(ref) 

1094 

1095 if not these_refs: 

1096 # Already transferred all datasets known to this datastore. 

1097 continue 

1098 

1099 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

1100 if constraints is not None: 1100 ↛ 1108: line 1100 didn't jump to line 1108, because the condition on line 1100 was never false

1101 filtered_refs = [] 

1102 for ref in these_refs: 

1103 if constraints.isAcceptable(ref): 

1104 filtered_refs.append(ref) 

1105 else: 

1106 log.debug("Rejecting ref by constraints: %s", ref) 

1107 else: 

1108 filtered_refs = list(these_refs) 

1109 try: 

1110 accepted, _ = datastore.transfer_from( 

1111 source_child, filtered_refs, transfer, artifact_existence 

1112 ) 

1113 except (TypeError, NotImplementedError): 

1114 # The datastores were incompatible. 

1115 continue 

1116 else: 

1117 nsuccess += 1 

1118 

1119 # Remove the accepted datasets from those remaining. 

1120 remaining_refs = remaining_refs - accepted 

1121 

1122 # Keep track of everything we have accepted. 

1123 all_accepted.update(accepted) 

1124 

1125 if missing_from_source: 

1126 for ref in missing_from_source: 

1127 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1128 

1129 if nsuccess == 0: 1129 ↛ 1130: line 1129 didn't jump to line 1130, because the condition on line 1129 was never true

1130 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1131 

1132 return all_accepted, remaining_refs