Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 87%

462 statements  

coverage.py v7.4.0, created at 2024-01-25 10:48 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Chained datastore.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("ChainedDatastore",) 

33 

34import itertools 

35import logging 

36import time 

37import warnings 

38from collections.abc import Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any 

40 

41from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, FileDataset 

42from lsst.daf.butler.datastore import ( 

43 DatasetRefURIs, 

44 Datastore, 

45 DatastoreConfig, 

46 DatastoreOpaqueTable, 

47 DatastoreValidationError, 

48) 

49from lsst.daf.butler.datastore.constraints import Constraints 

50from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

51from lsst.resources import ResourcePath 

52from lsst.utils import doImportType 

53 

54if TYPE_CHECKING: 

55 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

56 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

57 from lsst.resources import ResourcePathExpression 

58 

59log = logging.getLogger(__name__) 

60 

61 

62class _IngestPrepData(Datastore.IngestPrepData): 

63 """Helper class for ChainedDatastore ingest implementation. 

64 

65 Parameters 

66 ---------- 

67 children : `list` of `tuple` 

68 Triples of `Datastore`, its `IngestPrepData`, and the set of source `ResourcePath` objects, for all child datastores. 

69 """ 

70 

71 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

72 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

73 self.children = children 

74 

75 

76class ChainedDatastore(Datastore): 

77 """Chained Datastores to allow read and writes from multiple datastores. 

78 

79 A ChainedDatastore is configured with multiple datastore configurations. 

80 A ``put()`` is sent to every datastore whose constraints accept the dataset. A ``get()`` 

81 operation is sent to each datastore in turn and the first datastore 

82 to return a valid dataset is used. 

83 

84 Parameters 

85 ---------- 

86 config : `DatastoreConfig` or `str` 

87 Configuration. This configuration must include a ``datastores`` field 

88 as a sequence of datastore configurations. The order in this sequence 

89 indicates the order to use for read operations. 

90 bridgeManager : `DatastoreRegistryBridgeManager` 

91 Object that manages the interface between `Registry` and datastores. 

92 datastores : `list` [`Datastore`] 

93 All the child datastores known to this datastore. 

94 

95 Notes 

96 ----- 

97 ChainedDatastore never supports `None` as an `ingest` transfer mode. A 

98 `"move"` is performed as a copy into each accepting child followed by removal 

99 of the input files. `"copy"`, `"symlink"`, `"relsymlink"` and `"hardlink"` are supported as long as at least one child datastore supports them. 

100 """ 

101 

102 defaultConfigFile = "datastores/chainedDatastore.yaml" 

103 """Path to configuration defaults. Accessed within the ``configs`` resource 

104 or relative to a search path. Can be `None` if no defaults are specified. 

105 """ 

106 

107 containerKey = "datastores" 

108 """Key to specify where child datastores are configured.""" 

109 

110 datastores: list[Datastore] 

111 """All the child datastores known to this datastore.""" 

112 

113 datastoreConstraints: Sequence[Constraints | None] 

114 """Constraints to be applied to each of the child datastores.""" 

115 

116 @classmethod 

117 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

118 """Set any filesystem-dependent config options for child Datastores to 

119 be appropriate for a new empty repository with the given root. 

120 

121 Parameters 

122 ---------- 

123 root : `str` 

124 Filesystem path to the root of the data repository. 

125 config : `Config` 

126 A `Config` to update. Only the subset understood by 

127 this component will be updated. Will not expand 

128 defaults. 

129 full : `Config` 

130 A complete config with all defaults expanded that can be 

131 converted to a `DatastoreConfig`. Read-only and will not be 

132 modified by this method. 

133 Repository-specific options that should not be obtained 

134 from defaults when Butler instances are constructed 

135 should be copied from ``full`` to ``config``. 

136 overwrite : `bool`, optional 

137 If `False`, do not modify a value in ``config`` if the value 

138 already exists. Default is always to overwrite with the provided 

139 ``root``. 

140 

141 Notes 

142 ----- 

143 If a keyword is explicitly defined in the supplied ``config`` it 

144 will not be overridden by this method if ``overwrite`` is `False`. 

145 This allows explicit values set in external configs to be retained. 

146 """ 

147 # Extract the part of the config we care about updating 

148 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

149 

150 # And the subset of the full config that we can use for reference. 

151 # Do not bother with defaults because we are told this already has 

152 # them. 

153 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

154 

155 # Loop over each datastore config and pass the subsets to the 

156 # child datastores to process. 

157 

158 containerKey = cls.containerKey 

159 for idx, (child, fullChild) in enumerate( 

160 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True) 

161 ): 

162 childConfig = DatastoreConfig(child, mergeDefaults=False) 

163 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

164 datastoreClass = doImportType(fullChildConfig["cls"]) 

165 if not issubclass(datastoreClass, Datastore): 165 ↛ 166 (condition on line 165 was never true)

166 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

167 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

168 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

169 

170 # Reattach to parent 

171 datastoreConfig[containerKey, idx] = childConfig 

172 

173 # Reattach modified datastore config to parent 

174 # If this has a datastore key we attach there, otherwise we assume 

175 # this information goes at the top of the config hierarchy. 

176 if DatastoreConfig.component in config: 

177 config[DatastoreConfig.component] = datastoreConfig 

178 else: 

179 config.update(datastoreConfig) 

180 

181 return 
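
# A sketch of the per-child root naming used by setConfigRoot() above: each
# child datastore is given a subdirectory of the repository root derived from
# its class name and its position in the chain (the class names used in the
# example are illustrative only).
def _sketch_child_roots(root, child_class_names):
    return [f"{root}/{name}_{idx}" for idx, name in enumerate(child_class_names)]

assert _sketch_child_roots("/repo", ["FileDatastore", "InMemoryDatastore"]) == [
    "/repo/FileDatastore_0",
    "/repo/InMemoryDatastore_1",
]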

182 

183 def __init__( 

184 self, 

185 config: DatastoreConfig, 

186 bridgeManager: DatastoreRegistryBridgeManager, 

187 datastores: list[Datastore], 

188 ): 

189 super().__init__(config, bridgeManager) 

190 

191 self.datastores = list(datastores) 

192 

193 # Name ourself based on our children 

194 if self.datastores: 194 ↛ 199 (condition on line 194 was never false)

195 # We must set the names explicitly 

196 self._names = [d.name for d in self.datastores] 

197 childNames = ",".join(self.names) 

198 else: 

199 childNames = f"(empty@{time.time()})" 

200 self._names = [childNames] 

201 self.name = f"{type(self).__qualname__}[{childNames}]" 

202 

203 # We declare we are ephemeral if all our child datastores declare 

204 # they are ephemeral 

205 self.isEphemeral = all(d.isEphemeral for d in self.datastores) 

206 

207 # per-datastore override constraints 

208 if "datastore_constraints" in self.config: 

209 overrides = self.config["datastore_constraints"] 

210 

211 if len(overrides) != len(self.datastores): 211 ↛ 212 (condition on line 211 was never true)

212 raise DatastoreValidationError( 

213 f"Number of registered datastores ({len(self.datastores)})" 

214 " differs from number of constraints overrides" 

215 f" {len(overrides)}" 

216 ) 

217 

218 self.datastoreConstraints = [ 

219 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

220 ] 

221 

222 else: 

223 self.datastoreConstraints = (None,) * len(self.datastores) 

224 

225 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

226 

227 @classmethod 

228 def _create_from_config( 

229 cls, 

230 config: DatastoreConfig, 

231 bridgeManager: DatastoreRegistryBridgeManager, 

232 butlerRoot: ResourcePathExpression | None, 

233 ) -> ChainedDatastore: 

234 # Scan for child datastores and instantiate them with the same registry 

235 datastores = [] 

236 for c in config["datastores"]: 

237 c = DatastoreConfig(c) 

238 datastoreType = doImportType(c["cls"]) 

239 if not issubclass(datastoreType, Datastore): 239 ↛ 240 (condition on line 239 was never true)

240 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

241 datastore = datastoreType._create_from_config(c, bridgeManager, butlerRoot=butlerRoot) 

242 log.debug("Creating child datastore %s", datastore.name) 

243 datastores.append(datastore) 

244 

245 return ChainedDatastore(config, bridgeManager, datastores) 

246 

247 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore: 

248 datastores = [ds.clone(bridgeManager) for ds in self.datastores] 

249 return ChainedDatastore(self.config, bridgeManager, datastores) 

250 

251 @property 

252 def names(self) -> tuple[str, ...]: 

253 return tuple(self._names) 

254 

255 @property 

256 def roots(self) -> dict[str, ResourcePath | None]: 

257 # Docstring inherited. 

258 roots = {} 

259 for datastore in self.datastores: 

260 roots.update(datastore.roots) 

261 return roots 

262 

263 def __str__(self) -> str: 

264 chainName = ", ".join(str(ds) for ds in self.datastores) 

265 return chainName 

266 

267 def knows(self, ref: DatasetRef) -> bool: 

268 """Check if the dataset is known to any of the datastores. 

269 

270 Does not check for existence of any artifact. 

271 

272 Parameters 

273 ---------- 

274 ref : `DatasetRef` 

275 Reference to the required dataset. 

276 

277 Returns 

278 ------- 

279 exists : `bool` 

280 `True` if the dataset is known to the datastore. 

281 """ 

282 for datastore in self.datastores: 

283 if datastore.knows(ref): 

284 log.debug("%s known to datastore %s", ref, datastore.name) 

285 return True 

286 return False 

287 

288 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

289 # Docstring inherited from the base class. 

290 refs_known: dict[DatasetRef, bool] = {} 

291 for datastore in self.datastores: 

292 refs_known.update(datastore.knows_these(refs)) 

293 

294 # No need to check in next datastore for refs that are known. 

295 # We only update entries that were initially False. 

296 refs = [ref for ref, known in refs_known.items() if not known] 

297 

298 return refs_known 

299 

300 def mexists( 

301 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

302 ) -> dict[DatasetRef, bool]: 

303 """Check the existence of multiple datasets at once. 

304 

305 Parameters 

306 ---------- 

307 refs : iterable of `DatasetRef` 

308 The datasets to be checked. 

309 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

310 Optional mapping of datastore artifact to existence. Updated by 

311 this method with details of all artifacts tested. Can be `None` 

312 if the caller is not interested. 

313 

314 Returns 

315 ------- 

316 existence : `dict` of [`DatasetRef`, `bool`] 

317 Mapping from dataset to boolean indicating existence in any 

318 of the child datastores. 

319 """ 

320 dataset_existence: dict[DatasetRef, bool] = {} 

321 for datastore in self.datastores: 

322 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

323 

324 # For next datastore no point asking about ones we know 

325 # exist already. No special exemption for ephemeral datastores. 

326 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

327 

328 return dataset_existence 
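
# A sketch of the narrowing pattern used by knows_these() and mexists() above:
# each child datastore is only asked about the refs that previous children did
# not already report as known/existing (plain sets stand in for datastores).
def _sketch_narrowing(children: list[set[str]], refs: list[str]) -> dict[str, bool]:
    results: dict[str, bool] = {}
    remaining = list(refs)
    for child in children:  # child stands in for one datastore's holdings
        results.update({ref: ref in child for ref in remaining})
        remaining = [ref for ref, known in results.items() if not known]
    return results

assert _sketch_narrowing([{"a"}, {"b"}], ["a", "b", "c"]) == {"a": True, "b": True, "c": False}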

329 

330 def exists(self, ref: DatasetRef) -> bool: 

331 """Check if the dataset exists in one of the datastores. 

332 

333 Parameters 

334 ---------- 

335 ref : `DatasetRef` 

336 Reference to the required dataset. 

337 

338 Returns 

339 ------- 

340 exists : `bool` 

341 `True` if the entity exists in one of the child datastores. 

342 """ 

343 for datastore in self.datastores: 

344 if datastore.exists(ref): 

345 log.debug("Found %s in datastore %s", ref, datastore.name) 

346 return True 

347 return False 

348 

349 def get( 

350 self, 

351 ref: DatasetRef, 

352 parameters: Mapping[str, Any] | None = None, 

353 storageClass: StorageClass | str | None = None, 

354 ) -> Any: 

355 """Load an InMemoryDataset from the store. 

356 

357 The dataset is returned from the first datastore that has 

358 the dataset. 

359 

360 Parameters 

361 ---------- 

362 ref : `DatasetRef` 

363 Reference to the required Dataset. 

364 parameters : `dict` 

365 `StorageClass`-specific parameters that specify, for example, 

366 a slice of the dataset to be loaded. 

367 storageClass : `StorageClass` or `str`, optional 

368 The storage class to be used to override the Python type 

369 returned by this method. By default the returned type matches 

370 the dataset type definition for this dataset. Specifying a 

371 read `StorageClass` can force a different type to be returned. 

372 This type must be compatible with the original type. 

373 

374 Returns 

375 ------- 

376 inMemoryDataset : `object` 

377 Requested dataset or slice thereof as an InMemoryDataset. 

378 

379 Raises 

380 ------ 

381 FileNotFoundError 

382 Requested dataset can not be retrieved. 

383 TypeError 

384 Return value from formatter has unexpected type. 

385 ValueError 

386 Formatter failed to process the dataset. 

387 """ 

388 for datastore in self.datastores: 

389 try: 

390 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

391 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

392 return inMemoryObject 

393 except FileNotFoundError: 

394 pass 

395 

396 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

397 

398 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

399 """Write a InMemoryDataset with a given `DatasetRef` to each 

400 datastore. 

401 

402 The put() to child datastores can fail with 

403 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

404 deemed to have succeeded so long as at least one child datastore 

405 accepted the inMemoryDataset. 

406 

407 Parameters 

408 ---------- 

409 inMemoryDataset : `object` 

410 The dataset to store. 

411 ref : `DatasetRef` 

412 Reference to the associated Dataset. 

413 

414 Raises 

415 ------ 

416 TypeError 

417 Supplied object and storage class are inconsistent. 

418 DatasetTypeNotSupportedError 

419 All datastores reported `DatasetTypeNotSupportedError`. 

420 """ 

421 log.debug("Put %s", ref) 

422 

423 # Confirm that we can accept this dataset 

424 if not self.constraints.isAcceptable(ref): 

425 # Raise rather than use boolean return value. 

426 raise DatasetTypeNotSupportedError( 

427 f"Dataset {ref} has been rejected by this datastore via configuration." 

428 ) 

429 

430 isPermanent = False 

431 nsuccess = 0 

432 npermanent = 0 

433 nephemeral = 0 

434 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

435 if ( 

436 constraints is not None and not constraints.isAcceptable(ref) 

437 ) or not datastore.constraints.isAcceptable(ref): 

438 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

439 continue 

440 

441 if datastore.isEphemeral: 

442 nephemeral += 1 

443 else: 

444 npermanent += 1 

445 try: 

446 datastore.put(inMemoryDataset, ref) 

447 nsuccess += 1 

448 if not datastore.isEphemeral: 

449 isPermanent = True 

450 except DatasetTypeNotSupportedError: 

451 pass 

452 

453 if nsuccess == 0: 

454 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

455 

456 if not isPermanent and npermanent > 0: 456 ↛ 457 (condition on line 456 was never true)

457 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

458 

459 if self._transaction is not None: 

460 self._transaction.registerUndo("put", self.remove, ref) 

461 

462 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

463 # Docstring inherited from base class. 

464 log.debug("Put %s", ref) 

465 

466 # Confirm that we can accept this dataset 

467 if not self.constraints.isAcceptable(ref): 

468 # Raise rather than use boolean return value. 

469 raise DatasetTypeNotSupportedError( 

470 f"Dataset {ref} has been rejected by this datastore via configuration." 

471 ) 

472 

473 isPermanent = False 

474 nsuccess = 0 

475 npermanent = 0 

476 nephemeral = 0 

477 stored_refs: dict[str, DatasetRef] = {} 

478 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

479 if ( 

480 constraints is not None and not constraints.isAcceptable(ref) 

481 ) or not datastore.constraints.isAcceptable(ref): 

482 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

483 continue 

484 

485 if datastore.isEphemeral: 

486 nephemeral += 1 

487 else: 

488 npermanent += 1 

489 try: 

490 stored_ref_map = datastore.put_new(in_memory_dataset, ref) 

491 stored_refs.update(stored_ref_map) 

492 nsuccess += 1 

493 if not datastore.isEphemeral: 

494 isPermanent = True 

495 except DatasetTypeNotSupportedError: 

496 pass 

497 

498 if nsuccess == 0: 

499 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

500 

501 if not isPermanent and npermanent > 0: 

502 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

503 

504 if self._transaction is not None: 

505 self._transaction.registerUndo("put", self.remove, ref) 

506 

507 return stored_refs 
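
# A sketch of the acceptance bookkeeping in put()/put_new() above: the chained
# put succeeds if at least one child accepts the dataset, and a warning is due
# when permanent children were tried but only ephemeral ones accepted it
# (simplified boolean stand-ins, not the real Datastore API).
def _sketch_put_outcome(children: list[tuple[bool, bool]]) -> str:
    # each child is (accepts, is_ephemeral)
    nsuccess = sum(1 for accepts, _ in children if accepts)
    if nsuccess == 0:
        return "error: no child accepted the dataset"
    permanent_tried = any(not eph for _, eph in children)
    permanent_ok = any(accepts and not eph for accepts, eph in children)
    if permanent_tried and not permanent_ok:
        return "warning: only ephemeral children accepted the dataset"
    return "ok"

assert _sketch_put_outcome([(False, False), (True, True)]) == (
    "warning: only ephemeral children accepted the dataset"
)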

508 

509 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

510 # Docstring inherited from base class. 

511 if transfer != "auto": 

512 return transfer 

513 # Ask each datastore what they think auto means 

514 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

515 

516 # Remove any untranslated "auto" values 

517 transfers.discard(transfer) 

518 

519 if len(transfers) == 1: 519 ↛ 520 (condition on line 519 was never true)

520 return transfers.pop() 

521 if not transfers: 521 ↛ 525 (condition on line 521 was never false)

522 # Everything reported "auto" 

523 return transfer 

524 

525 raise RuntimeError( 

526 "Chained datastore does not yet support different transfer modes" 

527 f" from 'auto' in each child datastore (wanted {transfers})" 

528 ) 
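
# A sketch of the "auto" resolution in _overrideTransferMode() above: every
# child proposes a concrete transfer mode, and the chain can only proceed if
# they all agree (or all defer to "auto").
def _sketch_resolve_auto(child_choices: list[str]) -> str:
    concrete = set(child_choices) - {"auto"}
    if len(concrete) == 1:
        return concrete.pop()
    if not concrete:
        return "auto"
    raise RuntimeError(f"children disagree on transfer mode: {concrete}")

assert _sketch_resolve_auto(["auto", "copy", "copy"]) == "copy"
assert _sketch_resolve_auto(["auto", "auto"]) == "auto"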

529 

530 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

531 # Docstring inherited from Datastore._prepIngest. 

532 if transfer is None: 

533 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

534 

535 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

536 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

537 if not acceptable: 

538 log.debug( 

539 "Datastore %s skipping ingest via configuration for refs %s", 

540 name, 

541 ", ".join(str(ref) for ref in dataset.refs), 

542 ) 

543 return False 

544 else: 

545 return True 

546 

547 # Filter down to just datasets the chained datastore's own 

548 # configuration accepts. 

549 okForParent: list[FileDataset] = [ 

550 dataset 

551 for dataset in datasets 

552 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

553 ] 

554 

555 # Iterate over nested datastores and call _prepIngest on each. 

556 # Save the results to a list: 

557 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

558 # ...and remember whether all of the failures are due to 

559 # NotImplementedError being raised. 

560 allFailuresAreNotImplementedError = True 

561 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

562 okForChild: list[FileDataset] 

563 if constraints is not None: 

564 okForChild = [ 

565 dataset 

566 for dataset in okForParent 

567 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

568 ] 

569 else: 

570 okForChild = okForParent 

571 try: 

572 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

573 except NotImplementedError: 

574 log.debug( 

575 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

576 datastore.name, 

577 transfer, 

578 ) 

579 continue 

580 allFailuresAreNotImplementedError = False 

581 if okForChild: 

582 # Do not store for later if a datastore has rejected 

583 # everything. 

584 # Include the source paths if this is a "move". It's clearer 

585 # to find the paths now rather than try to infer how 

586 # each datastore has stored them in the internal prep class. 

587 paths = ( 

588 {ResourcePath(dataset.path, forceDirectory=False) for dataset in okForChild} 

589 if transfer == "move" 

590 else set() 

591 ) 

592 children.append((datastore, prepDataForChild, paths)) 

593 if allFailuresAreNotImplementedError: 

594 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

595 return _IngestPrepData(children=children) 

596 

597 def _finishIngest( 

598 self, 

599 prepData: _IngestPrepData, 

600 *, 

601 transfer: str | None = None, 

602 record_validation_info: bool = True, 

603 ) -> None: 

604 # Docstring inherited from Datastore._finishIngest. 

605 # For "move" we must use "copy" and then delete the input 

606 # data at the end. This has no rollback option if the ingest 

607 # subsequently fails. If there is only one active datastore 

608 accepting any files we can leave it as "move". 

609 actual_transfer: str | None 

610 if transfer == "move" and len(prepData.children) > 1: 

611 actual_transfer = "copy" 

612 else: 

613 actual_transfer = transfer 

614 to_be_deleted: set[ResourcePath] = set() 

615 for datastore, prepDataForChild, paths in prepData.children: 

616 datastore._finishIngest( 

617 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

618 ) 

619 to_be_deleted.update(paths) 

620 if actual_transfer != transfer: 

621 # These datasets were copied but now need to be deleted. 

622 # This can not be rolled back. 

623 for uri in to_be_deleted: 

624 uri.remove() 
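
# A sketch of the move handling in _finishIngest() above: with more than one
# receiving child a "move" is downgraded to "copy" per child, and the source
# files are removed only after every child has its copy.
def _sketch_move_ingest(transfer: str, n_children: int) -> tuple[str, bool]:
    actual = "copy" if transfer == "move" and n_children > 1 else transfer
    delete_sources_afterwards = actual != transfer
    return actual, delete_sources_afterwards

assert _sketch_move_ingest("move", 2) == ("copy", True)
assert _sketch_move_ingest("move", 1) == ("move", False)
assert _sketch_move_ingest("copy", 3) == ("copy", False)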

625 

626 def getManyURIs( 

627 self, 

628 refs: Iterable[DatasetRef], 

629 predict: bool = False, 

630 allow_missing: bool = False, 

631 ) -> dict[DatasetRef, DatasetRefURIs]: 

632 # Docstring inherited 

633 

634 uris: dict[DatasetRef, DatasetRefURIs] = {} 

635 missing_refs = set(refs) 

636 

637 # If predict is True we don't want to predict a dataset in the first 

638 # datastore if it actually exists in a later datastore, so in that 

639 # case check all datastores with predict=False first, and then try 

640 # again with predict=True. 

641 for p in (False, True) if predict else (False,): 

642 if not missing_refs: 

643 break 

644 for datastore in self.datastores: 

645 try: 

646 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

647 except NotImplementedError: 

648 # some datastores may not implement generating URIs 

649 continue 

650 missing_refs -= got_uris.keys() 

651 uris.update(got_uris) 

652 if not missing_refs: 

653 break 

654 

655 if missing_refs and not allow_missing: 

656 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

657 

658 return uris 
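
# A sketch of the two-pass lookup in getManyURIs() above: all datastores are
# first consulted without prediction, so an artifact that actually exists in a
# later datastore is preferred over a predicted URI from an earlier one (the
# dict-of-strings stores and "predicted://" scheme are illustrative only).
def _sketch_two_pass_lookup(refs: list[str], stores: list[dict[str, str]], predict: bool) -> dict[str, str]:
    found: dict[str, str] = {}
    missing = set(refs)
    for use_predict in (False, True) if predict else (False,):
        for store in stores:
            for ref in sorted(missing):
                if ref in store:
                    found[ref] = store[ref]
                    missing.discard(ref)
                elif use_predict:
                    found[ref] = f"predicted://{ref}"
                    missing.discard(ref)
    return found

assert _sketch_two_pass_lookup(["a", "b", "c"], [{"a": "s0://a"}, {"b": "s1://b"}], True) == {
    "a": "s0://a",
    "b": "s1://b",
    "c": "predicted://c",
}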

659 

660 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

661 """Return URIs associated with dataset. 

662 

663 Parameters 

664 ---------- 

665 ref : `DatasetRef` 

666 Reference to the required dataset. 

667 predict : `bool`, optional 

668 If the datastore does not know about the dataset, controls whether 

669 it should return a predicted URI or not. 

670 

671 Returns 

672 ------- 

673 uris : `DatasetRefURIs` 

674 The URI to the primary artifact associated with this dataset (if 

675 the dataset was disassembled within the datastore this may be 

676 `None`), and the URIs to any components associated with the dataset 

677 artifact (can be empty if there are no components). 

678 

679 Notes 

680 ----- 

681 The returned URI is from the first datastore in the list that has 

682 the dataset with preference given to the first dataset coming from 

683 a permanent datastore. If no datastores have the dataset and prediction 

684 is allowed, the predicted URI for the first datastore in the list will 

685 be returned. 

686 """ 

687 log.debug("Requesting URIs for %s", ref) 

688 predictedUri: DatasetRefURIs | None = None 

689 predictedEphemeralUri: DatasetRefURIs | None = None 

690 firstEphemeralUri: DatasetRefURIs | None = None 

691 for datastore in self.datastores: 

692 if datastore.exists(ref): 

693 if not datastore.isEphemeral: 

694 uri = datastore.getURIs(ref) 

695 log.debug("Retrieved non-ephemeral URI: %s", uri) 

696 return uri 

697 elif not firstEphemeralUri: 

698 firstEphemeralUri = datastore.getURIs(ref) 

699 elif predict: 

700 if not predictedUri and not datastore.isEphemeral: 

701 predictedUri = datastore.getURIs(ref, predict) 

702 elif not predictedEphemeralUri and datastore.isEphemeral: 

703 predictedEphemeralUri = datastore.getURIs(ref, predict) 

704 

705 if firstEphemeralUri: 

706 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

707 return firstEphemeralUri 

708 

709 if predictedUri: 

710 log.debug("Retrieved predicted URI: %s", predictedUri) 

711 return predictedUri 

712 

713 if predictedEphemeralUri: 

714 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

715 return predictedEphemeralUri 

716 

717 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 
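
# A sketch of the preference order applied by getURIs() above: an existing
# artifact in a permanent child wins, then an existing ephemeral artifact,
# then a predicted permanent URI, then a predicted ephemeral URI (candidate
# tuples are illustrative stand-ins, not the real DatasetRefURIs objects).
def _sketch_uri_preference(candidates):
    # each candidate is (exists, is_ephemeral, uri), in datastore order
    for want_exists, want_ephemeral in ((True, False), (True, True), (False, False), (False, True)):
        for exists, ephemeral, uri in candidates:
            if exists == want_exists and ephemeral == want_ephemeral:
                return uri
    return None

assert _sketch_uri_preference([(False, False, "predicted-perm"), (True, True, "real-ephemeral")]) == (
    "real-ephemeral"
)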

718 

719 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

720 """URI to the Dataset. 

721 

722 The returned URI is from the first datastore in the list that has 

723 the dataset with preference given to the first dataset coming from 

724 a permanent datastore. If no datastores have the dataset and prediction 

725 is allowed, the predicted URI for the first datastore in the list will 

726 be returned. 

727 

728 Parameters 

729 ---------- 

730 ref : `DatasetRef` 

731 Reference to the required Dataset. 

732 predict : `bool` 

733 If `True`, allow URIs to be returned of datasets that have not 

734 been written. 

735 

736 Returns 

737 ------- 

738 uri : `lsst.resources.ResourcePath` 

739 URI pointing to the dataset within the datastore. If the 

740 dataset does not exist in the datastore, and if ``predict`` is 

741 `True`, the URI will be a prediction and will include a URI 

742 fragment "#predicted". 

743 

744 Notes 

745 ----- 

746 If the datastore does not have entities that relate well 

747 to the concept of a URI the returned URI string will be 

748 descriptive. The returned URI is not guaranteed to be obtainable. 

749 

750 Raises 

751 ------ 

752 FileNotFoundError 

753 A URI has been requested for a dataset that does not exist and 

754 guessing is not allowed. 

755 RuntimeError 

756 Raised if a request is made for a single URI but multiple URIs 

757 are associated with this dataset. 

758 """ 

759 log.debug("Requesting URI for %s", ref) 

760 primary, components = self.getURIs(ref, predict) 

761 if primary is None or components: 761 ↛ 762 (condition on line 761 was never true)

762 raise RuntimeError( 

763 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

764 ) 

765 return primary 

766 

767 def retrieveArtifacts( 

768 self, 

769 refs: Iterable[DatasetRef], 

770 destination: ResourcePath, 

771 transfer: str = "auto", 

772 preserve_path: bool = True, 

773 overwrite: bool = False, 

774 ) -> list[ResourcePath]: 

775 """Retrieve the file artifacts associated with the supplied refs. 

776 

777 Parameters 

778 ---------- 

779 refs : iterable of `DatasetRef` 

780 The datasets for which file artifacts are to be retrieved. 

781 A single ref can result in multiple files. The refs must 

782 be resolved. 

783 destination : `lsst.resources.ResourcePath` 

784 Location to write the file artifacts. 

785 transfer : `str`, optional 

786 Method to use to transfer the artifacts. Must be one of the options 

787 supported by `lsst.resources.ResourcePath.transfer_from()`. 

788 "move" is not allowed. 

789 preserve_path : `bool`, optional 

790 If `True` the full path of the file artifact within the datastore 

791 is preserved. If `False` the final file component of the path 

792 is used. 

793 overwrite : `bool`, optional 

794 If `True` allow transfers to overwrite existing files at the 

795 destination. 

796 

797 Returns 

798 ------- 

799 targets : `list` of `lsst.resources.ResourcePath` 

800 URIs of file artifacts in destination location. Order is not 

801 preserved. 

802 """ 

803 if not destination.isdir(): 803 ↛ 804 (condition on line 803 was never true)

804 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

805 

806 # Using getURIs is not feasible since it becomes difficult to 

807 # determine the path within the datastore later on. For now 

808 # follow getURIs implementation approach. 

809 

810 pending = set(refs) 

811 

812 # There is a question as to whether an exception should be raised 

813 # early if some of the refs are missing, or whether files should be 

814 # transferred until a problem is hit. Prefer to complain up front. 

815 # Use the datastore integer as primary key. 

816 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

817 

818 for number, datastore in enumerate(self.datastores): 

819 if datastore.isEphemeral: 

820 # In the future we will want to distinguish in-memory from 

821 # caching datastores since using an on-disk local 

822 # cache is exactly what we should be doing. 

823 continue 

824 try: 

825 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

826 except NotImplementedError: 

827 # Some datastores may not support retrieving artifacts 

828 continue 

829 

830 if datastore_refs: 

831 grouped_by_datastore[number] = datastore_refs 

832 

833 # Remove these from the pending list so that we do not bother 

834 # looking for them any more. 

835 pending = pending - datastore_refs 

836 

837 if pending: 837 ↛ 838 (condition on line 837 was never true)

838 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

839 

840 # Now do the transfer. 

841 targets: list[ResourcePath] = [] 

842 for number, datastore_refs in grouped_by_datastore.items(): 

843 targets.extend( 

844 self.datastores[number].retrieveArtifacts( 

845 datastore_refs, 

846 destination, 

847 transfer=transfer, 

848 preserve_path=preserve_path, 

849 overwrite=overwrite, 

850 ) 

851 ) 

852 

853 return targets 

854 

855 def remove(self, ref: DatasetRef) -> None: 

856 """Indicate to the datastore that a dataset can be removed. 

857 

858 The dataset will be removed from each datastore. The dataset is 

859 not required to exist in every child datastore. 

860 

861 Parameters 

862 ---------- 

863 ref : `DatasetRef` 

864 Reference to the required dataset. 

865 

866 Raises 

867 ------ 

868 FileNotFoundError 

869 Attempt to remove a dataset that does not exist. Raised if none 

870 of the child datastores removed the dataset. 

871 """ 

872 log.debug("Removing %s", ref) 

873 self.trash(ref, ignore_errors=False) 

874 self.emptyTrash(ignore_errors=False) 

875 

876 def forget(self, refs: Iterable[DatasetRef]) -> None: 

877 for datastore in tuple(self.datastores): 

878 datastore.forget(refs) 

879 

880 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

881 if isinstance(ref, DatasetRef): 

882 ref_label = str(ref) 

883 else: 

884 ref_label = "bulk datasets" 

885 

886 log.debug("Trashing %s", ref_label) 

887 

888 counter = 0 

889 for datastore in self.datastores: 

890 try: 

891 datastore.trash(ref, ignore_errors=ignore_errors) 

892 counter += 1 

893 except FileNotFoundError: 

894 pass 

895 

896 if counter == 0: 

897 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

898 if ignore_errors: 898 ↛ 899 (condition on line 898 was never true)

899 log.warning(err_msg) 

900 else: 

901 raise FileNotFoundError(err_msg) 

902 

903 def emptyTrash(self, ignore_errors: bool = True) -> None: 

904 for datastore in self.datastores: 

905 datastore.emptyTrash(ignore_errors=ignore_errors) 

906 

907 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

908 """Retrieve a dataset from an input `Datastore`, 

909 and store the result in this `Datastore`. 

910 

911 Parameters 

912 ---------- 

913 inputDatastore : `Datastore` 

914 The external `Datastore` from which to retrieve the Dataset. 

915 ref : `DatasetRef` 

916 Reference to the required dataset in the input data store. 

917 

918 Notes 

919 ----- 

920 The dataset is read with ``get()`` from the input datastore and 

921 stored with ``put()`` in every accepting child datastore; this 

922 method returns nothing. 

923 """ 

924 assert inputDatastore is not self # unless we want it for renames? 

925 inMemoryDataset = inputDatastore.get(ref) 

926 self.put(inMemoryDataset, ref) 

927 

928 def validateConfiguration( 

929 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

930 ) -> None: 

931 """Validate some of the configuration for this datastore. 

932 

933 Parameters 

934 ---------- 

935 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

936 Entities to test against this configuration. Can be differing 

937 types. 

938 logFailures : `bool`, optional 

939 If `True`, output a log message for every validation error 

940 detected. 

941 

942 Raises 

943 ------ 

944 DatastoreValidationError 

945 Raised if there is a validation problem with a configuration. 

946 All the problems are reported in a single exception. 

947 

948 Notes 

949 ----- 

950 This method checks each datastore in turn. 

951 """ 

952 # Need to catch each of the datastore outputs and ensure that 

953 # all are tested. 

954 failures = [] 

955 for datastore in self.datastores: 

956 try: 

957 datastore.validateConfiguration(entities, logFailures=logFailures) 

958 except DatastoreValidationError as e: 

959 if logFailures: 959 ↛ 961 (condition on line 959 was never false)

960 log.critical("Datastore %s failed validation", datastore.name) 

961 failures.append(f"Datastore {self.name}: {e}") 

962 

963 if failures: 

964 msg = ";\n".join(failures) 

965 raise DatastoreValidationError(msg) 

966 

967 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

968 # Docstring is inherited from base class 

969 failures = [] 

970 for datastore in self.datastores: 

971 try: 

972 datastore.validateKey(lookupKey, entity) 

973 except DatastoreValidationError as e: 

974 failures.append(f"Datastore {self.name}: {e}") 

975 

976 if failures: 

977 msg = ";\n".join(failures) 

978 raise DatastoreValidationError(msg) 

979 

980 def getLookupKeys(self) -> set[LookupKey]: 

981 # Docstring is inherited from base class 

982 keys = set() 

983 for datastore in self.datastores: 

984 keys.update(datastore.getLookupKeys()) 

985 

986 keys.update(self.constraints.getLookupKeys()) 

987 for p in self.datastoreConstraints: 

988 if p is not None: 988 ↛ 987 (condition on line 988 was never false)

989 keys.update(p.getLookupKeys()) 

990 

991 return keys 

992 

993 def needs_expanded_data_ids( 

994 self, 

995 transfer: str | None, 

996 entity: DatasetRef | DatasetType | StorageClass | None = None, 

997 ) -> bool: 

998 # Docstring inherited. 

999 # We can't safely use `self.datastoreConstraints` with `entity` to 

1000 # check whether a child datastore would even want to ingest this 

1001 # dataset, because we don't want to filter out datastores that might 

1002 # need an expanded data ID based on incomplete information (e.g. we 

1003 # pass a StorageClass, but the constraint dispatches on DatasetType). 

1004 # So we pessimistically check if any datastore would need an expanded 

1005 # data ID for this transfer mode. 

1006 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

1007 

1008 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

1009 # Docstring inherited from the base class. 

1010 

1011 for datastore in self.datastores: 

1012 datastore.import_records(data) 

1013 

1014 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

1015 # Docstring inherited from the base class. 

1016 

1017 all_records: dict[str, DatastoreRecordData] = {} 

1018 

1019 # Merge all sub-datastore records into one structure 

1020 for datastore in self.datastores: 

1021 sub_records = datastore.export_records(refs) 

1022 for name, record_data in sub_records.items(): 

1023 # All datastore names must be unique in a chain. 

1024 if name in all_records: 1024 ↛ 1025 (condition on line 1024 was never true)

1025 raise ValueError(f"Non-unique datastore name found in datastore {datastore}") 

1026 all_records[name] = record_data 

1027 

1028 return all_records 

1029 

1030 def export( 

1031 self, 

1032 refs: Iterable[DatasetRef], 

1033 *, 

1034 directory: ResourcePathExpression | None = None, 

1035 transfer: str | None = "auto", 

1036 ) -> Iterable[FileDataset]: 

1037 # Docstring inherited from Datastore.export. 

1038 if transfer == "auto" and directory is None: 

1039 transfer = None 

1040 

1041 if transfer is not None and directory is None: 

1042 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

1043 

1044 if transfer == "move": 

1045 raise TypeError("Can not export by moving files out of datastore.") 

1046 

1047 # Exporting from a chain has the potential for a dataset to be 

1048 # in one or more of the datastores in the chain. We only need one 

1049 # of them since we assume the datasets are the same in all (but 

1050 # the file format could be different of course since that is a 

1051 # per-datastore configuration). 

1052 # We also do not know whether any of the datastores in the chain 

1053 # support file export. 

1054 

1055 # Ensure we have an ordered sequence that is not an iterator or set. 

1056 if not isinstance(refs, Sequence): 

1057 refs = list(refs) 

1058 

1059 # If any of the datasets are missing entirely we need to raise early 

1060 # before we try to run the export. This can be a little messy but is 

1061 # better than exporting files from the first datastore and then finding 

1062 # that one is missing but is not in the second datastore either. 

1063 known = [datastore.knows_these(refs) for datastore in self.datastores] 

1064 refs_known: set[DatasetRef] = set() 

1065 for known_to_this in known: 

1066 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

1067 missing_count = len(refs) - len(refs_known) 

1068 if missing_count: 

1069 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1070 

1071 # To allow us to slot each result into the right place after 

1072 # asking each datastore, create a dict with the index. 

1073 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1074 

1075 # Presize the final export list. 

1076 exported: list[FileDataset | None] = [None] * len(refs) 

1077 

1078 # The order of the returned dataset has to match the order of the 

1079 # given refs, even if they are all from different datastores. 

1080 for i, datastore in enumerate(self.datastores): 

1081 known_to_this = known[i] 

1082 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1083 

1084 try: 

1085 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1086 except NotImplementedError: 

1087 # Try the next datastore. 

1088 continue 

1089 

1090 for ref, export in zip(filtered, this_export, strict=True): 

1091 # Get the position and also delete it from the list. 

1092 exported[ref_positions.pop(ref)] = export 

1093 

1094 # Every dataset should be accounted for because of the earlier checks 

1095 # but make sure that we did fill all the slots to appease mypy. 

1096 for i, dataset in enumerate(exported): 

1097 if dataset is None: 1097 ↛ 1098 (condition on line 1097 was never true)

1098 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1099 yield dataset 
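
# A sketch of the order-preserving merge in export() above: results may come
# from different child datastores, but they are slotted back into the caller's
# ref order via an index map (plain strings stand in for refs and exports).
def _sketch_ordered_merge(refs: list[str], per_child: list[dict[str, str]]) -> list[str]:
    positions = {ref: i for i, ref in enumerate(refs)}
    out: list[str | None] = [None] * len(refs)
    for child_results in per_child:
        for ref, value in child_results.items():
            if ref in positions:
                out[positions.pop(ref)] = value
    assert all(v is not None for v in out), "every ref must be exported by some child"
    return [v for v in out if v is not None]

assert _sketch_ordered_merge(["a", "b"], [{"b": "B"}, {"a": "A"}]) == ["A", "B"]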

1100 

1101 def transfer_from( 

1102 self, 

1103 source_datastore: Datastore, 

1104 refs: Iterable[DatasetRef], 

1105 transfer: str = "auto", 

1106 artifact_existence: dict[ResourcePath, bool] | None = None, 

1107 dry_run: bool = False, 

1108 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1109 # Docstring inherited 

1110 # mypy does not understand "type(self) is not type(source)" 

1111 if isinstance(source_datastore, ChainedDatastore): 

1112 # Both the source and destination are chained datastores. 

1113 source_datastores = tuple(source_datastore.datastores) 

1114 else: 

1115 # The source datastore is different, forward everything to the 

1116 # child datastores. 

1117 source_datastores = (source_datastore,) 

1118 

1119 # Need to know the set of all possible refs that could be transferred. 

1120 remaining_refs = set(refs) 

1121 

1122 missing_from_source: set[DatasetRef] | None = None 

1123 all_accepted = set() 

1124 nsuccess = 0 

1125 for source_child in source_datastores: 

1126 # If we are reading from a chained datastore, it's possible that 

1127 # only a subset of the datastores know about the dataset. We can't 

1128 # ask the receiving datastore to copy it when it doesn't exist 

1129 # so we have to filter again based on what the source datastore 

1130 # understands. 

1131 known_to_source = source_child.knows_these(list(refs)) 

1132 

1133 # Need to know that there is a possibility that some of these 

1134 # datasets exist but are unknown to the source datastore if 

1135 # trust is enabled. 

1136 if getattr(source_child, "trustGetRequest", False): 

1137 unknown = [ref for ref, known in known_to_source.items() if not known] 

1138 existence = source_child.mexists(unknown, artifact_existence) 

1139 for ref, exists in existence.items(): 

1140 known_to_source[ref] = exists 

1141 

1142 missing = {ref for ref, known in known_to_source.items() if not known} 

1143 if missing: 

1144 if missing_from_source is None: 

1145 missing_from_source = missing 

1146 else: 

1147 missing_from_source &= missing 

1148 

1149 # Try to transfer from each source datastore to each child 

1150 # datastore. Have to make sure we don't transfer something 

1151 # we've already transferred to this destination on later passes. 

1152 

1153 # Filter the initial list based on the datasets we have 

1154 # not yet transferred. 

1155 these_refs = [] 

1156 for ref in refs: 

1157 if ref in remaining_refs and known_to_source[ref]: 

1158 these_refs.append(ref) 

1159 

1160 if not these_refs: 

1161 # Already transferred all datasets known to this datastore. 

1162 continue 

1163 

1164 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

1165 if constraints is not None: 1165 ↛ 1173 (condition on line 1165 was never false)

1166 filtered_refs = [] 

1167 for ref in these_refs: 

1168 if constraints.isAcceptable(ref): 

1169 filtered_refs.append(ref) 

1170 else: 

1171 log.debug("Rejecting ref by constraints: %s", ref) 

1172 else: 

1173 filtered_refs = list(these_refs) 

1174 try: 

1175 accepted, _ = datastore.transfer_from( 

1176 source_child, 

1177 filtered_refs, 

1178 transfer, 

1179 artifact_existence, 

1180 dry_run=dry_run, 

1181 ) 

1182 except (TypeError, NotImplementedError): 

1183 # The datastores were incompatible. 

1184 continue 

1185 else: 

1186 nsuccess += 1 

1187 

1188 # Remove the accepted datasets from those remaining. 

1189 remaining_refs = remaining_refs - accepted 

1190 

1191 # Keep track of everything we have accepted. 

1192 all_accepted.update(accepted) 

1193 

1194 if missing_from_source: 

1195 for ref in missing_from_source: 

1196 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1197 

1198 if nsuccess == 0: 1198 ↛ 1199 (condition on line 1198 was never true)

1199 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1200 

1201 return all_accepted, remaining_refs 
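
# A sketch of the missing-dataset accounting in transfer_from() above: the
# per-source-child sets of missing refs are intersected (children missing
# nothing are skipped), so only refs absent from every contributing child are
# reported as having no file artifacts.
def _sketch_missing_from_source(missing_per_child):
    missing_overall = None
    for missing in missing_per_child:
        if missing:
            missing_overall = set(missing) if missing_overall is None else missing_overall & missing
    return missing_overall or set()

assert _sketch_missing_from_source([{"a", "c"}, {"c"}]) == {"c"}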

1202 

1203 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

1204 # Docstring inherited from the base class. 

1205 tables: dict[str, DatastoreOpaqueTable] = {} 

1206 for datastore in self.datastores: 

1207 tables.update(datastore.get_opaque_table_definitions()) 

1208 return tables