Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 86%

455 statements  

coverage.py v7.3.2, created at 2023-12-05 11:05 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Chained datastore.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("ChainedDatastore",) 

33 

34import itertools 

35import logging 

36import time 

37import warnings 

38from collections.abc import Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any 

40 

41from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, FileDataset 

42from lsst.daf.butler.datastore import ( 

43 DatasetRefURIs, 

44 Datastore, 

45 DatastoreConfig, 

46 DatastoreOpaqueTable, 

47 DatastoreValidationError, 

48) 

49from lsst.daf.butler.datastore.constraints import Constraints 

50from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

51from lsst.resources import ResourcePath 

52from lsst.utils import doImportType 

53 

54if TYPE_CHECKING: 

55 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

56 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

57 from lsst.resources import ResourcePathExpression 

58 

59log = logging.getLogger(__name__) 

60 

61 

62class _IngestPrepData(Datastore.IngestPrepData): 

63 """Helper class for ChainedDatastore ingest implementation. 

64 

65 Parameters 

66 ---------- 

67 children : `list` of `tuple` 

68 Triples of `Datastore`, `IngestPrepData`, and the set of source `ResourcePath` for all child datastores. 

69 """ 

70 

71 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

72 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

73 self.children = children 

74 

75 

76class ChainedDatastore(Datastore): 

77 """Chained Datastores to allow read and writes from multiple datastores. 

78 

79 A ChainedDatastore is configured with multiple datastore configurations. 

80 A ``put()`` is always sent to each datastore. A ``get()`` 

81 operation is sent to each datastore in turn and the first datastore 

82 to return a valid dataset is used. 

83 

84 Parameters 

85 ---------- 

86 config : `DatastoreConfig` or `str` 

87 Configuration. This configuration must include a ``datastores`` field 

88 as a sequence of datastore configurations. The order in this sequence 

89 indicates the order to use for read operations. 

90 bridgeManager : `DatastoreRegistryBridgeManager` 

91 Object that manages the interface between `Registry` and datastores. 

92 butlerRoot : `str`, optional 

93 New datastore root to use to override the configuration value. This 

94 root is sent to each child datastore. 

95 

96 Notes 

97 ----- 

98 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer 

99 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` 

100 and `"hardlink"` if and only if all its child datastores do. 

101 """ 

102 

103 defaultConfigFile = "datastores/chainedDatastore.yaml" 

104 """Path to configuration defaults. Accessed within the ``configs`` resource 

105 or relative to a search path. Can be None if no defaults specified. 

106 """ 

107 

108 containerKey = "datastores" 

109 """Key to specify where child datastores are configured.""" 

110 

111 datastores: list[Datastore] 

112 """All the child datastores known to this datastore.""" 

113 

114 datastoreConstraints: Sequence[Constraints | None] 

115 """Constraints to be applied to each of the child datastores.""" 

116 

117 @classmethod 

118 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

119 """Set any filesystem-dependent config options for child Datastores to 

120 be appropriate for a new empty repository with the given root. 

121 

122 Parameters 

123 ---------- 

124 root : `str` 

125 Filesystem path to the root of the data repository. 

126 config : `Config` 

127 A `Config` to update. Only the subset understood by 

128 this component will be updated. Will not expand 

129 defaults. 

130 full : `Config` 

131 A complete config with all defaults expanded that can be 

132 converted to a `DatastoreConfig`. Read-only and will not be 

133 modified by this method. 

134 Repository-specific options that should not be obtained 

135 from defaults when Butler instances are constructed 

136 should be copied from ``full`` to ``config``. 

137 overwrite : `bool`, optional 

138 If `False`, do not modify a value in ``config`` if the value 

139 already exists. Default is always to overwrite with the provided 

140 ``root``. 

141 

142 Notes 

143 ----- 

144 If a keyword is explicitly defined in the supplied ``config`` it 

145 will not be overridden by this method if ``overwrite`` is `False`. 

146 This allows explicit values set in external configs to be retained. 

147 """ 

148 # Extract the part of the config we care about updating 

149 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

150 

151 # And the subset of the full config that we can use for reference. 

152 # Do not bother with defaults because we are told this already has 

153 # them. 

154 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

155 

156 # Loop over each datastore config and pass the subsets to the 

157 # child datastores to process. 

158 

159 containerKey = cls.containerKey 

160 for idx, (child, fullChild) in enumerate( 

161 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True) 

162 ): 

163 childConfig = DatastoreConfig(child, mergeDefaults=False) 

164 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

165 datastoreClass = doImportType(fullChildConfig["cls"]) 

166 if not issubclass(datastoreClass, Datastore):  [branch 166 ↛ 167 not taken: condition never true] 

167 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

168 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

169 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

170 

171 # Reattach to parent 

172 datastoreConfig[containerKey, idx] = childConfig 

173 

174 # Reattach modified datastore config to parent 

175 # If this has a datastore key we attach there, otherwise we assume 

176 # this information goes at the top of the config hierarchy. 

177 if DatastoreConfig.component in config: 

178 config[DatastoreConfig.component] = datastoreConfig 

179 else: 

180 config.update(datastoreConfig) 

181 

182 return 
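
# Editor's note (illustrative, not part of the module): given root "/repo" and
# two children [FileDatastore, InMemoryDatastore], the loop above derives a
# distinct root per child from its class name and position, e.g.
#
#     /repo/FileDatastore_0
#     /repo/InMemoryDatastore_1
#
# before reattaching each updated child config to the parent configuration.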

183 

184 def __init__( 

185 self, 

186 config: Config | ResourcePathExpression, 

187 bridgeManager: DatastoreRegistryBridgeManager, 

188 butlerRoot: str | None = None, 

189 ): 

190 super().__init__(config, bridgeManager) 

191 

192 # Scan for child datastores and instantiate them with the same registry 

193 self.datastores = [] 

194 for c in self.config["datastores"]: 

195 c = DatastoreConfig(c) 

196 datastoreType = doImportType(c["cls"]) 

197 if not issubclass(datastoreType, Datastore):  [branch 197 ↛ 198 not taken: condition never true] 

198 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

199 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

200 log.debug("Creating child datastore %s", datastore.name) 

201 self.datastores.append(datastore) 

202 

203 # Name ourself based on our children 

204 if self.datastores:  [branch 204 ↛ 209 not taken: condition never false] 

205 # We must set the names explicitly 

206 self._names = [d.name for d in self.datastores] 

207 childNames = ",".join(self.names) 

208 else: 

209 childNames = f"(empty@{time.time()})" 

210 self._names = [childNames] 

211 self.name = f"{type(self).__qualname__}[{childNames}]" 

212 

213 # We declare we are ephemeral if all our child datastores declare 

214 # they are ephemeral 

215 self.isEphemeral = all(d.isEphemeral for d in self.datastores) 

216 

217 # per-datastore override constraints 

218 if "datastore_constraints" in self.config: 

219 overrides = self.config["datastore_constraints"] 

220 

221 if len(overrides) != len(self.datastores):  [branch 221 ↛ 222 not taken: condition never true] 

222 raise DatastoreValidationError( 

223 f"Number of registered datastores ({len(self.datastores)})" 

224 " differs from number of constraints overrides" 

225 f" {len(overrides)}" 

226 ) 

227 

228 self.datastoreConstraints = [ 

229 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

230 ] 

231 

232 else: 

233 self.datastoreConstraints = (None,) * len(self.datastores) 

234 

235 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

236 

237 @property 

238 def names(self) -> tuple[str, ...]: 

239 return tuple(self._names) 

240 

241 @property 

242 def roots(self) -> dict[str, ResourcePath | None]: 

243 # Docstring inherited. 

244 roots = {} 

245 for datastore in self.datastores: 

246 roots.update(datastore.roots) 

247 return roots 

248 

249 def __str__(self) -> str: 

250 chainName = ", ".join(str(ds) for ds in self.datastores) 

251 return chainName 

252 

253 def knows(self, ref: DatasetRef) -> bool: 

254 """Check if the dataset is known to any of the datastores. 

255 

256 Does not check for existence of any artifact. 

257 

258 Parameters 

259 ---------- 

260 ref : `DatasetRef` 

261 Reference to the required dataset. 

262 

263 Returns 

264 ------- 

265 exists : `bool` 

266 `True` if the dataset is known to the datastore. 

267 """ 

268 for datastore in self.datastores: 

269 if datastore.knows(ref): 

270 log.debug("%s known to datastore %s", ref, datastore.name) 

271 return True 

272 return False 

273 

274 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

275 # Docstring inherited from the base class. 

276 refs_known: dict[DatasetRef, bool] = {} 

277 for datastore in self.datastores: 

278 refs_known.update(datastore.knows_these(refs)) 

279 

280 # No need to check in next datastore for refs that are known. 

281 # We only update entries that were initially False. 

282 refs = [ref for ref, known in refs_known.items() if not known] 

283 

284 return refs_known 

285 

286 def mexists( 

287 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

288 ) -> dict[DatasetRef, bool]: 

289 """Check the existence of multiple datasets at once. 

290 

291 Parameters 

292 ---------- 

293 refs : iterable of `DatasetRef` 

294 The datasets to be checked. 

295 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

296 Optional mapping of datastore artifact to existence. Updated by 

297 this method with details of all artifacts tested. Can be `None` 

298 if the caller is not interested. 

299 

300 Returns 

301 ------- 

302 existence : `dict` [`DatasetRef`, `bool`] 

303 Mapping from dataset to boolean indicating existence in any 

304 of the child datastores. 

305 """ 

306 dataset_existence: dict[DatasetRef, bool] = {} 

307 for datastore in self.datastores: 

308 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

309 

310 # For next datastore no point asking about ones we know 

311 # exist already. No special exemption for ephemeral datastores. 

312 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

313 

314 return dataset_existence 

315 

316 def exists(self, ref: DatasetRef) -> bool: 

317 """Check if the dataset exists in one of the datastores. 

318 

319 Parameters 

320 ---------- 

321 ref : `DatasetRef` 

322 Reference to the required dataset. 

323 

324 Returns 

325 ------- 

326 exists : `bool` 

327 `True` if the entity exists in one of the child datastores. 

328 """ 

329 for datastore in self.datastores: 

330 if datastore.exists(ref): 

331 log.debug("Found %s in datastore %s", ref, datastore.name) 

332 return True 

333 return False 

334 

335 def get( 

336 self, 

337 ref: DatasetRef, 

338 parameters: Mapping[str, Any] | None = None, 

339 storageClass: StorageClass | str | None = None, 

340 ) -> Any: 

341 """Load an InMemoryDataset from the store. 

342 

343 The dataset is returned from the first datastore that has 

344 the dataset. 

345 

346 Parameters 

347 ---------- 

348 ref : `DatasetRef` 

349 Reference to the required Dataset. 

350 parameters : `dict` 

351 `StorageClass`-specific parameters that specify, for example, 

352 a slice of the dataset to be loaded. 

353 storageClass : `StorageClass` or `str`, optional 

354 The storage class to be used to override the Python type 

355 returned by this method. By default the returned type matches 

356 the dataset type definition for this dataset. Specifying a 

357 read `StorageClass` can force a different type to be returned. 

358 This type must be compatible with the original type. 

359 

360 Returns 

361 ------- 

362 inMemoryDataset : `object` 

363 Requested dataset or slice thereof as an InMemoryDataset. 

364 

365 Raises 

366 ------ 

367 FileNotFoundError 

368 Requested dataset can not be retrieved. 

369 TypeError 

370 Return value from formatter has unexpected type. 

371 ValueError 

372 Formatter failed to process the dataset. 

373 """ 

374 for datastore in self.datastores: 

375 try: 

376 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

377 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

378 return inMemoryObject 

379 except FileNotFoundError: 

380 pass 

381 

382 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 
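
# Editor's illustrative sketch (not part of the module): read fall-through.
# Assuming ``chain`` is a configured ChainedDatastore and ``ref`` is a resolved
# DatasetRef:
#
#     try:
#         obj = chain.get(ref)  # children are tried in order; first hit wins
#     except FileNotFoundError:
#         ...  # no child datastore could provide the dataset
#
# Any ``parameters`` or ``storageClass`` arguments are simply forwarded to each
# child's get() until one of them succeeds.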

383 

384 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

385 """Write a InMemoryDataset with a given `DatasetRef` to each 

386 datastore. 

387 

388 The put() to child datastores can fail with 

389 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

390 deemed to have succeeded so long as at least one child datastore 

391 accepted the inMemoryDataset. 

392 

393 Parameters 

394 ---------- 

395 inMemoryDataset : `object` 

396 The dataset to store. 

397 ref : `DatasetRef` 

398 Reference to the associated Dataset. 

399 

400 Raises 

401 ------ 

402 TypeError 

403 Supplied object and storage class are inconsistent. 

404 DatasetTypeNotSupportedError 

405 All datastores reported `DatasetTypeNotSupportedError`. 

406 """ 

407 log.debug("Put %s", ref) 

408 

409 # Confirm that we can accept this dataset 

410 if not self.constraints.isAcceptable(ref): 

411 # Raise rather than use boolean return value. 

412 raise DatasetTypeNotSupportedError( 

413 f"Dataset {ref} has been rejected by this datastore via configuration." 

414 ) 

415 

416 isPermanent = False 

417 nsuccess = 0 

418 npermanent = 0 

419 nephemeral = 0 

420 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

421 if ( 

422 constraints is not None and not constraints.isAcceptable(ref) 

423 ) or not datastore.constraints.isAcceptable(ref): 

424 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

425 continue 

426 

427 if datastore.isEphemeral: 

428 nephemeral += 1 

429 else: 

430 npermanent += 1 

431 try: 

432 datastore.put(inMemoryDataset, ref) 

433 nsuccess += 1 

434 if not datastore.isEphemeral: 

435 isPermanent = True 

436 except DatasetTypeNotSupportedError: 

437 pass 

438 

439 if nsuccess == 0: 

440 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

441 

442 if not isPermanent and npermanent > 0:  [branch 442 ↛ 443 not taken: condition never true] 

443 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

444 

445 if self._transaction is not None: 

446 self._transaction.registerUndo("put", self.remove, ref) 
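
# Editor's note (illustrative, not part of the module): put() fans out to every
# child whose constraints accept the ref. For example, if a permanent
# file-backed child raises DatasetTypeNotSupportedError from its put() while an
# in-memory child succeeds, nsuccess is 1, npermanent is 1, isPermanent stays
# False, and the "ephemeral only" warning above is emitted; if no child accepts
# the dataset at all, DatasetTypeNotSupportedError is raised instead.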

447 

448 def put_new(self, inMemoryDataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

449 # Docstring inherited from base class. 

450 log.debug("Put %s", ref) 

451 

452 # Confirm that we can accept this dataset 

453 if not self.constraints.isAcceptable(ref): 

454 # Raise rather than use boolean return value. 

455 raise DatasetTypeNotSupportedError( 

456 f"Dataset {ref} has been rejected by this datastore via configuration." 

457 ) 

458 

459 isPermanent = False 

460 nsuccess = 0 

461 npermanent = 0 

462 nephemeral = 0 

463 stored_refs: dict[str, DatasetRef] = {} 

464 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

465 if ( 

466 constraints is not None and not constraints.isAcceptable(ref) 

467 ) or not datastore.constraints.isAcceptable(ref): 

468 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

469 continue 

470 

471 if datastore.isEphemeral: 

472 nephemeral += 1 

473 else: 

474 npermanent += 1 

475 try: 

476 stored_ref_map = datastore.put_new(inMemoryDataset, ref) 

477 stored_refs.update(stored_ref_map) 

478 nsuccess += 1 

479 if not datastore.isEphemeral: 

480 isPermanent = True 

481 except DatasetTypeNotSupportedError: 

482 pass 

483 

484 if nsuccess == 0: 

485 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

486 

487 if not isPermanent and npermanent > 0: 

488 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

489 

490 if self._transaction is not None: 

491 self._transaction.registerUndo("put", self.remove, ref) 

492 

493 return stored_refs 

494 

495 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

496 # Docstring inherited from base class. 

497 if transfer != "auto": 

498 return transfer 

499 # Ask each datastore what they think auto means 

500 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

501 

502 # Remove any untranslated "auto" values 

503 transfers.discard(transfer) 

504 

505 if len(transfers) == 1:  [branch 505 ↛ 506 not taken: condition never true] 

506 return transfers.pop() 

507 if not transfers:  [branch 507 ↛ 511 not taken: condition never false] 

508 # Everything reported "auto" 

509 return transfer 

510 

511 raise RuntimeError( 

512 "Chained datastore does not yet support different transfer modes" 

513 f" from 'auto' in each child datastore (wanted {transfers})" 

514 ) 

515 

516 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

517 # Docstring inherited from Datastore._prepIngest. 

518 if transfer is None: 

519 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

520 

521 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

522 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

523 if not acceptable: 

524 log.debug( 

525 "Datastore %s skipping ingest via configuration for refs %s", 

526 name, 

527 ", ".join(str(ref) for ref in dataset.refs), 

528 ) 

529 return False 

530 else: 

531 return True 

532 

533 # Filter down to just datasets the chained datastore's own 

534 # configuration accepts. 

535 okForParent: list[FileDataset] = [ 

536 dataset 

537 for dataset in datasets 

538 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

539 ] 

540 

541 # Iterate over nested datastores and call _prepIngest on each. 

542 # Save the results to a list: 

543 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

544 # ...and remember whether all of the failures are due to 

545 # NotImplementedError being raised. 

546 allFailuresAreNotImplementedError = True 

547 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

548 okForChild: list[FileDataset] 

549 if constraints is not None: 

550 okForChild = [ 

551 dataset 

552 for dataset in okForParent 

553 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

554 ] 

555 else: 

556 okForChild = okForParent 

557 try: 

558 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

559 except NotImplementedError: 

560 log.debug( 

561 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

562 datastore.name, 

563 transfer, 

564 ) 

565 continue 

566 allFailuresAreNotImplementedError = False 

567 if okForChild: 

568 # Do not store for later if a datastore has rejected 

569 # everything. 

570 # Include the source paths if this is a "move". It's clearer 

571 # to find the paths now rather than try to infer how 

572 # each datastore has stored them in the internal prep class. 

573 paths = ( 

574 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set() 

575 ) 

576 children.append((datastore, prepDataForChild, paths)) 

577 if allFailuresAreNotImplementedError: 

578 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

579 return _IngestPrepData(children=children) 

580 

581 def _finishIngest( 

582 self, 

583 prepData: _IngestPrepData, 

584 *, 

585 transfer: str | None = None, 

586 record_validation_info: bool = True, 

587 ) -> None: 

588 # Docstring inherited from Datastore._finishIngest. 

589 # For "move" we must use "copy" and then delete the input 

590 # data at the end. This has no rollback option if the ingest 

591 # subsequently fails. If there is only one active datastore 

592 # accepting any files we can leave it as "move" 

593 actual_transfer: str | None 

594 if transfer == "move" and len(prepData.children) > 1: 

595 actual_transfer = "copy" 

596 else: 

597 actual_transfer = transfer 

598 to_be_deleted: set[ResourcePath] = set() 

599 for datastore, prepDataForChild, paths in prepData.children: 

600 datastore._finishIngest( 

601 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

602 ) 

603 to_be_deleted.update(paths) 

604 if actual_transfer != transfer: 

605 # These datasets were copied but now need to be deleted. 

606 # This can not be rolled back. 

607 for uri in to_be_deleted: 

608 uri.remove() 
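
# Editor's note (illustrative, not part of the module): with transfer="move"
# and more than one accepting child, the chain ingests each file with "copy"
# and only afterwards removes the source URIs gathered in _prepIngest, e.g. a
# hypothetical /staging/exposure.fits is copied into every accepting child and
# then deleted from /staging, with no rollback if a later step fails.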

609 

610 def getManyURIs( 

611 self, 

612 refs: Iterable[DatasetRef], 

613 predict: bool = False, 

614 allow_missing: bool = False, 

615 ) -> dict[DatasetRef, DatasetRefURIs]: 

616 # Docstring inherited 

617 

618 uris: dict[DatasetRef, DatasetRefURIs] = {} 

619 missing_refs = set(refs) 

620 

621 # If predict is True we don't want to predict a dataset in the first 

622 # datastore if it actually exists in a later datastore, so in that 

623 # case check all datastores with predict=False first, and then try 

624 # again with predict=True. 

625 for p in (False, True) if predict else (False,): 

626 if not missing_refs: 

627 break 

628 for datastore in self.datastores: 

629 try: 

630 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

631 except NotImplementedError: 

632 # some datastores may not implement generating URIs 

633 continue 

634 missing_refs -= got_uris.keys() 

635 uris.update(got_uris) 

636 if not missing_refs: 

637 break 

638 

639 if missing_refs and not allow_missing: 

640 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

641 

642 return uris 

643 

644 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

645 """Return URIs associated with dataset. 

646 

647 Parameters 

648 ---------- 

649 ref : `DatasetRef` 

650 Reference to the required dataset. 

651 predict : `bool`, optional 

652 If the datastore does not know about the dataset, should it 

653 return a predicted URI or not? 

654 

655 Returns 

656 ------- 

657 uris : `DatasetRefURIs` 

658 The URI to the primary artifact associated with this dataset (if 

659 the dataset was disassembled within the datastore this may be 

660 `None`), and the URIs to any components associated with the dataset 

661 artifact (this can be empty if there are no components). 

662 

663 Notes 

664 ----- 

665 The returned URI is from the first datastore in the list that has 

666 the dataset with preference given to the first dataset coming from 

667 a permanent datastore. If no datastores have the dataset and prediction 

668 is allowed, the predicted URI for the first datastore in the list will 

669 be returned. 

670 """ 

671 log.debug("Requesting URIs for %s", ref) 

672 predictedUri: DatasetRefURIs | None = None 

673 predictedEphemeralUri: DatasetRefURIs | None = None 

674 firstEphemeralUri: DatasetRefURIs | None = None 

675 for datastore in self.datastores: 

676 if datastore.exists(ref): 

677 if not datastore.isEphemeral: 

678 uri = datastore.getURIs(ref) 

679 log.debug("Retrieved non-ephemeral URI: %s", uri) 

680 return uri 

681 elif not firstEphemeralUri: 

682 firstEphemeralUri = datastore.getURIs(ref) 

683 elif predict: 

684 if not predictedUri and not datastore.isEphemeral: 

685 predictedUri = datastore.getURIs(ref, predict) 

686 elif not predictedEphemeralUri and datastore.isEphemeral: 

687 predictedEphemeralUri = datastore.getURIs(ref, predict) 

688 

689 if firstEphemeralUri: 

690 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

691 return firstEphemeralUri 

692 

693 if predictedUri: 

694 log.debug("Retrieved predicted URI: %s", predictedUri) 

695 return predictedUri 

696 

697 if predictedEphemeralUri: 

698 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

699 return predictedEphemeralUri 

700 

701 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 

702 

703 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

704 """URI to the Dataset. 

705 

706 The returned URI is from the first datastore in the list that has 

707 the dataset with preference given to the first dataset coming from 

708 a permanent datastore. If no datastores have the dataset and prediction 

709 is allowed, the predicted URI for the first datastore in the list will 

710 be returned. 

711 

712 Parameters 

713 ---------- 

714 ref : `DatasetRef` 

715 Reference to the required Dataset. 

716 predict : `bool` 

717 If `True`, allow URIs to be returned of datasets that have not 

718 been written. 

719 

720 Returns 

721 ------- 

722 uri : `lsst.resources.ResourcePath` 

723 URI pointing to the dataset within the datastore. If the 

724 dataset does not exist in the datastore, and if ``predict`` is 

725 `True`, the URI will be a prediction and will include a URI 

726 fragment "#predicted". 

727 

728 Notes 

729 ----- 

730 If the datastore does not have entities that relate well 

731 to the concept of a URI the returned URI string will be 

732 descriptive. The returned URI is not guaranteed to be obtainable. 

733 

734 Raises 

735 ------ 

736 FileNotFoundError 

737 A URI has been requested for a dataset that does not exist and 

738 guessing is not allowed. 

739 RuntimeError 

740 Raised if a request is made for a single URI but multiple URIs 

741 are associated with this dataset. 

742 """ 

743 log.debug("Requesting URI for %s", ref) 

744 primary, components = self.getURIs(ref, predict) 

745 if primary is None or components:  [branch 745 ↛ 746 not taken: condition never true] 

746 raise RuntimeError( 

747 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

748 ) 

749 return primary 

750 

751 def retrieveArtifacts( 

752 self, 

753 refs: Iterable[DatasetRef], 

754 destination: ResourcePath, 

755 transfer: str = "auto", 

756 preserve_path: bool = True, 

757 overwrite: bool = False, 

758 ) -> list[ResourcePath]: 

759 """Retrieve the file artifacts associated with the supplied refs. 

760 

761 Parameters 

762 ---------- 

763 refs : iterable of `DatasetRef` 

764 The datasets for which file artifacts are to be retrieved. 

765 A single ref can result in multiple files. The refs must 

766 be resolved. 

767 destination : `lsst.resources.ResourcePath` 

768 Location to write the file artifacts. 

769 transfer : `str`, optional 

770 Method to use to transfer the artifacts. Must be one of the options 

771 supported by `lsst.resources.ResourcePath.transfer_from()`. 

772 "move" is not allowed. 

773 preserve_path : `bool`, optional 

774 If `True` the full path of the file artifact within the datastore 

775 is preserved. If `False` the final file component of the path 

776 is used. 

777 overwrite : `bool`, optional 

778 If `True` allow transfers to overwrite existing files at the 

779 destination. 

780 

781 Returns 

782 ------- 

783 targets : `list` of `lsst.resources.ResourcePath` 

784 URIs of file artifacts in destination location. Order is not 

785 preserved. 

786 """ 

787 if not destination.isdir():  [branch 787 ↛ 788 not taken: condition never true] 

788 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

789 

790 # Using getURIs is not feasible since it becomes difficult to 

791 # determine the path within the datastore later on. For now 

792 # follow getURIs implementation approach. 

793 

794 pending = set(refs) 

795 

796 # There is a question as to whether an exception should be raised 

797 # early if some of the refs are missing, or whether files should be 

798 # transferred until a problem is hit. Prefer to complain up front. 

799 # Use the datastore integer as primary key. 

800 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

801 

802 for number, datastore in enumerate(self.datastores): 

803 if datastore.isEphemeral: 

804 # In the future we will want to distinguish in-memory from 

805 # caching datastore since using an on-disk local 

806 # cache is exactly what we should be doing. 

807 continue 

808 try: 

809 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

810 except NotImplementedError: 

811 # Some datastores may not support retrieving artifacts 

812 continue 

813 

814 if datastore_refs: 

815 grouped_by_datastore[number] = datastore_refs 

816 

817 # Remove these from the pending list so that we do not bother 

818 # looking for them any more. 

819 pending = pending - datastore_refs 

820 

821 if pending:  [branch 821 ↛ 822 not taken: condition never true] 

822 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

823 

824 # Now do the transfer. 

825 targets: list[ResourcePath] = [] 

826 for number, datastore_refs in grouped_by_datastore.items(): 

827 targets.extend( 

828 self.datastores[number].retrieveArtifacts( 

829 datastore_refs, 

830 destination, 

831 transfer=transfer, 

832 preserve_path=preserve_path, 

833 overwrite=overwrite, 

834 ) 

835 ) 

836 

837 return targets 

838 

839 def remove(self, ref: DatasetRef) -> None: 

840 """Indicate to the datastore that a dataset can be removed. 

841 

842 The dataset will be removed from each datastore. The dataset is 

843 not required to exist in every child datastore. 

844 

845 Parameters 

846 ---------- 

847 ref : `DatasetRef` 

848 Reference to the required dataset. 

849 

850 Raises 

851 ------ 

852 FileNotFoundError 

853 Attempt to remove a dataset that does not exist. Raised if none 

854 of the child datastores removed the dataset. 

855 """ 

856 log.debug("Removing %s", ref) 

857 self.trash(ref, ignore_errors=False) 

858 self.emptyTrash(ignore_errors=False) 

859 

860 def forget(self, refs: Iterable[DatasetRef]) -> None: 

861 for datastore in tuple(self.datastores): 

862 datastore.forget(refs) 

863 

864 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

865 if isinstance(ref, DatasetRef): 

866 ref_label = str(ref) 

867 else: 

868 ref_label = "bulk datasets" 

869 

870 log.debug("Trashing %s", ref_label) 

871 

872 counter = 0 

873 for datastore in self.datastores: 

874 try: 

875 datastore.trash(ref, ignore_errors=ignore_errors) 

876 counter += 1 

877 except FileNotFoundError: 

878 pass 

879 

880 if counter == 0: 

881 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

882 if ignore_errors:  [branch 882 ↛ 883 not taken: condition never true] 

883 log.warning(err_msg) 

884 else: 

885 raise FileNotFoundError(err_msg) 

886 

887 def emptyTrash(self, ignore_errors: bool = True) -> None: 

888 for datastore in self.datastores: 

889 datastore.emptyTrash(ignore_errors=ignore_errors) 

890 

891 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

892 """Retrieve a dataset from an input `Datastore`, 

893 and store the result in this `Datastore`. 

894 

895 Parameters 

896 ---------- 

897 inputDatastore : `Datastore` 

898 The external `Datastore` from which to retrieve the Dataset. 

899 ref : `DatasetRef` 

900 Reference to the required dataset in the input data store. 

901 

902 Notes 

903 ----- 

904 The dataset is retrieved from ``inputDatastore`` and then stored in 

905 every child datastore via `put`. This method does not return 

906 anything. 

907 """ 

908 assert inputDatastore is not self # unless we want it for renames? 

909 inMemoryDataset = inputDatastore.get(ref) 

910 self.put(inMemoryDataset, ref) 

911 

912 def validateConfiguration( 

913 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

914 ) -> None: 

915 """Validate some of the configuration for this datastore. 

916 

917 Parameters 

918 ---------- 

919 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

920 Entities to test against this configuration. Can be differing 

921 types. 

922 logFailures : `bool`, optional 

923 If `True`, output a log message for every validation error 

924 detected. 

925 

926 Raises 

927 ------ 

928 DatastoreValidationError 

929 Raised if there is a validation problem with a configuration. 

930 All the problems are reported in a single exception. 

931 

932 Notes 

933 ----- 

934 This method checks each datastore in turn. 

935 """ 

936 # Need to catch each of the datastore outputs and ensure that 

937 # all are tested. 

938 failures = [] 

939 for datastore in self.datastores: 

940 try: 

941 datastore.validateConfiguration(entities, logFailures=logFailures) 

942 except DatastoreValidationError as e: 

943 if logFailures:  [branch 943 ↛ 945 not taken: condition never false] 

944 log.critical("Datastore %s failed validation", datastore.name) 

945 failures.append(f"Datastore {self.name}: {e}") 

946 

947 if failures: 

948 msg = ";\n".join(failures) 

949 raise DatastoreValidationError(msg) 

950 

951 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

952 # Docstring is inherited from base class 

953 failures = [] 

954 for datastore in self.datastores: 

955 try: 

956 datastore.validateKey(lookupKey, entity) 

957 except DatastoreValidationError as e: 

958 failures.append(f"Datastore {self.name}: {e}") 

959 

960 if failures: 

961 msg = ";\n".join(failures) 

962 raise DatastoreValidationError(msg) 

963 

964 def getLookupKeys(self) -> set[LookupKey]: 

965 # Docstring is inherited from base class 

966 keys = set() 

967 for datastore in self.datastores: 

968 keys.update(datastore.getLookupKeys()) 

969 

970 keys.update(self.constraints.getLookupKeys()) 

971 for p in self.datastoreConstraints: 

972 if p is not None:  [branch 972 ↛ 971 not taken: condition never false] 

973 keys.update(p.getLookupKeys()) 

974 

975 return keys 

976 

977 def needs_expanded_data_ids( 

978 self, 

979 transfer: str | None, 

980 entity: DatasetRef | DatasetType | StorageClass | None = None, 

981 ) -> bool: 

982 # Docstring inherited. 

983 # We can't safely use `self.datastoreConstraints` with `entity` to 

984 # check whether a child datastore would even want to ingest this 

985 # dataset, because we don't want to filter out datastores that might 

986 need an expanded data ID based on incomplete information (e.g. we 

987 # pass a StorageClass, but the constraint dispatches on DatasetType). 

988 # So we pessimistically check if any datastore would need an expanded 

989 # data ID for this transfer mode. 

990 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

991 

992 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

993 # Docstring inherited from the base class. 

994 

995 for datastore in self.datastores: 

996 datastore.import_records(data) 

997 

998 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

999 # Docstring inherited from the base class. 

1000 

1001 all_records: dict[str, DatastoreRecordData] = {} 

1002 

1003 # Merge all sub-datastore records into one structure 

1004 for datastore in self.datastores: 

1005 sub_records = datastore.export_records(refs) 

1006 for name, record_data in sub_records.items(): 

1007 # All datastore names must be unique in a chain. 

1008 if name in all_records:  [branch 1008 ↛ 1009 not taken: condition never true] 

1009 raise ValueError(f"Non-unique datastore name found in datastore {datastore}") 

1010 all_records[name] = record_data 

1011 

1012 return all_records 

1013 

1014 def export( 

1015 self, 

1016 refs: Iterable[DatasetRef], 

1017 *, 

1018 directory: ResourcePathExpression | None = None, 

1019 transfer: str | None = "auto", 

1020 ) -> Iterable[FileDataset]: 

1021 # Docstring inherited from Datastore.export. 

1022 if transfer == "auto" and directory is None: 

1023 transfer = None 

1024 

1025 if transfer is not None and directory is None: 

1026 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

1027 

1028 if transfer == "move": 

1029 raise TypeError("Can not export by moving files out of datastore.") 

1030 

1031 # Exporting from a chain has the potential for a dataset to be 

1032 # in one or more of the datastores in the chain. We only need one 

1033 # of them since we assume the datasets are the same in all (but 

1034 # the file format could be different of course since that is a 

1035 # per-datastore configuration). 

1036 # We also do not know whether any of the datastores in the chain 

1037 # support file export. 

1038 

1039 # Ensure we have an ordered sequence that is not an iterator or set. 

1040 if not isinstance(refs, Sequence): 

1041 refs = list(refs) 

1042 

1043 # If any of the datasets are missing entirely we need to raise early 

1044 # before we try to run the export. This can be a little messy but is 

1045 # better than exporting files from the first datastore and only then 

1046 # discovering that a missing dataset is not in the second datastore either. 

1047 known = [datastore.knows_these(refs) for datastore in self.datastores] 

1048 refs_known: set[DatasetRef] = set() 

1049 for known_to_this in known: 

1050 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

1051 missing_count = len(refs) - len(refs_known) 

1052 if missing_count: 

1053 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1054 

1055 # To allow us to slot each result into the right place after 

1056 # asking each datastore, create a dict with the index. 

1057 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1058 

1059 # Presize the final export list. 

1060 exported: list[FileDataset | None] = [None] * len(refs) 

1061 

1062 # The order of the returned dataset has to match the order of the 

1063 # given refs, even if they are all from different datastores. 

1064 for i, datastore in enumerate(self.datastores): 

1065 known_to_this = known[i] 

1066 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1067 

1068 try: 

1069 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1070 except NotImplementedError: 

1071 # Try the next datastore. 

1072 continue 

1073 

1074 for ref, export in zip(filtered, this_export, strict=True): 

1075 # Get the position and also delete it from the list. 

1076 exported[ref_positions.pop(ref)] = export 

1077 

1078 # Every dataset should be accounted for because of the earlier checks 

1079 # but make sure that we did fill all the slots to appease mypy. 

1080 for i, dataset in enumerate(exported): 

1081 if dataset is None:  [branch 1081 ↛ 1082 not taken: condition never true] 

1082 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1083 yield dataset 

1084 

1085 def transfer_from( 

1086 self, 

1087 source_datastore: Datastore, 

1088 refs: Iterable[DatasetRef], 

1089 transfer: str = "auto", 

1090 artifact_existence: dict[ResourcePath, bool] | None = None, 

1091 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1092 # Docstring inherited 

1093 # mypy does not understand "type(self) is not type(source)" 

1094 if isinstance(source_datastore, ChainedDatastore): 

1095 # Both the source and destination are chained datastores. 

1096 source_datastores = tuple(source_datastore.datastores) 

1097 else: 

1098 # The source datastore is different, forward everything to the 

1099 # child datastores. 

1100 source_datastores = (source_datastore,) 

1101 

1102 # Need to know the set of all possible refs that could be transferred. 

1103 remaining_refs = set(refs) 

1104 

1105 missing_from_source: set[DatasetRef] | None = None 

1106 all_accepted = set() 

1107 nsuccess = 0 

1108 for source_child in source_datastores: 

1109 # If we are reading from a chained datastore, it's possible that 

1110 # only a subset of the datastores know about the dataset. We can't 

1111 # ask the receiving datastore to copy it when it doesn't exist 

1112 # so we have to filter again based on what the source datastore 

1113 # understands. 

1114 known_to_source = source_child.knows_these(list(refs)) 

1115 

1116 # Need to know that there is a possibility that some of these 

1117 # datasets exist but are unknown to the source datastore if 

1118 # trust is enabled. 

1119 if getattr(source_child, "trustGetRequest", False): 

1120 unknown = [ref for ref, known in known_to_source.items() if not known] 

1121 existence = source_child.mexists(unknown, artifact_existence) 

1122 for ref, exists in existence.items(): 

1123 known_to_source[ref] = exists 

1124 

1125 missing = {ref for ref, known in known_to_source.items() if not known} 

1126 if missing: 

1127 if missing_from_source is None: 

1128 missing_from_source = missing 

1129 else: 

1130 missing_from_source &= missing 

1131 

1132 # Try to transfer from each source datastore to each child 

1133 # datastore. Have to make sure we don't transfer something 

1134 # we've already transferred to this destination on later passes. 

1135 

1136 # Filter the initial list based on the datasets we have 

1137 # not yet transferred. 

1138 these_refs = [] 

1139 for ref in refs: 

1140 if ref in remaining_refs and known_to_source[ref]: 

1141 these_refs.append(ref) 

1142 

1143 if not these_refs: 

1144 # Already transferred all datasets known to this datastore. 

1145 continue 

1146 

1147 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

1148 if constraints is not None:  [branch 1148 ↛ 1156 not taken: condition never false] 

1149 filtered_refs = [] 

1150 for ref in these_refs: 

1151 if constraints.isAcceptable(ref): 

1152 filtered_refs.append(ref) 

1153 else: 

1154 log.debug("Rejecting ref by constraints: %s", ref) 

1155 else: 

1156 filtered_refs = list(these_refs) 

1157 try: 

1158 accepted, _ = datastore.transfer_from( 

1159 source_child, filtered_refs, transfer, artifact_existence 

1160 ) 

1161 except (TypeError, NotImplementedError): 

1162 # The datastores were incompatible. 

1163 continue 

1164 else: 

1165 nsuccess += 1 

1166 

1167 # Remove the accepted datasets from those remaining. 

1168 remaining_refs = remaining_refs - accepted 

1169 

1170 # Keep track of everything we have accepted. 

1171 all_accepted.update(accepted) 

1172 

1173 if missing_from_source: 

1174 for ref in missing_from_source: 

1175 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1176 

1177 if nsuccess == 0:  [branch 1177 ↛ 1178 not taken: condition never true] 

1178 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1179 

1180 return all_accepted, remaining_refs 

1181 

1182 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

1183 # Docstring inherited from the base class. 

1184 tables: dict[str, DatastoreOpaqueTable] = {} 

1185 for datastore in self.datastores: 

1186 tables.update(datastore.get_opaque_table_definitions()) 

1187 return tables