Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%

414 statements  

coverage.py v7.2.7, created at 2023-06-02 02:15 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Chained datastore.""" 

25 

26__all__ = ("ChainedDatastore",) 

27 

28import itertools 

29import logging 

30import time 

31import warnings 

32from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union 

33 

34from lsst.daf.butler import ( 

35 Constraints, 

36 DatasetRef, 

37 DatasetRefURIs, 

38 DatasetTypeNotSupportedError, 

39 Datastore, 

40 DatastoreConfig, 

41 DatastoreRecordData, 

42 DatastoreValidationError, 

43 FileDataset, 

44) 

45from lsst.resources import ResourcePath 

46from lsst.utils import doImportType 

47 

48if TYPE_CHECKING: 

49 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

51 from lsst.resources import ResourcePathExpression 

52 

53log = logging.getLogger(__name__) 

54 

55 

56class _IngestPrepData(Datastore.IngestPrepData): 

57 """Helper class for ChainedDatastore ingest implementation. 

58 

59 Parameters 

60 ---------- 

61 children : `list` of `tuple` 

62 Triples of (`Datastore`, `IngestPrepData`, `set` of `ResourcePath`) for all child datastores. 

63 """ 

64 

65 def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

66 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

67 self.children = children 

68 

69 

70class ChainedDatastore(Datastore): 

71 """Chained Datastores to allow read and writes from multiple datastores. 

72 

73 A ChainedDatastore is configured with multiple datastore configurations. 

74 A ``put()`` is sent to each child datastore that accepts the dataset. 

75 A ``get()`` operation is tried on each datastore in turn and the first 

76 datastore to return a valid dataset is used. 

77 

78 Parameters 

79 ---------- 

80 config : `DatastoreConfig` or `str` 

81 Configuration. This configuration must include a ``datastores`` field 

82 as a sequence of datastore configurations. The order in this sequence 

83 indicates the order to use for read operations. 

84 bridgeManager : `DatastoreRegistryBridgeManager` 

85 Object that manages the interface between `Registry` and datastores. 

86 butlerRoot : `str`, optional 

87 New datastore root to use to override the configuration value. This 

88 root is sent to each child datastore. 

89 

90 Notes 

91 ----- 

92 ChainedDatastore never supports `None` as an `ingest` transfer mode. 

93 It supports `"copy"`, `"move"`, `"symlink"`, `"relsymlink"` 

94 and `"hardlink"` as long as at least one of its child datastores does. 

95 """ 

96 
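# Illustrative sketch (editor addition): the shape of the configuration this
# class expects, expressed as the Python mapping a DatastoreConfig would hold.
# The child class paths and the "<butlerRoot>" placeholder are assumptions for
# illustration, not taken from a real repository configuration.
_EXAMPLE_CHAIN_CONFIG = {
    "cls": "lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore",
    "datastores": [
        {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"},
        {"cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore", "root": "<butlerRoot>"},
    ],
}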

97 defaultConfigFile = "datastores/chainedDatastore.yaml" 

98 """Path to configuration defaults. Accessed within the ``configs`` resource 

99 or relative to a search path. Can be None if no defaults specified. 

100 """ 

101 

102 containerKey = "datastores" 

103 """Key to specify where child datastores are configured.""" 

104 

105 datastores: List[Datastore] 

106 """All the child datastores known to this datastore.""" 

107 

108 datastoreConstraints: Sequence[Optional[Constraints]] 

109 """Constraints to be applied to each of the child datastores.""" 

110 

111 @classmethod 

112 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

113 """Set any filesystem-dependent config options for child Datastores to 

114 be appropriate for a new empty repository with the given root. 

115 

116 Parameters 

117 ---------- 

118 root : `str` 

119 Filesystem path to the root of the data repository. 

120 config : `Config` 

121 A `Config` to update. Only the subset understood by 

122 this component will be updated. Will not expand 

123 defaults. 

124 full : `Config` 

125 A complete config with all defaults expanded that can be 

126 converted to a `DatastoreConfig`. Read-only and will not be 

127 modified by this method. 

128 Repository-specific options that should not be obtained 

129 from defaults when Butler instances are constructed 

130 should be copied from ``full`` to ``config``. 

131 overwrite : `bool`, optional 

132 If `False`, do not modify a value in ``config`` if the value 

133 already exists. Default is always to overwrite with the provided 

134 ``root``. 

135 

136 Notes 

137 ----- 

138 If a keyword is explicitly defined in the supplied ``config`` it 

139 will not be overridden by this method if ``overwrite`` is `False`. 

140 This allows explicit values set in external configs to be retained. 

141 """ 

142 

143 # Extract the part of the config we care about updating 

144 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

145 

146 # And the subset of the full config that we can use for reference. 

147 # Do not bother with defaults because we are told this already has 

148 # them. 

149 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

150 

151 # Loop over each datastore config and pass the subsets to the 

152 # child datastores to process. 

153 

154 containerKey = cls.containerKey 

155 for idx, (child, fullChild) in enumerate( 

156 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey]) 

157 ): 

158 childConfig = DatastoreConfig(child, mergeDefaults=False) 

159 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

160 datastoreClass = doImportType(fullChildConfig["cls"]) 

161 if not issubclass(datastoreClass, Datastore):    [161 ↛ 162: the condition on line 161 was never true]

162 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

163 newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx) 

164 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

165 

166 # Reattach to parent 

167 datastoreConfig[containerKey, idx] = childConfig 

168 

169 # Reattach modified datastore config to parent 

170 # If this has a datastore key we attach there, otherwise we assume 

171 # this information goes at the top of the config hierarchy. 

172 if DatastoreConfig.component in config: 

173 config[DatastoreConfig.component] = datastoreConfig 

174 else: 

175 config.update(datastoreConfig) 

176 

177 return 

178 
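# Illustrative sketch (editor addition): each child datastore is given its own
# root beneath the repository root, named after its class and position in the
# chain. "/repo" and the class names are hypothetical.
_example_root = "/repo"
_example_classes = ["FileDatastore", "FileDatastore"]
_example_child_roots = [
    "{}/{}_{}".format(_example_root, cls_name, idx) for idx, cls_name in enumerate(_example_classes)
]
# -> ["/repo/FileDatastore_0", "/repo/FileDatastore_1"]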

179 def __init__( 

180 self, 

181 config: Union[Config, str], 

182 bridgeManager: DatastoreRegistryBridgeManager, 

183 butlerRoot: str | None = None, 

184 ): 

185 super().__init__(config, bridgeManager) 

186 

187 # Scan for child datastores and instantiate them with the same registry 

188 self.datastores = [] 

189 for c in self.config["datastores"]: 

190 c = DatastoreConfig(c) 

191 datastoreType = doImportType(c["cls"]) 

192 if not issubclass(datastoreType, Datastore):    [192 ↛ 193: the condition on line 192 was never true]

193 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

194 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

195 log.debug("Creating child datastore %s", datastore.name) 

196 self.datastores.append(datastore) 

197 

198 # Name ourself based on our children 

199 if self.datastores:    [199 ↛ 204: the condition on line 199 was never false]

200 # We must set the names explicitly 

201 self._names = [d.name for d in self.datastores] 

202 childNames = ",".join(self.names) 

203 else: 

204 childNames = "(empty@{})".format(time.time()) 

205 self._names = [childNames] 

206 self.name = "{}[{}]".format(type(self).__qualname__, childNames) 

207 

208 # We declare we are ephemeral if all our child datastores declare 

209 # they are ephemeral 

210 isEphemeral = True 

211 for d in self.datastores: 

212 if not d.isEphemeral: 

213 isEphemeral = False 

214 break 

215 self.isEphemeral = isEphemeral 

216 

217 # per-datastore override constraints 

218 if "datastore_constraints" in self.config: 

219 overrides = self.config["datastore_constraints"] 

220 

221 if len(overrides) != len(self.datastores):    [221 ↛ 222: the condition on line 221 was never true]

222 raise DatastoreValidationError( 

223 f"Number of registered datastores ({len(self.datastores)})" 

224 " differs from number of constraints overrides" 

225 f" {len(overrides)}" 

226 ) 

227 

228 self.datastoreConstraints = [ 

229 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

230 ] 

231 

232 else: 

233 self.datastoreConstraints = (None,) * len(self.datastores) 

234 

235 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

236 
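# Illustrative sketch (editor addition): the chained datastore's name is built
# from its children, and it is ephemeral only if every child is. Child names
# here are hypothetical.
_example_child_names = ["InMemoryDatastore@cache", "FileDatastore@/repo"]
_example_child_ephemeral = [True, False]
_example_name = "ChainedDatastore[{}]".format(",".join(_example_child_names))
_example_is_ephemeral = all(_example_child_ephemeral)  # -> False, one child is permanent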

237 @property 

238 def names(self) -> Tuple[str, ...]: 

239 return tuple(self._names) 

240 

241 def __str__(self) -> str: 

242 chainName = ", ".join(str(ds) for ds in self.datastores) 

243 return chainName 

244 

245 def knows(self, ref: DatasetRef) -> bool: 

246 """Check if the dataset is known to any of the datastores. 

247 

248 Does not check for existence of any artifact. 

249 

250 Parameters 

251 ---------- 

252 ref : `DatasetRef` 

253 Reference to the required dataset. 

254 

255 Returns 

256 ------- 

257 exists : `bool` 

258 `True` if the dataset is known to the datastore. 

259 """ 

260 for datastore in self.datastores: 

261 if datastore.knows(ref): 

262 log.debug("%s known to datastore %s", ref, datastore.name) 

263 return True 

264 return False 

265 

266 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

267 # Docstring inherited from the base class. 

268 refs_known: dict[DatasetRef, bool] = {} 

269 for datastore in self.datastores: 

270 refs_known.update(datastore.knows_these(refs)) 

271 

272 # No need to check in next datastore for refs that are known. 

273 # We only update entries that were initially False. 

274 refs = [ref for ref, known in refs_known.items() if not known] 

275 

276 return refs_known 

277 
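# Illustrative sketch (editor addition): the narrowing pattern used by
# knows_these() and mexists() above; only refs still marked False are passed
# on to the next child datastore. The ref labels are placeholders.
_example_known = {"ref_a": True, "ref_b": False, "ref_c": False}
_example_still_unknown = [r for r, known in _example_known.items() if not known]
# -> ["ref_b", "ref_c"]; only these are queried in the next datastore.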

278 def mexists( 

279 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

280 ) -> Dict[DatasetRef, bool]: 

281 """Check the existence of multiple datasets at once. 

282 

283 Parameters 

284 ---------- 

285 refs : iterable of `DatasetRef` 

286 The datasets to be checked. 

287 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

288 Optional mapping of datastore artifact to existence. Updated by 

289 this method with details of all artifacts tested. Can be `None` 

290 if the caller is not interested. 

291 

292 Returns 

293 ------- 

294 existence : `dict` of [`DatasetRef`, `bool`] 

295 Mapping from dataset to boolean indicating existence in any 

296 of the child datastores. 

297 """ 

298 dataset_existence: Dict[DatasetRef, bool] = {} 

299 for datastore in self.datastores: 

300 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

301 

302 # For next datastore no point asking about ones we know 

303 # exist already. No special exemption for ephemeral datastores. 

304 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

305 

306 return dataset_existence 

307 

308 def exists(self, ref: DatasetRef) -> bool: 

309 """Check if the dataset exists in one of the datastores. 

310 

311 Parameters 

312 ---------- 

313 ref : `DatasetRef` 

314 Reference to the required dataset. 

315 

316 Returns 

317 ------- 

318 exists : `bool` 

319 `True` if the entity exists in one of the child datastores. 

320 """ 

321 for datastore in self.datastores: 

322 if datastore.exists(ref): 

323 log.debug("Found %s in datastore %s", ref, datastore.name) 

324 return True 

325 return False 

326 

327 def get( 

328 self, 

329 ref: DatasetRef, 

330 parameters: Optional[Mapping[str, Any]] = None, 

331 storageClass: Optional[Union[StorageClass, str]] = None, 

332 ) -> Any: 

333 """Load an InMemoryDataset from the store. 

334 

335 The dataset is returned from the first datastore that has 

336 the dataset. 

337 

338 Parameters 

339 ---------- 

340 ref : `DatasetRef` 

341 Reference to the required Dataset. 

342 parameters : `dict` 

343 `StorageClass`-specific parameters that specify, for example, 

344 a slice of the dataset to be loaded. 

345 storageClass : `StorageClass` or `str`, optional 

346 The storage class to be used to override the Python type 

347 returned by this method. By default the returned type matches 

348 the dataset type definition for this dataset. Specifying a 

349 read `StorageClass` can force a different type to be returned. 

350 This type must be compatible with the original type. 

351 

352 Returns 

353 ------- 

354 inMemoryDataset : `object` 

355 Requested dataset or slice thereof as an InMemoryDataset. 

356 

357 Raises 

358 ------ 

359 FileNotFoundError 

360 Requested dataset can not be retrieved. 

361 TypeError 

362 Return value from formatter has unexpected type. 

363 ValueError 

364 Formatter failed to process the dataset. 

365 """ 

366 

367 for datastore in self.datastores: 

368 try: 

369 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

370 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

371 return inMemoryObject 

372 except FileNotFoundError: 

373 pass 

374 

375 raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref)) 

376 

377 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

378 """Write a InMemoryDataset with a given `DatasetRef` to each 

379 datastore. 

380 

381 The put() to child datastores can fail with 

382 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

383 deemed to have succeeded so long as at least one child datastore 

384 accepted the inMemoryDataset. 

385 

386 Parameters 

387 ---------- 

388 inMemoryDataset : `object` 

389 The dataset to store. 

390 ref : `DatasetRef` 

391 Reference to the associated Dataset. 

392 

393 Raises 

394 ------ 

395 TypeError 

396 Supplied object and storage class are inconsistent. 

397 DatasetTypeNotSupportedError 

398 All datastores reported `DatasetTypeNotSupportedError`. 

399 """ 

400 log.debug("Put %s", ref) 

401 

402 # Confirm that we can accept this dataset 

403 if not self.constraints.isAcceptable(ref): 

404 # Raise rather than use boolean return value. 

405 raise DatasetTypeNotSupportedError( 

406 f"Dataset {ref} has been rejected by this datastore via configuration." 

407 ) 

408 

409 isPermanent = False 

410 nsuccess = 0 

411 npermanent = 0 

412 nephemeral = 0 

413 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

414 if ( 

415 constraints is not None and not constraints.isAcceptable(ref) 

416 ) or not datastore.constraints.isAcceptable(ref): 

417 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

418 continue 

419 

420 if datastore.isEphemeral: 

421 nephemeral += 1 

422 else: 

423 npermanent += 1 

424 try: 

425 datastore.put(inMemoryDataset, ref) 

426 nsuccess += 1 

427 if not datastore.isEphemeral: 

428 isPermanent = True 

429 except DatasetTypeNotSupportedError: 

430 pass 

431 

432 if nsuccess == 0: 

433 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

434 

435 if not isPermanent and npermanent > 0:    [435 ↛ 436: the condition on line 435 was never true]

436 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

437 

438 if self._transaction is not None: 

439 self._transaction.registerUndo("put", self.remove, ref) 

440 
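# Illustrative sketch (editor addition): a standalone model of the put()
# bookkeeping above, using plain dicts instead of real datastores.
_example_children = [
    {"ephemeral": True, "accepted_put": True},
    {"ephemeral": False, "accepted_put": False},
]
_example_nsuccess = sum(1 for c in _example_children if c["accepted_put"])
_example_is_permanent = any(c["accepted_put"] and not c["ephemeral"] for c in _example_children)
# _example_nsuccess == 0 would raise DatasetTypeNotSupportedError; a success
# confined to ephemeral children while a permanent child was attempted (as
# here) is what triggers the warning issued above.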

441 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]: 

442 # Docstring inherited from base class. 

443 if transfer != "auto": 

444 return transfer 

445 # Ask each datastore what they think auto means 

446 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

447 

448 # Remove any untranslated "auto" values 

449 transfers.discard(transfer) 

450 

451 if len(transfers) == 1:    [451 ↛ 452: the condition on line 451 was never true]

452 return transfers.pop() 

453 if not transfers:    [453 ↛ 457: the condition on line 453 was never false]

454 # Everything reported "auto" 

455 return transfer 

456 

457 raise RuntimeError( 

458 "Chained datastore does not yet support different transfer modes" 

459 f" from 'auto' in each child datastore (wanted {transfers})" 

460 ) 

461 
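# Illustrative sketch (editor addition): how "auto" is resolved across the
# children. Each child proposes a mode; duplicates collapse in the set.
_example_child_answers = {"auto", "copy"}
_example_child_answers.discard("auto")
# One distinct answer left -> use it; none left -> keep "auto"; several
# different answers -> the RuntimeError raised above.
_example_resolved = _example_child_answers.pop() if len(_example_child_answers) == 1 else "auto"
# -> "copy"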

462 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

463 # Docstring inherited from Datastore._prepIngest. 

464 if transfer is None: 

465 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

466 

467 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

468 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

469 if not acceptable: 

470 log.debug( 

471 "Datastore %s skipping ingest via configuration for refs %s", 

472 name, 

473 ", ".join(str(ref) for ref in dataset.refs), 

474 ) 

475 return False 

476 else: 

477 return True 

478 

479 # Filter down to just datasets the chained datastore's own 

480 # configuration accepts. 

481 okForParent: List[FileDataset] = [ 

482 dataset 

483 for dataset in datasets 

484 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

485 ] 

486 

487 # Iterate over nested datastores and call _prepIngest on each. 

488 # Save the results to a list: 

489 children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

490 # ...and remember whether all of the failures are due to 

491 # NotImplementedError being raised. 

492 allFailuresAreNotImplementedError = True 

493 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

494 okForChild: List[FileDataset] 

495 if constraints is not None: 

496 okForChild = [ 

497 dataset 

498 for dataset in okForParent 

499 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

500 ] 

501 else: 

502 okForChild = okForParent 

503 try: 

504 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

505 except NotImplementedError: 

506 log.debug( 

507 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

508 datastore.name, 

509 transfer, 

510 ) 

511 continue 

512 allFailuresAreNotImplementedError = False 

513 if okForChild: 

514 # Do not store for later if a datastore has rejected 

515 # everything. 

516 # Include the source paths if this is a "move". It's clearer 

517 # to find the paths now rather than try to infer how 

518 # each datastore has stored them in the internal prep class. 

519 paths = ( 

520 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set() 

521 ) 

522 children.append((datastore, prepDataForChild, paths)) 

523 if allFailuresAreNotImplementedError: 

524 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

525 return _IngestPrepData(children=children) 

526 

527 def _finishIngest( 

528 self, 

529 prepData: _IngestPrepData, 

530 *, 

531 transfer: Optional[str] = None, 

532 record_validation_info: bool = True, 

533 ) -> None: 

534 # Docstring inherited from Datastore._finishIngest. 

535 # For "move" we must use "copy" and then delete the input 

536 # data at the end. This has no rollback option if the ingest 

537 # subsequently fails. If there is only one active datastore 

538 # accepting any files we can leave it as "move" 

539 actual_transfer: str | None 

540 if transfer == "move" and len(prepData.children) > 1: 

541 actual_transfer = "copy" 

542 else: 

543 actual_transfer = transfer 

544 to_be_deleted: set[ResourcePath] = set() 

545 for datastore, prepDataForChild, paths in prepData.children: 

546 datastore._finishIngest( 

547 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

548 ) 

549 to_be_deleted.update(paths) 

550 if actual_transfer != transfer: 

551 # These datasets were copied but now need to be deleted. 

552 # This can not be rolled back. 

553 for uri in to_be_deleted: 

554 uri.remove() 

555 
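# Illustrative sketch (editor addition): the "move" handling above. With more
# than one ingesting child the transfer is downgraded to "copy" and the source
# files are deleted only after every child has finished.
_example_transfer = "move"
_example_n_ingesting_children = 2
_example_actual_transfer = (
    "copy" if _example_transfer == "move" and _example_n_ingesting_children > 1 else _example_transfer
)
# -> "copy"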

556 def getManyURIs( 

557 self, 

558 refs: Iterable[DatasetRef], 

559 predict: bool = False, 

560 allow_missing: bool = False, 

561 ) -> Dict[DatasetRef, DatasetRefURIs]: 

562 # Docstring inherited 

563 

564 uris: Dict[DatasetRef, DatasetRefURIs] = {} 

565 missing_refs = set(refs) 

566 

567 # If predict is True we don't want to predict a dataset in the first 

568 # datastore if it actually exists in a later datastore, so in that 

569 # case check all datastores with predict=False first, and then try 

570 # again with predict=True. 

571 for p in (False, True) if predict else (False,): 

572 if not missing_refs: 

573 break 

574 for datastore in self.datastores: 

575 try: 

576 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

577 except NotImplementedError: 

578 # some datastores may not implement generating URIs 

579 continue 

580 missing_refs -= got_uris.keys() 

581 uris.update(got_uris) 

582 if not missing_refs: 

583 break 

584 

585 if missing_refs and not allow_missing: 

586 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

587 

588 return uris 

589 
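# Illustrative sketch (editor addition): the two-pass pattern used by
# getManyURIs() above, so that a real URI in any child always wins over a
# predicted one.
_example_predict = True
_example_passes = (False, True) if _example_predict else (False,)
# First pass asks every child with predict=False; the optional second pass
# asks again, with predict=True, only for refs still unresolved.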

590 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

591 """Return URIs associated with dataset. 

592 

593 Parameters 

594 ---------- 

595 ref : `DatasetRef` 

596 Reference to the required dataset. 

597 predict : `bool`, optional 

598 If the datastore does not know about the dataset, should it 

599 return a predicted URI or not? 

600 

601 Returns 

602 ------- 

603 uris : `DatasetRefURIs` 

604 The URI to the primary artifact associated with this dataset (if 

605 the dataset was disassembled within the datastore this may be 

606 `None`), and the URIs to any components associated with the dataset 

607 artifact. (can be empty if there are no components). 

608 

609 Notes 

610 ----- 

611 The returned URI is from the first datastore in the list that has 

612 the dataset, with preference given to permanent datastores over 

613 ephemeral ones. If no datastore has the dataset and prediction 

614 is allowed, a predicted URI (preferring permanent datastores) will 

615 be returned. 

616 """ 

617 log.debug("Requesting URIs for %s", ref) 

618 predictedUri: Optional[DatasetRefURIs] = None 

619 predictedEphemeralUri: Optional[DatasetRefURIs] = None 

620 firstEphemeralUri: Optional[DatasetRefURIs] = None 

621 for datastore in self.datastores: 

622 if datastore.exists(ref): 

623 if not datastore.isEphemeral: 

624 uri = datastore.getURIs(ref) 

625 log.debug("Retrieved non-ephemeral URI: %s", uri) 

626 return uri 

627 elif not firstEphemeralUri: 

628 firstEphemeralUri = datastore.getURIs(ref) 

629 elif predict: 

630 if not predictedUri and not datastore.isEphemeral: 

631 predictedUri = datastore.getURIs(ref, predict) 

632 elif not predictedEphemeralUri and datastore.isEphemeral: 

633 predictedEphemeralUri = datastore.getURIs(ref, predict) 

634 

635 if firstEphemeralUri: 

636 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

637 return firstEphemeralUri 

638 

639 if predictedUri: 

640 log.debug("Retrieved predicted URI: %s", predictedUri) 

641 return predictedUri 

642 

643 if predictedEphemeralUri: 

644 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

645 return predictedEphemeralUri 

646 

647 raise FileNotFoundError("Dataset {} not in any datastore".format(ref)) 

648 
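# Illustrative sketch (editor addition): the preference order applied by
# getURIs() above. The URI strings are placeholders.
_example_order = ["existing_permanent", "existing_ephemeral", "predicted_permanent", "predicted_ephemeral"]
_example_found = {"existing_ephemeral": "mem://example", "predicted_permanent": "file://example#predicted"}
_example_choice = next(_example_found[k] for k in _example_order if k in _example_found)
# -> "mem://example": an existing ephemeral artifact beats any prediction.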

649 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

650 """URI to the Dataset. 

651 

652 The returned URI is from the first datastore in the list that has 

653 the dataset, with preference given to permanent datastores over 

654 ephemeral ones. If no datastore has the dataset and prediction 

655 is allowed, a predicted URI (preferring permanent datastores) will 

656 be returned. 

657 

658 Parameters 

659 ---------- 

660 ref : `DatasetRef` 

661 Reference to the required Dataset. 

662 predict : `bool` 

663 If `True`, allow URIs to be returned of datasets that have not 

664 been written. 

665 

666 Returns 

667 ------- 

668 uri : `lsst.resources.ResourcePath` 

669 URI pointing to the dataset within the datastore. If the 

670 dataset does not exist in the datastore, and if ``predict`` is 

671 `True`, the URI will be a prediction and will include a URI 

672 fragment "#predicted". 

673 

674 Notes 

675 ----- 

676 If the datastore does not have entities that relate well 

677 to the concept of a URI the returned URI string will be 

678 descriptive. The returned URI is not guaranteed to be obtainable. 

679 

680 Raises 

681 ------ 

682 FileNotFoundError 

683 A URI has been requested for a dataset that does not exist and 

684 guessing is not allowed. 

685 RuntimeError 

686 Raised if a request is made for a single URI but multiple URIs 

687 are associated with this dataset. 

688 """ 

689 log.debug("Requesting URI for %s", ref) 

690 primary, components = self.getURIs(ref, predict) 

691 if primary is None or components:    [691 ↛ 692: the condition on line 691 was never true]

692 raise RuntimeError( 

693 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

694 ) 

695 return primary 

696 

697 def retrieveArtifacts( 

698 self, 

699 refs: Iterable[DatasetRef], 

700 destination: ResourcePath, 

701 transfer: str = "auto", 

702 preserve_path: bool = True, 

703 overwrite: bool = False, 

704 ) -> List[ResourcePath]: 

705 """Retrieve the file artifacts associated with the supplied refs. 

706 

707 Parameters 

708 ---------- 

709 refs : iterable of `DatasetRef` 

710 The datasets for which file artifacts are to be retrieved. 

711 A single ref can result in multiple files. The refs must 

712 be resolved. 

713 destination : `lsst.resources.ResourcePath` 

714 Location to write the file artifacts. 

715 transfer : `str`, optional 

716 Method to use to transfer the artifacts. Must be one of the options 

717 supported by `lsst.resources.ResourcePath.transfer_from()`. 

718 "move" is not allowed. 

719 preserve_path : `bool`, optional 

720 If `True` the full path of the file artifact within the datastore 

721 is preserved. If `False` the final file component of the path 

722 is used. 

723 overwrite : `bool`, optional 

724 If `True` allow transfers to overwrite existing files at the 

725 destination. 

726 

727 Returns 

728 ------- 

729 targets : `list` of `lsst.resources.ResourcePath` 

730 URIs of file artifacts in destination location. Order is not 

731 preserved. 

732 """ 

733 if not destination.isdir():    [733 ↛ 734: the condition on line 733 was never true]

734 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

735 

736 # Using getURIs is not feasible since it becomes difficult to 

737 # determine the path within the datastore later on. For now 

738 # follow getURIs implementation approach. 

739 

740 pending = set(refs) 

741 

742 # There is a question as to whether an exception should be raised 

743 # early if some of the refs are missing, or whether files should be 

744 # transferred until a problem is hit. Prefer to complain up front. 

745 # Use the datastore integer as primary key. 

746 grouped_by_datastore: Dict[int, Set[DatasetRef]] = {} 

747 

748 for number, datastore in enumerate(self.datastores): 

749 if datastore.isEphemeral: 

750 # In the future we will want to distinguish in-memory from 

751 # caching datastore since using an on-disk local 

752 # cache is exactly what we should be doing. 

753 continue 

754 try: 

755 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

756 except NotImplementedError: 

757 # Some datastores may not support retrieving artifacts 

758 continue 

759 

760 if datastore_refs: 

761 grouped_by_datastore[number] = datastore_refs 

762 

763 # Remove these from the pending list so that we do not bother 

764 # looking for them any more. 

765 pending = pending - datastore_refs 

766 

767 if pending:    [767 ↛ 768: the condition on line 767 was never true]

768 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

769 

770 # Now do the transfer. 

771 targets: List[ResourcePath] = [] 

772 for number, datastore_refs in grouped_by_datastore.items(): 

773 targets.extend( 

774 self.datastores[number].retrieveArtifacts( 

775 datastore_refs, 

776 destination, 

777 transfer=transfer, 

778 preserve_path=preserve_path, 

779 overwrite=overwrite, 

780 ) 

781 ) 

782 

783 return targets 

784 
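# Illustrative sketch (editor addition): how retrieveArtifacts() above groups
# refs by the first non-ephemeral child that has them before transferring.
_example_pending = {"r0", "r1", "r2"}
_example_groups = {}
for _idx, _child_has in enumerate([{"r0"}, {"r1", "r2"}]):
    _found = _example_pending & _child_has
    if _found:
        _example_groups[_idx] = _found
        _example_pending -= _found
# _example_pending must be empty at this point, otherwise the RuntimeError
# above is raised before any transfer happens.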

785 def remove(self, ref: DatasetRef) -> None: 

786 """Indicate to the datastore that a dataset can be removed. 

787 

788 The dataset will be removed from each datastore. The dataset is 

789 not required to exist in every child datastore. 

790 

791 Parameters 

792 ---------- 

793 ref : `DatasetRef` 

794 Reference to the required dataset. 

795 

796 Raises 

797 ------ 

798 FileNotFoundError 

799 Attempt to remove a dataset that does not exist. Raised if none 

800 of the child datastores removed the dataset. 

801 """ 

802 log.debug("Removing %s", ref) 

803 self.trash(ref, ignore_errors=False) 

804 self.emptyTrash(ignore_errors=False) 

805 

806 def forget(self, refs: Iterable[DatasetRef]) -> None: 

807 for datastore in tuple(self.datastores): 

808 datastore.forget(refs) 

809 

810 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

811 if isinstance(ref, DatasetRef): 

812 ref_label = str(ref) 

813 else: 

814 ref_label = "bulk datasets" 

815 

816 log.debug("Trashing %s", ref_label) 

817 

818 counter = 0 

819 for datastore in self.datastores: 

820 try: 

821 datastore.trash(ref, ignore_errors=ignore_errors) 

822 counter += 1 

823 except FileNotFoundError: 

824 pass 

825 

826 if counter == 0: 

827 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

828 if ignore_errors:    [828 ↛ 829: the condition on line 828 was never true]

829 log.warning(err_msg) 

830 else: 

831 raise FileNotFoundError(err_msg) 

832 

833 def emptyTrash(self, ignore_errors: bool = True) -> None: 

834 for datastore in self.datastores: 

835 datastore.emptyTrash(ignore_errors=ignore_errors) 

836 

837 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

838 """Retrieve a dataset from an input `Datastore`, 

839 and store the result in this `Datastore`. 

840 

841 Parameters 

842 ---------- 

843 inputDatastore : `Datastore` 

844 The external `Datastore` from which to retrieve the Dataset. 

845 ref : `DatasetRef` 

846 Reference to the required dataset in the input data store. 

847 

848 Notes 

849 ----- 

850 The dataset is retrieved from ``inputDatastore`` with ``get()`` and 

851 then ``put()`` to every child datastore that accepts it; this 

852 method returns `None`. 

853 """ 

854 assert inputDatastore is not self # unless we want it for renames? 

855 inMemoryDataset = inputDatastore.get(ref) 

856 self.put(inMemoryDataset, ref) 

857 

858 def validateConfiguration( 

859 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

860 ) -> None: 

861 """Validate some of the configuration for this datastore. 

862 

863 Parameters 

864 ---------- 

865 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

866 Entities to test against this configuration. Can be differing 

867 types. 

868 logFailures : `bool`, optional 

869 If `True`, output a log message for every validation error 

870 detected. 

871 

872 Raises 

873 ------ 

874 DatastoreValidationError 

875 Raised if there is a validation problem with a configuration. 

876 All the problems are reported in a single exception. 

877 

878 Notes 

879 ----- 

880 This method checks each datastore in turn. 

881 """ 

882 

883 # Need to catch each of the datastore outputs and ensure that 

884 # all are tested. 

885 failures = [] 

886 for datastore in self.datastores: 

887 try: 

888 datastore.validateConfiguration(entities, logFailures=logFailures) 

889 except DatastoreValidationError as e: 

890 if logFailures:    [890 ↛ 892: the condition on line 890 was never false]

891 log.critical("Datastore %s failed validation", datastore.name) 

892 failures.append(f"Datastore {self.name}: {e}") 

893 

894 if failures: 

895 msg = ";\n".join(failures) 

896 raise DatastoreValidationError(msg) 

897 

898 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

899 # Docstring is inherited from base class 

900 failures = [] 

901 for datastore in self.datastores: 

902 try: 

903 datastore.validateKey(lookupKey, entity) 

904 except DatastoreValidationError as e: 

905 failures.append(f"Datastore {self.name}: {e}") 

906 

907 if failures: 

908 msg = ";\n".join(failures) 

909 raise DatastoreValidationError(msg) 

910 

911 def getLookupKeys(self) -> Set[LookupKey]: 

912 # Docstring is inherited from base class 

913 keys = set() 

914 for datastore in self.datastores: 

915 keys.update(datastore.getLookupKeys()) 

916 

917 keys.update(self.constraints.getLookupKeys()) 

918 for p in self.datastoreConstraints: 

919 if p is not None:    [919 ↛ 918: the condition on line 919 was never false]

920 keys.update(p.getLookupKeys()) 

921 

922 return keys 

923 

924 def needs_expanded_data_ids( 

925 self, 

926 transfer: Optional[str], 

927 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

928 ) -> bool: 

929 # Docstring inherited. 

930 # We can't safely use `self.datastoreConstraints` with `entity` to 

931 # check whether a child datastore would even want to ingest this 

932 # dataset, because we don't want to filter out datastores that might 

933 # need an expanded data ID based on incomplete information (e.g. we 

934 # pass a StorageClass, but the constraint dispatches on DatasetType). 

935 # So we pessimistically check if any datastore would need an expanded 

936 # data ID for this transfer mode. 

937 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

938 

939 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

940 # Docstring inherited from the base class. 

941 

942 for datastore in self.datastores: 

943 datastore.import_records(data) 

944 

945 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

946 # Docstring inherited from the base class. 

947 

948 all_records: Dict[str, DatastoreRecordData] = {} 

949 

950 # Merge all sub-datastore records into one structure 

951 for datastore in self.datastores: 

952 sub_records = datastore.export_records(refs) 

953 for name, record_data in sub_records.items(): 

954 # All datastore names must be unique in a chain. 

955 if name in all_records:    [955 ↛ 956: the condition on line 955 was never true]

956 raise ValueError(f"Non-unique datastore name found in datastore {datastore}") 

957 all_records[name] = record_data 

958 

959 return all_records 

960 

961 def export( 

962 self, 

963 refs: Iterable[DatasetRef], 

964 *, 

965 directory: Optional[ResourcePathExpression] = None, 

966 transfer: Optional[str] = "auto", 

967 ) -> Iterable[FileDataset]: 

968 # Docstring inherited from Datastore.export. 

969 if transfer == "auto" and directory is None: 

970 transfer = None 

971 

972 if transfer is not None and directory is None: 

973 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

974 

975 if transfer == "move": 

976 raise TypeError("Can not export by moving files out of datastore.") 

977 

978 # Exporting from a chain has the potential for a dataset to be 

979 # in one or more of the datastores in the chain. We only need one 

980 # of them since we assume the datasets are the same in all (but 

981 # the file format could be different of course since that is a 

982 # per-datastore configuration). 

983 # We also do not know whether any of the datastores in the chain 

984 # support file export. 

985 

986 # Ensure we have an ordered sequence that is not an iterator or set. 

987 if not isinstance(refs, Sequence): 

988 refs = list(refs) 

989 

990 # If any of the datasets are missing entirely we need to raise early 

991 # before we try to run the export. This can be a little messy but is 

992 # better than exporting files from the first datastore and only then 

993 # finding that a dataset missing from it is not in the second datastore either. 

994 known = [datastore.knows_these(refs) for datastore in self.datastores] 

995 refs_known: set[DatasetRef] = set() 

996 for known_to_this in known: 

997 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

998 missing_count = len(refs) - len(refs_known) 

999 if missing_count: 

1000 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1001 

1002 # To allow us to slot each result into the right place after 

1003 # asking each datastore, create a dict with the index. 

1004 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1005 

1006 # Presize the final export list. 

1007 exported: list[FileDataset | None] = [None] * len(refs) 

1008 

1009 # The order of the returned dataset has to match the order of the 

1010 # given refs, even if they are all from different datastores. 

1011 for i, datastore in enumerate(self.datastores): 

1012 known_to_this = known[i] 

1013 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1014 

1015 try: 

1016 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1017 except NotImplementedError: 

1018 # Try the next datastore. 

1019 continue 

1020 

1021 for ref, export in zip(filtered, this_export): 

1022 # Get the position and also delete it from the list. 

1023 exported[ref_positions.pop(ref)] = export 

1024 

1025 # Every dataset should be accounted for because of the earlier checks 

1026 # but make sure that we did fill all the slots to appease mypy. 

1027 for i, dataset in enumerate(exported): 

1028 if dataset is None:    [1028 ↛ 1029: the condition on line 1028 was never true]

1029 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1030 yield dataset 

1031 
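# Illustrative sketch (editor addition): how export() above slots results from
# different children back into the caller's original ref order.
_example_refs = ["r0", "r1", "r2"]
_example_positions = {ref: i for i, ref in enumerate(_example_refs)}
_example_exported = [None] * len(_example_refs)
for _ref, _result in [("r2", "from_ds_a"), ("r0", "from_ds_b"), ("r1", "from_ds_b")]:
    _example_exported[_example_positions.pop(_ref)] = _result
# -> ["from_ds_b", "from_ds_b", "from_ds_a"], matching the input ref order.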

1032 def transfer_from( 

1033 self, 

1034 source_datastore: Datastore, 

1035 refs: Iterable[DatasetRef], 

1036 transfer: str = "auto", 

1037 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1038 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1039 # Docstring inherited 

1040 # mypy does not understand "type(self) is not type(source)" 

1041 if isinstance(source_datastore, ChainedDatastore): 

1042 # Both the source and destination are chained datastores. 

1043 source_datastores = tuple(source_datastore.datastores) 

1044 else: 

1045 # The source datastore is different, forward everything to the 

1046 # child datastores. 

1047 source_datastores = tuple([source_datastore]) 

1048 

1049 # Need to know the set of all possible refs that could be transferred. 

1050 remaining_refs = set(refs) 

1051 

1052 missing_from_source: set[DatasetRef] | None = None 

1053 all_accepted = set() 

1054 nsuccess = 0 

1055 for source_child in source_datastores: 

1056 # If we are reading from a chained datastore, it's possible that 

1057 # only a subset of the datastores know about the dataset. We can't 

1058 # ask the receiving datastore to copy it when it doesn't exist 

1059 # so we have to filter again based on what the source datastore 

1060 # understands. 

1061 known_to_source = source_child.knows_these([ref for ref in refs]) 

1062 

1063 # Need to know that there is a possibility that some of these 

1064 # datasets exist but are unknown to the source datastore if 

1065 # trust is enabled. 

1066 if getattr(source_child, "trustGetRequest", False): 

1067 unknown = [ref for ref, known in known_to_source.items() if not known] 

1068 existence = source_child.mexists(unknown, artifact_existence) 

1069 for ref, exists in existence.items(): 

1070 known_to_source[ref] = exists 

1071 

1072 missing = {ref for ref, known in known_to_source.items() if not known} 

1073 if missing: 

1074 if missing_from_source is None: 

1075 missing_from_source = missing 

1076 else: 

1077 missing_from_source &= missing 

1078 

1079 # Try to transfer from each source datastore to each child 

1080 # datastore. Have to make sure we don't transfer something 

1081 # we've already transferred to this destination on later passes. 

1082 

1083 # Filter the initial list based on the datasets we have 

1084 # not yet transferred. 

1085 these_refs = [] 

1086 for ref in refs: 

1087 if ref in remaining_refs and known_to_source[ref]: 

1088 these_refs.append(ref) 

1089 

1090 if not these_refs: 

1091 # Already transferred all datasets known to this datastore. 

1092 continue 

1093 

1094 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

1095 if constraints is not None:    [1095 ↛ 1103: the condition on line 1095 was never false]

1096 filtered_refs = [] 

1097 for ref in these_refs: 

1098 if constraints.isAcceptable(ref): 

1099 filtered_refs.append(ref) 

1100 else: 

1101 log.debug("Rejecting ref by constraints: %s", ref) 

1102 else: 

1103 filtered_refs = [ref for ref in these_refs] 

1104 try: 

1105 accepted, _ = datastore.transfer_from( 

1106 source_child, filtered_refs, transfer, artifact_existence 

1107 ) 

1108 except (TypeError, NotImplementedError): 

1109 # The datastores were incompatible. 

1110 continue 

1111 else: 

1112 nsuccess += 1 

1113 

1114 # Remove the accepted datasets from those remaining. 

1115 remaining_refs = remaining_refs - accepted 

1116 

1117 # Keep track of everything we have accepted. 

1118 all_accepted.update(accepted) 

1119 

1120 if missing_from_source: 

1121 for ref in missing_from_source: 

1122 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1123 

1124 if nsuccess == 0:    [1124 ↛ 1125: the condition on line 1124 was never true]

1125 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1126 

1127 return all_accepted, remaining_refs
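# Illustrative sketch (editor addition): the set bookkeeping used by
# transfer_from() above. Ref labels are placeholders.
_example_requested = {"r0", "r1", "r2"}
_example_accepted_by_child = {"r0", "r2"}
_example_remaining = _example_requested - _example_accepted_by_child
# -> {"r1"}: anything still remaining is offered to later (source, child)
# pairings, and whatever is left at the end is reported back to the caller.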