Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 91%

319 statements  

coverage.py v6.5.0, created at 2022-10-07 02:46 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Chained datastore.""" 

25 

26__all__ = ("ChainedDatastore",) 

27 

28import itertools 

29import logging 

30import time 

31import warnings 

32from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union 

33 

34from lsst.daf.butler import ( 

35 Constraints, 

36 DatasetRef, 

37 DatasetRefURIs, 

38 DatasetTypeNotSupportedError, 

39 Datastore, 

40 DatastoreConfig, 

41 DatastoreRecordData, 

42 DatastoreValidationError, 

43 FileDataset, 

44) 

45from lsst.resources import ResourcePath 

46from lsst.utils import doImportType 

47 

48 if TYPE_CHECKING:  [48 ↛ 49: line 48 didn't jump to line 49, because the condition on line 48 was never true]

49 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

51 

52log = logging.getLogger(__name__) 

53 

54 

55class _IngestPrepData(Datastore.IngestPrepData): 

56 """Helper class for ChainedDatastore ingest implementation. 

57 

58 Parameters 

59 ---------- 

60 children : `list` of `tuple` 

61 Pairs of `Datastore`, `IngestPrepData` for all child datastores. 

62 """ 

63 

64 def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]): 

65 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children)) 

66 self.children = children 

67 

68 

69class ChainedDatastore(Datastore): 

70 """Chained Datastores to allow read and writes from multiple datastores. 

71 

72 A ChainedDatastore is configured with multiple datastore configurations. 

73 A ``put()`` is always sent to each datastore. A ``get()`` 

74 operation is sent to each datastore in turn and the first datastore 

75 to return a valid dataset is used. 

76 

77 Parameters 

78 ---------- 

79 config : `DatastoreConfig` or `str` 

80 Configuration. This configuration must include a ``datastores`` field 

81 as a sequence of datastore configurations. The order in this sequence 

82 indicates the order to use for read operations. 

83 bridgeManager : `DatastoreRegistryBridgeManager` 

84 Object that manages the interface between `Registry` and datastores. 

85 butlerRoot : `str`, optional 

86 New datastore root to use to override the configuration value. This 

87 root is sent to each child datastore. 

88 

89 Notes 

90 ----- 

91 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer 

92 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` 

93 and `"hardlink"` if and only if all its child datastores do. 

94 """ 
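
# Illustrative usage sketch (not part of this module): a ChainedDatastore is
# normally constructed from configuration by the Butler machinery, but the
# read/write behaviour described above can be pictured as follows (the
# ``config``, ``bridgeManager``, ``obj`` and ``ref`` objects are assumed to
# already exist):
#
#     datastore = ChainedDatastore(config, bridgeManager, butlerRoot="/repo")
#     datastore.put(obj, ref)        # written to every child that accepts the ref
#     obj_back = datastore.get(ref)  # served by the first child holding the dataset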

95 

96 defaultConfigFile = "datastores/chainedDatastore.yaml" 

97 """Path to configuration defaults. Accessed within the ``configs`` resource 

98 or relative to a search path. Can be `None` if no defaults are specified.

99 """ 

100 

101 containerKey = "datastores" 

102 """Key to specify where child datastores are configured.""" 

103 

104 datastores: List[Datastore] 

105 """All the child datastores known to this datastore.""" 

106 

107 datastoreConstraints: Sequence[Optional[Constraints]] 

108 """Constraints to be applied to each of the child datastores.""" 

109 

110 @classmethod 

111 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

112 """Set any filesystem-dependent config options for child Datastores to 

113 be appropriate for a new empty repository with the given root. 

114 

115 Parameters 

116 ---------- 

117 root : `str` 

118 Filesystem path to the root of the data repository. 

119 config : `Config` 

120 A `Config` to update. Only the subset understood by 

121 this component will be updated. Will not expand 

122 defaults. 

123 full : `Config` 

124 A complete config with all defaults expanded that can be 

125 converted to a `DatastoreConfig`. Read-only and will not be 

126 modified by this method. 

127 Repository-specific options that should not be obtained 

128 from defaults when Butler instances are constructed 

129 should be copied from ``full`` to ``config``. 

130 overwrite : `bool`, optional 

131 If `False`, do not modify a value in ``config`` if the value 

132 already exists. Default is always to overwrite with the provided 

133 ``root``. 

134 

135 Notes 

136 ----- 

137 If a keyword is explicitly defined in the supplied ``config`` it 

138 will not be overridden by this method if ``overwrite`` is `False`. 

139 This allows explicit values set in external configs to be retained. 

140 """ 
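
# For example (matching the ``newroot`` computation below), with
# ``root="/repo"`` a FileDatastore child at index 1 would have its
# configuration root set to "/repo/FileDatastore_1".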

141 

142 # Extract the part of the config we care about updating 

143 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

144 

145 # And the subset of the full config that we can use for reference. 

146 # Do not bother with defaults because we are told this already has 

147 # them. 

148 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

149 

150 # Loop over each datastore config and pass the subsets to the 

151 # child datastores to process. 

152 

153 containerKey = cls.containerKey 

154 for idx, (child, fullChild) in enumerate( 

155 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey]) 

156 ): 

157 childConfig = DatastoreConfig(child, mergeDefaults=False) 

158 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

159 datastoreClass = doImportType(fullChildConfig["cls"]) 

160 if not issubclass(datastoreClass, Datastore):  [160 ↛ 161: line 160 didn't jump to line 161, because the condition on line 160 was never true]

161 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

162 newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx) 

163 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

164 

165 # Reattach to parent 

166 datastoreConfig[containerKey, idx] = childConfig 

167 

168 # Reattach modified datastore config to parent 

169 # If this has a datastore key we attach there, otherwise we assume 

170 # this information goes at the top of the config hierarchy. 

171 if DatastoreConfig.component in config: 

172 config[DatastoreConfig.component] = datastoreConfig 

173 else: 

174 config.update(datastoreConfig) 

175 

176 return 

177 

178 def __init__( 

179 self, 

180 config: Union[Config, str], 

181 bridgeManager: DatastoreRegistryBridgeManager, 

182 butlerRoot: Optional[str] = None,

183 ): 

184 super().__init__(config, bridgeManager) 

185 

186 # Scan for child datastores and instantiate them with the same registry 

187 self.datastores = [] 

188 for c in self.config["datastores"]: 

189 c = DatastoreConfig(c) 

190 datastoreType = doImportType(c["cls"]) 

191 if not issubclass(datastoreType, Datastore):  [191 ↛ 192: line 191 didn't jump to line 192, because the condition on line 191 was never true]

192 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

193 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

194 log.debug("Creating child datastore %s", datastore.name) 

195 self.datastores.append(datastore) 

196 

197 # Name ourself based on our children 

198 if self.datastores:  [198 ↛ 203: line 198 didn't jump to line 203, because the condition on line 198 was never false]

199 # We must set the names explicitly 

200 self._names = [d.name for d in self.datastores] 

201 childNames = ",".join(self.names) 

202 else: 

203 childNames = "(empty@{})".format(time.time()) 

204 self._names = [childNames] 

205 self.name = "{}[{}]".format(type(self).__qualname__, childNames) 

206 

207 # We declare we are ephemeral if all our child datastores declare 

208 # they are ephemeral 

209 isEphemeral = True 

210 for d in self.datastores: 

211 if not d.isEphemeral: 

212 isEphemeral = False 

213 break 

214 self.isEphemeral = isEphemeral 

215 

216 # per-datastore override constraints 

217 if "datastore_constraints" in self.config: 

218 overrides = self.config["datastore_constraints"] 

219 

220 if len(overrides) != len(self.datastores):  [220 ↛ 221: line 220 didn't jump to line 221, because the condition on line 220 was never true]

221 raise DatastoreValidationError( 

222 f"Number of registered datastores ({len(self.datastores)})" 

223 " differs from number of constraints overrides" 

224 f" {len(overrides)}" 

225 ) 

226 

227 self.datastoreConstraints = [ 

228 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

229 ] 

230 

231 else: 

232 self.datastoreConstraints = (None,) * len(self.datastores) 

233 

234 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

235 

236 @property 

237 def names(self) -> Tuple[str, ...]: 

238 return tuple(self._names) 

239 

240 def __str__(self) -> str: 

241 chainName = ", ".join(str(ds) for ds in self.datastores) 

242 return chainName 

243 

244 def knows(self, ref: DatasetRef) -> bool: 

245 """Check if the dataset is known to any of the datastores. 

246 

247 Does not check for existence of any artifact. 

248 

249 Parameters 

250 ---------- 

251 ref : `DatasetRef` 

252 Reference to the required dataset. 

253 

254 Returns 

255 ------- 

256 exists : `bool` 

257 `True` if the dataset is known to the datastore. 

258 """ 

259 for datastore in self.datastores: 

260 if datastore.knows(ref): 

261 log.debug("%s known to datastore %s", ref, datastore.name) 

262 return True 

263 return False 

264 

265 def mexists( 

266 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

267 ) -> Dict[DatasetRef, bool]: 

268 """Check the existence of multiple datasets at once. 

269 

270 Parameters 

271 ---------- 

272 refs : iterable of `DatasetRef` 

273 The datasets to be checked. 

274 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

275 Optional mapping of datastore artifact to existence. Updated by 

276 this method with details of all artifacts tested. Can be `None` 

277 if the caller is not interested. 

278 

279 Returns 

280 ------- 

281 existence : `dict` of [`DatasetRef`, `bool`] 

282 Mapping from dataset to boolean indicating existence in any 

283 of the child datastores. 

284 """ 
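
# Minimal usage sketch (``refs`` assumed to be resolved DatasetRefs):
#
#     existence = datastore.mexists(refs)
#     missing = [ref for ref, found in existence.items() if not found]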

285 dataset_existence: Dict[DatasetRef, bool] = {} 

286 for datastore in self.datastores: 

287 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

288 

289 # For the next datastore there is no point asking about datasets we

290 # already know exist. No special exemption for ephemeral datastores.

291 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

292 

293 return dataset_existence 

294 

295 def exists(self, ref: DatasetRef) -> bool: 

296 """Check if the dataset exists in one of the datastores. 

297 

298 Parameters 

299 ---------- 

300 ref : `DatasetRef` 

301 Reference to the required dataset. 

302 

303 Returns 

304 ------- 

305 exists : `bool` 

306 `True` if the entity exists in one of the child datastores. 

307 """ 

308 for datastore in self.datastores: 

309 if datastore.exists(ref): 

310 log.debug("Found %s in datastore %s", ref, datastore.name) 

311 return True 

312 return False 

313 

314 def get( 

315 self, 

316 ref: DatasetRef, 

317 parameters: Optional[Mapping[str, Any]] = None, 

318 storageClass: Optional[Union[StorageClass, str]] = None, 

319 ) -> Any: 

320 """Load an InMemoryDataset from the store. 

321 

322 The dataset is returned from the first datastore that has 

323 the dataset. 

324 

325 Parameters 

326 ---------- 

327 ref : `DatasetRef` 

328 Reference to the required Dataset. 

329 parameters : `dict` 

330 `StorageClass`-specific parameters that specify, for example, 

331 a slice of the dataset to be loaded. 

332 storageClass : `StorageClass` or `str`, optional 

333 The storage class to be used to override the Python type 

334 returned by this method. By default the returned type matches 

335 the dataset type definition for this dataset. Specifying a 

336 read `StorageClass` can force a different type to be returned. 

337 This type must be compatible with the original type. 

338 

339 Returns 

340 ------- 

341 inMemoryDataset : `object` 

342 Requested dataset or slice thereof as an InMemoryDataset. 

343 

344 Raises 

345 ------ 

346 FileNotFoundError 

347 Requested dataset cannot be retrieved from any child datastore.

348 TypeError 

349 Return value from formatter has unexpected type. 

350 ValueError 

351 Formatter failed to process the dataset. 

352 """ 
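
# Usage sketch (the parameter name "bbox" is purely illustrative and depends
# on the dataset's StorageClass):
#
#     subset = datastore.get(ref, parameters={"bbox": bbox})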

353 

354 for datastore in self.datastores: 

355 try: 

356 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

357 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

358 return inMemoryObject 

359 except FileNotFoundError: 

360 pass 

361 

362 raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref)) 

363 

364 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

365 """Write a InMemoryDataset with a given `DatasetRef` to each 

366 datastore. 

367 

368 The put() to child datastores can fail with 

369 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

370 deemed to have succeeded so long as at least one child datastore 

371 accepted the inMemoryDataset. 

372 

373 Parameters 

374 ---------- 

375 inMemoryDataset : `object` 

376 The dataset to store. 

377 ref : `DatasetRef` 

378 Reference to the associated Dataset. 

379 

380 Raises 

381 ------ 

382 TypeError 

383 Supplied object and storage class are inconsistent. 

384 DatasetTypeNotSupportedError 

385 All datastores reported `DatasetTypeNotSupportedError`. 

386 """ 
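
# Usage sketch showing the failure mode described above:
#
#     try:
#         datastore.put(obj, ref)
#     except DatasetTypeNotSupportedError:
#         pass  # no child datastore accepted this dataset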

387 log.debug("Put %s", ref) 

388 

389 # Confirm that we can accept this dataset 

390 if not self.constraints.isAcceptable(ref): 

391 # Raise rather than use boolean return value. 

392 raise DatasetTypeNotSupportedError( 

393 f"Dataset {ref} has been rejected by this datastore via configuration." 

394 ) 

395 

396 isPermanent = False 

397 nsuccess = 0 

398 npermanent = 0 

399 nephemeral = 0 

400 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

401 if constraints is not None and not constraints.isAcceptable(ref): 

402 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

403 continue 

404 

405 if datastore.isEphemeral: 

406 nephemeral += 1 

407 else: 

408 npermanent += 1 

409 try: 

410 datastore.put(inMemoryDataset, ref) 

411 nsuccess += 1 

412 if not datastore.isEphemeral: 

413 isPermanent = True 

414 except DatasetTypeNotSupportedError: 

415 pass 

416 

417 if nsuccess == 0: 

418 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

419 

420 if not isPermanent and npermanent > 0:  [420 ↛ 421: line 420 didn't jump to line 421, because the condition on line 420 was never true]

421 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

422 

423 if self._transaction is not None: 

424 self._transaction.registerUndo("put", self.remove, ref) 

425 

426 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]: 

427 # Docstring inherited from base class. 

428 if transfer != "auto": 

429 return transfer 

430 # Ask each datastore what they think auto means 

431 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

432 

433 # Remove any untranslated "auto" values 

434 transfers.discard(transfer) 

435 

436 if len(transfers) == 1:  [436 ↛ 437: line 436 didn't jump to line 437, because the condition on line 436 was never true]

437 return transfers.pop() 

438 if not transfers:  [438 ↛ 442: line 438 didn't jump to line 442, because the condition on line 438 was never false]

439 # Everything reported "auto" 

440 return transfer 

441 

442 raise RuntimeError( 

443 "Chained datastore does not yet support different transfer modes" 

444 f" from 'auto' in each child datastore (wanted {transfers})" 

445 ) 

446 

447 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

448 # Docstring inherited from Datastore._prepIngest. 

449 if transfer is None or transfer == "move": 

450 raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.") 

451 

452 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

453 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

454 if not acceptable: 

455 log.debug( 

456 "Datastore %s skipping ingest via configuration for refs %s", 

457 name, 

458 ", ".join(str(ref) for ref in dataset.refs), 

459 ) 

460 return False 

461 else: 

462 return True 

463 

464 # Filter down to just datasets the chained datastore's own 

465 # configuration accepts. 

466 okForParent: List[FileDataset] = [ 

467 dataset 

468 for dataset in datasets 

469 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

470 ] 

471 

472 # Iterate over nested datastores and call _prepIngest on each. 

473 # Save the results to a list: 

474 children: List[Tuple[Datastore, Datastore.IngestPrepData]] = [] 

475 # ...and remember whether all of the failures are due to 

476 # NotImplementedError being raised. 

477 allFailuresAreNotImplementedError = True 

478 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

479 okForChild: List[FileDataset] 

480 if constraints is not None: 

481 okForChild = [ 

482 dataset 

483 for dataset in okForParent 

484 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

485 ] 

486 else: 

487 okForChild = okForParent 

488 try: 

489 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

490 except NotImplementedError: 

491 log.debug( 

492 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

493 datastore.name, 

494 transfer, 

495 ) 

496 continue 

497 allFailuresAreNotImplementedError = False 

498 children.append((datastore, prepDataForChild)) 

499 if allFailuresAreNotImplementedError: 

500 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

501 return _IngestPrepData(children=children) 

502 

503 def _finishIngest( 

504 self, 

505 prepData: _IngestPrepData, 

506 *, 

507 transfer: Optional[str] = None, 

508 record_validation_info: bool = True, 

509 ) -> None: 

510 # Docstring inherited from Datastore._finishIngest. 

511 for datastore, prepDataForChild in prepData.children: 

512 datastore._finishIngest( 

513 prepDataForChild, transfer=transfer, record_validation_info=record_validation_info 

514 ) 

515 

516 def getManyURIs( 

517 self, 

518 refs: Iterable[DatasetRef], 

519 predict: bool = False, 

520 allow_missing: bool = False, 

521 ) -> Dict[DatasetRef, DatasetRefURIs]: 

522 # Docstring inherited 

523 

524 uris: Dict[DatasetRef, DatasetRefURIs] = {} 

525 missing_refs = set(refs) 

526 

527 # If predict is True we don't want to predict a dataset in the first 

528 # datastore if it actually exists in a later datastore, so in that 

529 # case check all datastores with predict=False first, and then try 

530 # again with predict=True. 

531 for p in (False, True) if predict else (False,): 

532 if not missing_refs: 

533 break 

534 for datastore in self.datastores: 

535 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

536 missing_refs -= got_uris.keys() 

537 uris.update(got_uris) 

538 if not missing_refs: 

539 break 

540 

541 if missing_refs and not allow_missing: 

542 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

543 

544 return uris 

545 

546 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

547 """Return URIs associated with dataset. 

548 

549 Parameters 

550 ---------- 

551 ref : `DatasetRef` 

552 Reference to the required dataset. 

553 predict : `bool`, optional 

554 If the datastore does not know about the dataset, should it 

555 return a predicted URI or not? 

556 

557 Returns 

558 ------- 

559 uris : `DatasetRefURIs` 

560 The URI to the primary artifact associated with this dataset (if 

561 the dataset was disassembled within the datastore this may be 

562 `None`), and the URIs to any components associated with the dataset 

563 artifact (which can be empty if there are no components).

564 

565 Notes 

566 ----- 

567 The returned URIs are from the first datastore in the chain that has

568 the dataset, with preference given to permanent datastores over

569 ephemeral ones. If no datastore has the dataset and prediction is

570 allowed, predicted URIs are returned, again preferring a permanent

571 datastore over an ephemeral one.

572 """ 
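
# Usage sketch (mirroring how ``getURI`` below unpacks the result):
#
#     primary, components = datastore.getURIs(ref)
#     if primary is None:
#         pass  # dataset was disassembled; look at ``components`` instead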

573 log.debug("Requesting URIs for %s", ref) 

574 predictedUri: Optional[DatasetRefURIs] = None 

575 predictedEphemeralUri: Optional[DatasetRefURIs] = None 

576 firstEphemeralUri: Optional[DatasetRefURIs] = None 

577 for datastore in self.datastores: 

578 if datastore.exists(ref): 

579 if not datastore.isEphemeral: 

580 uri = datastore.getURIs(ref) 

581 log.debug("Retrieved non-ephemeral URI: %s", uri) 

582 return uri 

583 elif not firstEphemeralUri: 

584 firstEphemeralUri = datastore.getURIs(ref) 

585 elif predict: 

586 if not predictedUri and not datastore.isEphemeral: 

587 predictedUri = datastore.getURIs(ref, predict) 

588 elif not predictedEphemeralUri and datastore.isEphemeral: 

589 predictedEphemeralUri = datastore.getURIs(ref, predict) 

590 

591 if firstEphemeralUri: 

592 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

593 return firstEphemeralUri 

594 

595 if predictedUri: 

596 log.debug("Retrieved predicted URI: %s", predictedUri) 

597 return predictedUri 

598 

599 if predictedEphemeralUri: 

600 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

601 return predictedEphemeralUri 

602 

603 raise FileNotFoundError("Dataset {} not in any datastore".format(ref)) 

604 

605 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

606 """URI to the Dataset. 

607 

608 The returned URI is from the first datastore in the chain that has

609 the dataset, with preference given to permanent datastores over

610 ephemeral ones. If no datastore has the dataset and prediction is

611 allowed, a predicted URI is returned, again preferring a permanent

612 datastore over an ephemeral one.

613 

614 Parameters 

615 ---------- 

616 ref : `DatasetRef` 

617 Reference to the required Dataset. 

618 predict : `bool` 

619 If `True`, allow URIs to be returned of datasets that have not 

620 been written. 

621 

622 Returns 

623 ------- 

624 uri : `lsst.resources.ResourcePath` 

625 URI pointing to the dataset within the datastore. If the 

626 dataset does not exist in the datastore, and if ``predict`` is 

627 `True`, the URI will be a prediction and will include a URI 

628 fragment "#predicted". 

629 

630 Notes 

631 ----- 

632 If the datastore does not have entities that relate well 

633 to the concept of a URI the returned URI string will be 

634 descriptive. The returned URI is not guaranteed to be obtainable. 

635 

636 Raises 

637 ------ 

638 FileNotFoundError 

639 A URI has been requested for a dataset that does not exist and 

640 guessing is not allowed. 

641 RuntimeError 

642 Raised if a request is made for a single URI but multiple URIs 

643 are associated with this dataset. 

644 """ 
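
# Usage sketch for the prediction behaviour described above:
#
#     uri = datastore.getURI(ref, predict=True)
#     if str(uri).endswith("#predicted"):
#         pass  # artifact has not actually been written yet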

645 log.debug("Requesting URI for %s", ref) 

646 primary, components = self.getURIs(ref, predict) 

647 if primary is None or components:  [647 ↛ 648: line 647 didn't jump to line 648, because the condition on line 647 was never true]

648 raise RuntimeError( 

649 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

650 ) 

651 return primary 

652 

653 def retrieveArtifacts( 

654 self, 

655 refs: Iterable[DatasetRef], 

656 destination: ResourcePath, 

657 transfer: str = "auto", 

658 preserve_path: bool = True, 

659 overwrite: bool = False, 

660 ) -> List[ResourcePath]: 

661 """Retrieve the file artifacts associated with the supplied refs. 

662 

663 Parameters 

664 ---------- 

665 refs : iterable of `DatasetRef` 

666 The datasets for which file artifacts are to be retrieved. 

667 A single ref can result in multiple files. The refs must 

668 be resolved. 

669 destination : `lsst.resources.ResourcePath` 

670 Location to write the file artifacts. 

671 transfer : `str`, optional 

672 Method to use to transfer the artifacts. Must be one of the options 

673 supported by `lsst.resources.ResourcePath.transfer_from()`. 

674 "move" is not allowed. 

675 preserve_path : `bool`, optional 

676 If `True` the full path of the file artifact within the datastore 

677 is preserved. If `False` the final file component of the path 

678 is used. 

679 overwrite : `bool`, optional 

680 If `True` allow transfers to overwrite existing files at the 

681 destination. 

682 

683 Returns 

684 ------- 

685 targets : `list` of `lsst.resources.ResourcePath` 

686 URIs of file artifacts in destination location. Order is not 

687 preserved. 

688 """ 
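
# Usage sketch (the destination directory name is illustrative):
#
#     dest = ResourcePath("/tmp/export/", forceDirectory=True)
#     copied = datastore.retrieveArtifacts(refs, dest, transfer="copy")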

689 if not destination.isdir():  [689 ↛ 690: line 689 didn't jump to line 690, because the condition on line 689 was never true]

690 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

691 

692 # Using getURIs is not feasible since it becomes difficult to 

693 # determine the path within the datastore later on. For now 

694 # follow getURIs implementation approach. 

695 

696 pending = set(refs) 

697 

698 # There is a question as to whether an exception should be raised 

699 # early if some of the refs are missing, or whether files should be 

700 # transferred until a problem is hit. Prefer to complain up front. 

701 # Use the datastore's position in the chain as the dictionary key.

702 grouped_by_datastore: Dict[int, Set[DatasetRef]] = {} 

703 

704 for number, datastore in enumerate(self.datastores): 

705 if datastore.isEphemeral: 

706 # In the future we will want to distinguish in-memory from 

707 # caching datastore since using an on-disk local 

708 # cache is exactly what we should be doing. 

709 continue 

710 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

711 

712 if datastore_refs: 

713 grouped_by_datastore[number] = datastore_refs 

714 

715 # Remove these from the pending list so that we do not bother 

716 # looking for them any more. 

717 pending = pending - datastore_refs 

718 

719 if pending:  [719 ↛ 720: line 719 didn't jump to line 720, because the condition on line 719 was never true]

720 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

721 

722 # Now do the transfer. 

723 targets: List[ResourcePath] = [] 

724 for number, datastore_refs in grouped_by_datastore.items(): 

725 targets.extend( 

726 self.datastores[number].retrieveArtifacts( 

727 datastore_refs, 

728 destination, 

729 transfer=transfer, 

730 preserve_path=preserve_path, 

731 overwrite=overwrite, 

732 ) 

733 ) 

734 

735 return targets 

736 

737 def remove(self, ref: DatasetRef) -> None: 

738 """Indicate to the datastore that a dataset can be removed. 

739 

740 The dataset will be removed from each datastore. The dataset is 

741 not required to exist in every child datastore. 

742 

743 Parameters 

744 ---------- 

745 ref : `DatasetRef` 

746 Reference to the required dataset. 

747 

748 Raises 

749 ------ 

750 FileNotFoundError 

751 Attempt to remove a dataset that does not exist. Raised if none 

752 of the child datastores removed the dataset. 

753 """ 
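
# Equivalent to trashing the dataset and emptying the trash immediately,
# as implemented below:
#
#     datastore.trash(ref, ignore_errors=False)
#     datastore.emptyTrash(ignore_errors=False)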

754 log.debug("Removing %s", ref) 

755 self.trash(ref, ignore_errors=False) 

756 self.emptyTrash(ignore_errors=False) 

757 

758 def forget(self, refs: Iterable[DatasetRef]) -> None: 

759 for datastore in tuple(self.datastores): 

760 datastore.forget(refs) 

761 

762 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

763 if isinstance(ref, DatasetRef): 

764 ref_label = str(ref) 

765 else: 

766 ref_label = "bulk datasets" 

767 

768 log.debug("Trashing %s", ref_label) 

769 

770 counter = 0 

771 for datastore in self.datastores: 

772 try: 

773 datastore.trash(ref, ignore_errors=ignore_errors) 

774 counter += 1 

775 except FileNotFoundError: 

776 pass 

777 

778 if counter == 0: 

779 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

780 if ignore_errors:  [780 ↛ 781: line 780 didn't jump to line 781, because the condition on line 780 was never true]

781 log.warning(err_msg) 

782 else: 

783 raise FileNotFoundError(err_msg) 

784 

785 def emptyTrash(self, ignore_errors: bool = True) -> None: 

786 for datastore in self.datastores: 

787 datastore.emptyTrash(ignore_errors=ignore_errors) 

788 

789 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

790 """Retrieve a dataset from an input `Datastore`, 

791 and store the result in this `Datastore`. 

792 

793 Parameters 

794 ---------- 

795 inputDatastore : `Datastore` 

796 The external `Datastore` from which to retrieve the Dataset.

797 ref : `DatasetRef` 

798 Reference to the required dataset in the input data store. 

799 

800 Notes

801 -----

802 The dataset is read into memory from ``inputDatastore`` and then

803 written to every child datastore via ``put()``, so the usual

804 ``put()`` constraints and failure modes apply.

805 """ 

806 assert inputDatastore is not self # unless we want it for renames? 

807 inMemoryDataset = inputDatastore.get(ref) 

808 self.put(inMemoryDataset, ref) 

809 

810 def validateConfiguration( 

811 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

812 ) -> None: 

813 """Validate some of the configuration for this datastore. 

814 

815 Parameters 

816 ---------- 

817 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

818 Entities to test against this configuration. Can be differing 

819 types. 

820 logFailures : `bool`, optional 

821 If `True`, output a log message for every validation error 

822 detected. 

823 

824 Raises 

825 ------ 

826 DatastoreValidationError 

827 Raised if there is a validation problem with a configuration. 

828 All the problems are reported in a single exception. 

829 

830 Notes 

831 ----- 

832 This method checks each datastore in turn. 

833 """ 
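
# Usage sketch (entities can mix DatasetRefs, DatasetTypes and
# StorageClasses):
#
#     datastore.validateConfiguration([ref.datasetType], logFailures=True)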

834 

835 # Need to catch each of the datastore outputs and ensure that 

836 # all are tested. 

837 failures = [] 

838 for datastore in self.datastores: 

839 try: 

840 datastore.validateConfiguration(entities, logFailures=logFailures) 

841 except DatastoreValidationError as e: 

842 if logFailures:  [842 ↛ 844: line 842 didn't jump to line 844, because the condition on line 842 was never false]

843 log.critical("Datastore %s failed validation", datastore.name) 

844 failures.append(f"Datastore {self.name}: {e}") 

845 

846 if failures: 

847 msg = ";\n".join(failures) 

848 raise DatastoreValidationError(msg) 

849 

850 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

851 # Docstring is inherited from base class 

852 failures = [] 

853 for datastore in self.datastores: 

854 try: 

855 datastore.validateKey(lookupKey, entity) 

856 except DatastoreValidationError as e: 

857 failures.append(f"Datastore {self.name}: {e}") 

858 

859 if failures: 

860 msg = ";\n".join(failures) 

861 raise DatastoreValidationError(msg) 

862 

863 def getLookupKeys(self) -> Set[LookupKey]: 

864 # Docstring is inherited from base class 

865 keys = set() 

866 for datastore in self.datastores: 

867 keys.update(datastore.getLookupKeys()) 

868 

869 keys.update(self.constraints.getLookupKeys()) 

870 for p in self.datastoreConstraints: 

871 if p is not None:  [871 ↛ 872: line 871 didn't jump to line 872, because the condition on line 871 was never true]

872 keys.update(p.getLookupKeys()) 

873 

874 return keys 

875 

876 def needs_expanded_data_ids( 

877 self, 

878 transfer: Optional[str], 

879 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

880 ) -> bool: 

881 # Docstring inherited. 

882 # We can't safely use `self.datastoreConstraints` with `entity` to 

883 # check whether a child datastore would even want to ingest this 

884 # dataset, because we don't want to filter out datastores that might 

885 # need an expanded data ID based in incomplete information (e.g. we 

886 # pass a StorageClass, but the constraint dispatches on DatasetType). 

887 # So we pessimistically check if any datastore would need an expanded 

888 # data ID for this transfer mode. 

889 return any(datastore.needs_expanded_data_ids(transfer) for datastore in self.datastores)  [889 ↛ exit: line 889 didn't finish the generator expression on line 889]

890 

891 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

892 # Docstring inherited from the base class. 

893 

894 for datastore in self.datastores: 

895 datastore.import_records(data) 

896 

897 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

898 # Docstring inherited from the base class. 

899 

900 all_records: Dict[str, DatastoreRecordData] = {} 

901 

902 # Merge all sub-datastore records into one structure 

903 for datastore in self.datastores: 

904 sub_records = datastore.export_records(refs) 

905 for name, record_data in sub_records.items(): 

906 # All datastore names must be unique in a chain. 

907 if name in all_records:  [907 ↛ 908: line 907 didn't jump to line 908, because the condition on line 907 was never true]

908 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")

909 all_records[name] = record_data 

910 

911 return all_records