Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 93%

408 statements  

coverage.py v7.2.5, created at 2023-05-02 09:49 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Chained datastore.""" 

25 

26__all__ = ("ChainedDatastore",) 

27 

28import itertools 

29import logging 

30import time 

31import warnings 

32from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union 

33 

34from lsst.daf.butler import ( 

35 Constraints, 

36 DatasetRef, 

37 DatasetRefURIs, 

38 DatasetTypeNotSupportedError, 

39 Datastore, 

40 DatastoreConfig, 

41 DatastoreRecordData, 

42 DatastoreValidationError, 

43 FileDataset, 

44) 

45from lsst.resources import ResourcePath 

46from lsst.utils import doImportType 

47 

48if TYPE_CHECKING: 

49 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

51 from lsst.resources import ResourcePathExpression 

52 

53log = logging.getLogger(__name__) 

54 

55 

56class _IngestPrepData(Datastore.IngestPrepData): 

57 """Helper class for ChainedDatastore ingest implementation. 

58 

59 Parameters 

60 ---------- 

61 children : `list` of `tuple` 

62 Pairs of `Datastore`, `IngestPrepData` for all child datastores. 

63 """ 

64 

65 def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

66 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

67 self.children = children 

68 

69 

70class ChainedDatastore(Datastore): 

71 """Chained Datastores to allow read and writes from multiple datastores. 

72 

73 A ChainedDatastore is configured with multiple datastore configurations. 

74 A ``put()`` is always sent to each datastore. A ``get()`` 

75 operation is sent to each datastore in turn and the first datastore 

76 to return a valid dataset is used. 

77 

78 Parameters 

79 ---------- 

80 config : `DatastoreConfig` or `str` 

81 Configuration. This configuration must include a ``datastores`` field 

82 as a sequence of datastore configurations. The order in this sequence 

83 indicates the order to use for read operations. 

84 bridgeManager : `DatastoreRegistryBridgeManager` 

85 Object that manages the interface between `Registry` and datastores. 

86 butlerRoot : `str`, optional 

87 New datastore root to use to override the configuration value. This 

88 root is sent to each child datastore. 

89 

90 Notes 

91 ----- 

92 ChainedDatastore never supports `None` as an `ingest` transfer

93 mode. It supports `"copy"`, `"move"`, `"symlink"`, `"relsymlink"`

94 and `"hardlink"` if and only if all its child datastores do.

95 """ 

96 

97 defaultConfigFile = "datastores/chainedDatastore.yaml" 

98 """Path to configuration defaults. Accessed within the ``configs`` resource 

99 or relative to a search path. Can be None if no defaults specified. 

100 """ 

101 

102 containerKey = "datastores" 

103 """Key to specify where child datastores are configured.""" 

104 

105 datastores: List[Datastore] 

106 """All the child datastores known to this datastore.""" 

107 

108 datastoreConstraints: Sequence[Optional[Constraints]] 

109 """Constraints to be applied to each of the child datastores.""" 

110 

111 @classmethod 

112 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

113 """Set any filesystem-dependent config options for child Datastores to 

114 be appropriate for a new empty repository with the given root. 

115 

116 Parameters 

117 ---------- 

118 root : `str` 

119 Filesystem path to the root of the data repository. 

120 config : `Config` 

121 A `Config` to update. Only the subset understood by 

122 this component will be updated. Will not expand 

123 defaults. 

124 full : `Config` 

125 A complete config with all defaults expanded that can be 

126 converted to a `DatastoreConfig`. Read-only and will not be 

127 modified by this method. 

128 Repository-specific options that should not be obtained 

129 from defaults when Butler instances are constructed 

130 should be copied from ``full`` to ``config``. 

131 overwrite : `bool`, optional 

132 If `False`, do not modify a value in ``config`` if the value 

133 already exists. Default is always to overwrite with the provided 

134 ``root``. 

135 

136 Notes 

137 ----- 

138 If a keyword is explicitly defined in the supplied ``config`` it 

139 will not be overridden by this method if ``overwrite`` is `False`. 

140 This allows explicit values set in external configs to be retained. 

141 """ 

142 

143 # Extract the part of the config we care about updating 

144 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

145 

146 # And the subset of the full config that we can use for reference. 

147 # Do not bother with defaults because we are told this already has 

148 # them. 

149 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

150 

151 # Loop over each datastore config and pass the subsets to the 

152 # child datastores to process. 

153 

154 containerKey = cls.containerKey 

155 for idx, (child, fullChild) in enumerate( 

156 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey]) 

157 ): 

158 childConfig = DatastoreConfig(child, mergeDefaults=False) 

159 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

160 datastoreClass = doImportType(fullChildConfig["cls"]) 

161 if not issubclass(datastoreClass, Datastore):    161 ↛ 162 (condition on line 161 was never true)

162 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

163 newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx) 

164 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

165 

166 # Reattach to parent 

167 datastoreConfig[containerKey, idx] = childConfig 

168 

169 # Reattach modified datastore config to parent 

170 # If this has a datastore key we attach there, otherwise we assume 

171 # this information goes at the top of the config hierarchy. 

172 if DatastoreConfig.component in config: 

173 config[DatastoreConfig.component] = datastoreConfig 

174 else: 

175 config.update(datastoreConfig) 

176 

177 return 

178 

179 def __init__( 

180 self, 

181 config: Union[Config, str], 

182 bridgeManager: DatastoreRegistryBridgeManager, 

183 butlerRoot: str | None = None, 

184 ): 

185 super().__init__(config, bridgeManager) 

186 

187 # Scan for child datastores and instantiate them with the same registry 

188 self.datastores = [] 

189 for c in self.config["datastores"]: 

190 c = DatastoreConfig(c) 

191 datastoreType = doImportType(c["cls"]) 

192 if not issubclass(datastoreType, Datastore):    192 ↛ 193 (condition on line 192 was never true)

193 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

194 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

195 log.debug("Creating child datastore %s", datastore.name) 

196 self.datastores.append(datastore) 

197 

198 # Name ourself based on our children 

199 if self.datastores:    199 ↛ 204 (condition on line 199 was never false)

200 # We must set the names explicitly 

201 self._names = [d.name for d in self.datastores] 

202 childNames = ",".join(self.names) 

203 else: 

204 childNames = "(empty@{})".format(time.time()) 

205 self._names = [childNames] 

206 self.name = "{}[{}]".format(type(self).__qualname__, childNames) 

207 

208 # We declare we are ephemeral if all our child datastores declare 

209 # they are ephemeral 

210 isEphemeral = True 

211 for d in self.datastores: 

212 if not d.isEphemeral: 

213 isEphemeral = False 

214 break 

215 self.isEphemeral = isEphemeral 

216 

217 # per-datastore override constraints 

218 if "datastore_constraints" in self.config: 

219 overrides = self.config["datastore_constraints"] 

220 

221 if len(overrides) != len(self.datastores):    221 ↛ 222 (condition on line 221 was never true)

222 raise DatastoreValidationError( 

223 f"Number of registered datastores ({len(self.datastores)})" 

224 " differs from number of constraints overrides" 

225 f" {len(overrides)}" 

226 ) 

227 

228 self.datastoreConstraints = [ 

229 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

230 ] 

231 

232 else: 

233 self.datastoreConstraints = (None,) * len(self.datastores) 

234 

235 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

236 

237 @property 

238 def names(self) -> Tuple[str, ...]: 

239 return tuple(self._names) 

240 

241 def __str__(self) -> str: 

242 chainName = ", ".join(str(ds) for ds in self.datastores) 

243 return chainName 

244 

245 def knows(self, ref: DatasetRef) -> bool: 

246 """Check if the dataset is known to any of the datastores. 

247 

248 Does not check for existence of any artifact. 

249 

250 Parameters 

251 ---------- 

252 ref : `DatasetRef` 

253 Reference to the required dataset. 

254 

255 Returns 

256 ------- 

257 exists : `bool` 

258 `True` if the dataset is known to the datastore. 

259 """ 

260 for datastore in self.datastores: 

261 if datastore.knows(ref): 

262 log.debug("%s known to datastore %s", ref, datastore.name) 

263 return True 

264 return False 

265 

266 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

267 # Docstring inherited from the base class. 

268 refs_known: dict[DatasetRef, bool] = {} 

269 for datastore in self.datastores: 

270 refs_known.update(datastore.knows_these(refs)) 

271 

272 # No need to check in next datastore for refs that are known. 

273 # We only update entries that were initially False. 

274 refs = [ref for ref, known in refs_known.items() if not known] 

275 

276 return refs_known 

277 

278 def mexists( 

279 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

280 ) -> Dict[DatasetRef, bool]: 

281 """Check the existence of multiple datasets at once. 

282 

283 Parameters 

284 ---------- 

285 refs : iterable of `DatasetRef` 

286 The datasets to be checked. 

287 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

288 Optional mapping of datastore artifact to existence. Updated by 

289 this method with details of all artifacts tested. Can be `None` 

290 if the caller is not interested. 

291 

292 Returns 

293 ------- 

294 existence : `dict` of [`DatasetRef`, `bool`] 

295 Mapping from dataset to boolean indicating existence in any 

296 of the child datastores. 

297 """ 

298 dataset_existence: Dict[DatasetRef, bool] = {} 

299 for datastore in self.datastores: 

300 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

301 

302 # For next datastore no point asking about ones we know 

303 # exist already. No special exemption for ephemeral datastores. 

304 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

305 

306 return dataset_existence 

307 
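
Both `knows_these` and `mexists` above use the same narrowing pattern: query every child, then ask the next child only about the refs that are still unresolved. A minimal sketch of that pattern under simplified assumptions (plain string refs, and a hypothetical `check_in_chain` helper):

    from __future__ import annotations

    from typing import Callable, Iterable


    def check_in_chain(
        refs: Iterable[str],
        checkers: list[Callable[[Iterable[str]], dict[str, bool]]],
    ) -> dict[str, bool]:
        """Ask each checker in turn, narrowing the query to unresolved refs."""
        results: dict[str, bool] = {}
        pending: Iterable[str] = refs
        for check in checkers:
            results.update(check(pending))
            # Only refs that are still False are passed to the next checker.
            pending = [ref for ref, known in results.items() if not known]
        return results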

308 def exists(self, ref: DatasetRef) -> bool: 

309 """Check if the dataset exists in one of the datastores. 

310 

311 Parameters 

312 ---------- 

313 ref : `DatasetRef` 

314 Reference to the required dataset. 

315 

316 Returns 

317 ------- 

318 exists : `bool` 

319 `True` if the entity exists in one of the child datastores. 

320 """ 

321 for datastore in self.datastores: 

322 if datastore.exists(ref): 

323 log.debug("Found %s in datastore %s", ref, datastore.name) 

324 return True 

325 return False 

326 

327 def get( 

328 self, 

329 ref: DatasetRef, 

330 parameters: Optional[Mapping[str, Any]] = None, 

331 storageClass: Optional[Union[StorageClass, str]] = None, 

332 ) -> Any: 

333 """Load an InMemoryDataset from the store. 

334 

335 The dataset is returned from the first datastore that has 

336 the dataset. 

337 

338 Parameters 

339 ---------- 

340 ref : `DatasetRef` 

341 Reference to the required Dataset. 

342 parameters : `dict` 

343 `StorageClass`-specific parameters that specify, for example, 

344 a slice of the dataset to be loaded. 

345 storageClass : `StorageClass` or `str`, optional 

346 The storage class to be used to override the Python type 

347 returned by this method. By default the returned type matches 

348 the dataset type definition for this dataset. Specifying a 

349 read `StorageClass` can force a different type to be returned. 

350 This type must be compatible with the original type. 

351 

352 Returns 

353 ------- 

354 inMemoryDataset : `object` 

355 Requested dataset or slice thereof as an InMemoryDataset. 

356 

357 Raises 

358 ------ 

359 FileNotFoundError 

360 Requested dataset can not be retrieved. 

361 TypeError 

362 Return value from formatter has unexpected type. 

363 ValueError 

364 Formatter failed to process the dataset. 

365 """ 

366 

367 for datastore in self.datastores: 

368 try: 

369 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

370 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

371 return inMemoryObject 

372 except FileNotFoundError: 

373 pass 

374 

375 raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref)) 

376 

377 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

378 """Write a InMemoryDataset with a given `DatasetRef` to each 

379 datastore. 

380 

381 The put() to child datastores can fail with 

382 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

383 deemed to have succeeded so long as at least one child datastore 

384 accepted the inMemoryDataset. 

385 

386 Parameters 

387 ---------- 

388 inMemoryDataset : `object` 

389 The dataset to store. 

390 ref : `DatasetRef` 

391 Reference to the associated Dataset. 

392 

393 Raises 

394 ------ 

395 TypeError 

396 Supplied object and storage class are inconsistent. 

397 DatasetTypeNotSupportedError 

398 All datastores reported `DatasetTypeNotSupportedError`. 

399 """ 

400 log.debug("Put %s", ref) 

401 

402 # Confirm that we can accept this dataset 

403 if not self.constraints.isAcceptable(ref): 

404 # Raise rather than use boolean return value. 

405 raise DatasetTypeNotSupportedError( 

406 f"Dataset {ref} has been rejected by this datastore via configuration." 

407 ) 

408 

409 isPermanent = False 

410 nsuccess = 0 

411 npermanent = 0 

412 nephemeral = 0 

413 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

414 if constraints is not None and not constraints.isAcceptable(ref): 

415 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

416 continue 

417 

418 if datastore.isEphemeral: 

419 nephemeral += 1 

420 else: 

421 npermanent += 1 

422 try: 

423 datastore.put(inMemoryDataset, ref) 

424 nsuccess += 1 

425 if not datastore.isEphemeral: 

426 isPermanent = True 

427 except DatasetTypeNotSupportedError: 

428 pass 

429 

430 if nsuccess == 0: 

431 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

432 

433 if not isPermanent and npermanent > 0:    433 ↛ 434 (condition on line 433 was never true)

434 warnings.warn(f"Put of {ref} only succeeded in ephemeral databases", stacklevel=2) 

435 

436 if self._transaction is not None: 

437 self._transaction.registerUndo("put", self.remove, ref) 

438 

439 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]: 

440 # Docstring inherited from base class. 

441 if transfer != "auto": 

442 return transfer 

443 # Ask each datastore what they think auto means 

444 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

445 

446 # Remove any untranslated "auto" values 

447 transfers.discard(transfer) 

448 

449 if len(transfers) == 1:    449 ↛ 450 (condition on line 449 was never true)

450 return transfers.pop() 

451 if not transfers:    451 ↛ 455 (condition on line 451 was never false)

452 # Everything reported "auto" 

453 return transfer 

454 

455 raise RuntimeError( 

456 "Chained datastore does not yet support different transfer modes" 

457 f" from 'auto' in each child datastore (wanted {transfers})" 

458 ) 

459 
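
The agreement check in `_overrideTransferMode` can be summarised in isolation. The sketch below uses a hypothetical `resolve_auto_transfer` helper operating on the set of per-child answers; only the names are invented, the logic mirrors the method above.

    from __future__ import annotations


    def resolve_auto_transfer(child_answers: set[str | None]) -> str | None:
        """Resolve per-child interpretations of transfer="auto".

        Entries that are still "auto" mean the child deferred the decision.
        """
        concrete = {answer for answer in child_answers if answer != "auto"}
        if len(concrete) == 1:
            return concrete.pop()
        if not concrete:
            # Every child deferred, so keep "auto".
            return "auto"
        raise RuntimeError(f"Children disagree on transfer mode: {concrete}")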

460 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

461 # Docstring inherited from Datastore._prepIngest. 

462 if transfer is None: 

463 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

464 

465 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

466 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

467 if not acceptable: 

468 log.debug( 

469 "Datastore %s skipping ingest via configuration for refs %s", 

470 name, 

471 ", ".join(str(ref) for ref in dataset.refs), 

472 ) 

473 return False 

474 else: 

475 return True 

476 

477 # Filter down to just datasets the chained datastore's own 

478 # configuration accepts. 

479 okForParent: List[FileDataset] = [ 

480 dataset 

481 for dataset in datasets 

482 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

483 ] 

484 

485 # Iterate over nested datastores and call _prepIngest on each. 

486 # Save the results to a list: 

487 children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

488 # ...and remember whether all of the failures are due to 

489 # NotImplementedError being raised. 

490 allFailuresAreNotImplementedError = True 

491 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

492 okForChild: List[FileDataset] 

493 if constraints is not None: 

494 okForChild = [ 

495 dataset 

496 for dataset in okForParent 

497 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

498 ] 

499 else: 

500 okForChild = okForParent 

501 try: 

502 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

503 except NotImplementedError: 

504 log.debug( 

505 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

506 datastore.name, 

507 transfer, 

508 ) 

509 continue 

510 allFailuresAreNotImplementedError = False 

511 if okForChild: 

512 # Do not store for later if a datastore has rejected 

513 # everything. 

514 # Include the source paths if this is a "move". It's clearer 

515 # to find the paths now rather than try to infer how 

516 # each datastore has stored them in the internal prep class. 

517 paths = ( 

518 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set() 

519 ) 

520 children.append((datastore, prepDataForChild, paths)) 

521 if allFailuresAreNotImplementedError: 

522 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

523 return _IngestPrepData(children=children) 

524 

525 def _finishIngest( 

526 self, 

527 prepData: _IngestPrepData, 

528 *, 

529 transfer: Optional[str] = None, 

530 record_validation_info: bool = True, 

531 ) -> None: 

532 # Docstring inherited from Datastore._finishIngest. 

533 # For "move" we must use "copy" and then delete the input 

534 # data at the end. This has no rollback option if the ingest 

535 # subsequently fails. If there is only one active datastore 

536 # accepting any files we can leave it as "move" 

537 actual_transfer: str | None 

538 if transfer == "move" and len(prepData.children) > 1: 

539 actual_transfer = "copy" 

540 else: 

541 actual_transfer = transfer 

542 to_be_deleted: set[ResourcePath] = set() 

543 for datastore, prepDataForChild, paths in prepData.children: 

544 datastore._finishIngest( 

545 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

546 ) 

547 to_be_deleted.update(paths) 

548 if actual_transfer != transfer: 

549 # These datasets were copied but now need to be deleted. 

550 # This can not be rolled back. 

551 for uri in to_be_deleted: 

552 uri.remove() 

553 
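
The downgrade of "move" to "copy" when several children ingest the same files, described in the comments above, amounts to a small decision plus a deferred, non-rollbackable cleanup. A sketch of just that decision, with hypothetical names:

    from __future__ import annotations


    def effective_child_transfer(requested: str | None, n_children: int) -> str | None:
        """Choose the transfer mode each child datastore should be given.

        A "move" can only happen once, so with more than one ingesting child
        each child receives a "copy" and the caller deletes the source files
        afterwards; that deletion cannot be rolled back.
        """
        if requested == "move" and n_children > 1:
            return "copy"
        return requested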

554 def getManyURIs( 

555 self, 

556 refs: Iterable[DatasetRef], 

557 predict: bool = False, 

558 allow_missing: bool = False, 

559 ) -> Dict[DatasetRef, DatasetRefURIs]: 

560 # Docstring inherited 

561 

562 uris: Dict[DatasetRef, DatasetRefURIs] = {} 

563 missing_refs = set(refs) 

564 

565 # If predict is True we don't want to predict a dataset in the first 

566 # datastore if it actually exists in a later datastore, so in that 

567 # case check all datastores with predict=False first, and then try 

568 # again with predict=True. 

569 for p in (False, True) if predict else (False,): 

570 if not missing_refs: 

571 break 

572 for datastore in self.datastores: 

573 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

574 missing_refs -= got_uris.keys() 

575 uris.update(got_uris) 

576 if not missing_refs: 

577 break 

578 

579 if missing_refs and not allow_missing: 

580 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

581 

582 return uris 

583 
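
The two-pass loop above (all children with predict=False first, then again with predict=True) keeps a prediction from shadowing a dataset that really exists in a later datastore. A standalone sketch, where `lookups` is a hypothetical list of per-child callables returning a mapping of ref to URI:

    from __future__ import annotations

    from typing import Callable, Iterable


    def find_uris_two_pass(
        refs: Iterable[str],
        lookups: list[Callable[[set[str], bool], dict[str, str]]],
        predict: bool,
    ) -> tuple[dict[str, str], set[str]]:
        """Resolve URIs without letting prediction shadow real datasets."""
        found: dict[str, str] = {}
        missing = set(refs)
        for use_predict in (False, True) if predict else (False,):
            if not missing:
                break
            for lookup in lookups:
                got = lookup(missing, use_predict)
                found.update(got)
                missing -= got.keys()
                if not missing:
                    break
        return found, missing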

584 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

585 """Return URIs associated with dataset. 

586 

587 Parameters 

588 ---------- 

589 ref : `DatasetRef` 

590 Reference to the required dataset. 

591 predict : `bool`, optional 

592 If the datastore does not know about the dataset, should it 

593 return a predicted URI or not? 

594 

595 Returns 

596 ------- 

597 uris : `DatasetRefURIs` 

598 The URI to the primary artifact associated with this dataset (if 

599 the dataset was disassembled within the datastore this may be 

600 `None`), and the URIs to any components associated with the dataset 

601 artifact (can be empty if there are no components). 

602 

603 Notes 

604 ----- 

605 The returned URI is from the first datastore in the list that has 

606 the dataset with preference given to the first dataset coming from 

607 a permanent datastore. If no datastores have the dataset and prediction 

608 is allowed, the predicted URI for the first datastore in the list will 

609 be returned. 

610 """ 

611 log.debug("Requesting URIs for %s", ref) 

612 predictedUri: Optional[DatasetRefURIs] = None 

613 predictedEphemeralUri: Optional[DatasetRefURIs] = None 

614 firstEphemeralUri: Optional[DatasetRefURIs] = None 

615 for datastore in self.datastores: 

616 if datastore.exists(ref): 

617 if not datastore.isEphemeral: 

618 uri = datastore.getURIs(ref) 

619 log.debug("Retrieved non-ephemeral URI: %s", uri) 

620 return uri 

621 elif not firstEphemeralUri: 

622 firstEphemeralUri = datastore.getURIs(ref) 

623 elif predict: 

624 if not predictedUri and not datastore.isEphemeral: 

625 predictedUri = datastore.getURIs(ref, predict) 

626 elif not predictedEphemeralUri and datastore.isEphemeral: 

627 predictedEphemeralUri = datastore.getURIs(ref, predict) 

628 

629 if firstEphemeralUri: 

630 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

631 return firstEphemeralUri 

632 

633 if predictedUri: 

634 log.debug("Retrieved predicted URI: %s", predictedUri) 

635 return predictedUri 

636 

637 if predictedEphemeralUri: 

638 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

639 return predictedEphemeralUri 

640 

641 raise FileNotFoundError("Dataset {} not in any datastore".format(ref)) 

642 

643 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

644 """URI to the Dataset. 

645 

646 The returned URI is from the first datastore in the list that has 

647 the dataset with preference given to the first dataset coming from 

648 a permanent datastore. If no datastores have the dataset and prediction 

649 is allowed, the predicted URI for the first datastore in the list will 

650 be returned. 

651 

652 Parameters 

653 ---------- 

654 ref : `DatasetRef` 

655 Reference to the required Dataset. 

656 predict : `bool` 

657 If `True`, allow URIs to be returned of datasets that have not 

658 been written. 

659 

660 Returns 

661 ------- 

662 uri : `lsst.resources.ResourcePath` 

663 URI pointing to the dataset within the datastore. If the 

664 dataset does not exist in the datastore, and if ``predict`` is 

665 `True`, the URI will be a prediction and will include a URI 

666 fragment "#predicted". 

667 

668 Notes 

669 ----- 

670 If the datastore does not have entities that relate well 

671 to the concept of a URI, the returned URI string will be 

672 descriptive. The returned URI is not guaranteed to be obtainable. 

673 

674 Raises 

675 ------ 

676 FileNotFoundError 

677 A URI has been requested for a dataset that does not exist and 

678 guessing is not allowed. 

679 RuntimeError 

680 Raised if a request is made for a single URI but multiple URIs 

681 are associated with this dataset. 

682 """ 

683 log.debug("Requesting URI for %s", ref) 

684 primary, components = self.getURIs(ref, predict) 

685 if primary is None or components:    685 ↛ 686 (condition on line 685 was never true)

686 raise RuntimeError( 

687 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

688 ) 

689 return primary 

690 

691 def retrieveArtifacts( 

692 self, 

693 refs: Iterable[DatasetRef], 

694 destination: ResourcePath, 

695 transfer: str = "auto", 

696 preserve_path: bool = True, 

697 overwrite: bool = False, 

698 ) -> List[ResourcePath]: 

699 """Retrieve the file artifacts associated with the supplied refs. 

700 

701 Parameters 

702 ---------- 

703 refs : iterable of `DatasetRef` 

704 The datasets for which file artifacts are to be retrieved. 

705 A single ref can result in multiple files. The refs must 

706 be resolved. 

707 destination : `lsst.resources.ResourcePath` 

708 Location to write the file artifacts. 

709 transfer : `str`, optional 

710 Method to use to transfer the artifacts. Must be one of the options 

711 supported by `lsst.resources.ResourcePath.transfer_from()`. 

712 "move" is not allowed. 

713 preserve_path : `bool`, optional 

714 If `True` the full path of the file artifact within the datastore 

715 is preserved. If `False` the final file component of the path 

716 is used. 

717 overwrite : `bool`, optional 

718 If `True` allow transfers to overwrite existing files at the 

719 destination. 

720 

721 Returns 

722 ------- 

723 targets : `list` of `lsst.resources.ResourcePath` 

724 URIs of file artifacts in destination location. Order is not 

725 preserved. 

726 """ 

727 if not destination.isdir():    727 ↛ 728 (condition on line 727 was never true)

728 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

729 

730 # Using getURIs is not feasible since it becomes difficult to 

731 # determine the path within the datastore later on. For now 

732 # follow getURIs implementation approach. 

733 

734 pending = set(refs) 

735 

736 # There is a question as to whether an exception should be raised 

737 # early if some of the refs are missing, or whether files should be 

738 # transferred until a problem is hit. Prefer to complain up front. 

739 # Use the datastore integer as primary key. 

740 grouped_by_datastore: Dict[int, Set[DatasetRef]] = {} 

741 

742 for number, datastore in enumerate(self.datastores): 

743 if datastore.isEphemeral: 

744 # In the future we will want to distinguish in-memory from 

745 # caching datastore since using an on-disk local 

746 # cache is exactly what we should be doing. 

747 continue 

748 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

749 

750 if datastore_refs: 

751 grouped_by_datastore[number] = datastore_refs 

752 

753 # Remove these from the pending list so that we do not bother 

754 # looking for them any more. 

755 pending = pending - datastore_refs 

756 

757 if pending:    757 ↛ 758 (condition on line 757 was never true)

758 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

759 

760 # Now do the transfer. 

761 targets: List[ResourcePath] = [] 

762 for number, datastore_refs in grouped_by_datastore.items(): 

763 targets.extend( 

764 self.datastores[number].retrieveArtifacts( 

765 datastore_refs, 

766 destination, 

767 transfer=transfer, 

768 preserve_path=preserve_path, 

769 overwrite=overwrite, 

770 ) 

771 ) 

772 

773 return targets 

774 

775 def remove(self, ref: DatasetRef) -> None: 

776 """Indicate to the datastore that a dataset can be removed. 

777 

778 The dataset will be removed from each datastore. The dataset is 

779 not required to exist in every child datastore. 

780 

781 Parameters 

782 ---------- 

783 ref : `DatasetRef` 

784 Reference to the required dataset. 

785 

786 Raises 

787 ------ 

788 FileNotFoundError 

789 Attempt to remove a dataset that does not exist. Raised if none 

790 of the child datastores removed the dataset. 

791 """ 

792 log.debug("Removing %s", ref) 

793 self.trash(ref, ignore_errors=False) 

794 self.emptyTrash(ignore_errors=False) 

795 

796 def forget(self, refs: Iterable[DatasetRef]) -> None: 

797 for datastore in tuple(self.datastores): 

798 datastore.forget(refs) 

799 

800 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

801 if isinstance(ref, DatasetRef): 

802 ref_label = str(ref) 

803 else: 

804 ref_label = "bulk datasets" 

805 

806 log.debug("Trashing %s", ref_label) 

807 

808 counter = 0 

809 for datastore in self.datastores: 

810 try: 

811 datastore.trash(ref, ignore_errors=ignore_errors) 

812 counter += 1 

813 except FileNotFoundError: 

814 pass 

815 

816 if counter == 0: 

817 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

818 if ignore_errors:    818 ↛ 819 (condition on line 818 was never true)

819 log.warning(err_msg) 

820 else: 

821 raise FileNotFoundError(err_msg) 

822 

823 def emptyTrash(self, ignore_errors: bool = True) -> None: 

824 for datastore in self.datastores: 

825 datastore.emptyTrash(ignore_errors=ignore_errors) 

826 

827 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

828 """Retrieve a dataset from an input `Datastore`, 

829 and store the result in this `Datastore`. 

830 

831 Parameters 

832 ---------- 

833 inputDatastore : `Datastore` 

834 The external `Datastore` from which to retrieve the Dataset. 

835 ref : `DatasetRef` 

836 Reference to the required dataset in the input data store. 

837 


843 """ 

844 assert inputDatastore is not self # unless we want it for renames? 

845 inMemoryDataset = inputDatastore.get(ref) 

846 self.put(inMemoryDataset, ref) 

847 

848 def validateConfiguration( 

849 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

850 ) -> None: 

851 """Validate some of the configuration for this datastore. 

852 

853 Parameters 

854 ---------- 

855 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

856 Entities to test against this configuration. Can be differing 

857 types. 

858 logFailures : `bool`, optional 

859 If `True`, output a log message for every validation error 

860 detected. 

861 

862 Raises 

863 ------ 

864 DatastoreValidationError 

865 Raised if there is a validation problem with a configuration. 

866 All the problems are reported in a single exception. 

867 

868 Notes 

869 ----- 

870 This method checks each datastore in turn. 

871 """ 

872 

873 # Need to catch each of the datastore outputs and ensure that 

874 # all are tested. 

875 failures = [] 

876 for datastore in self.datastores: 

877 try: 

878 datastore.validateConfiguration(entities, logFailures=logFailures) 

879 except DatastoreValidationError as e: 

880 if logFailures:    880 ↛ 882 (condition on line 880 was never false)

881 log.critical("Datastore %s failed validation", datastore.name) 

882 failures.append(f"Datastore {self.name}: {e}") 

883 

884 if failures: 

885 msg = ";\n".join(failures) 

886 raise DatastoreValidationError(msg) 

887 

888 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

889 # Docstring is inherited from base class 

890 failures = [] 

891 for datastore in self.datastores: 

892 try: 

893 datastore.validateKey(lookupKey, entity) 

894 except DatastoreValidationError as e: 

895 failures.append(f"Datastore {self.name}: {e}") 

896 

897 if failures: 

898 msg = ";\n".join(failures) 

899 raise DatastoreValidationError(msg) 

900 

901 def getLookupKeys(self) -> Set[LookupKey]: 

902 # Docstring is inherited from base class 

903 keys = set() 

904 for datastore in self.datastores: 

905 keys.update(datastore.getLookupKeys()) 

906 

907 keys.update(self.constraints.getLookupKeys()) 

908 for p in self.datastoreConstraints: 

909 if p is not None:    909 ↛ 908 (condition on line 909 was never false)

910 keys.update(p.getLookupKeys()) 

911 

912 return keys 

913 

914 def needs_expanded_data_ids( 

915 self, 

916 transfer: Optional[str], 

917 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

918 ) -> bool: 

919 # Docstring inherited. 

920 # We can't safely use `self.datastoreConstraints` with `entity` to 

921 # check whether a child datastore would even want to ingest this 

922 # dataset, because we don't want to filter out datastores that might 

923 need an expanded data ID based on incomplete information (e.g. we 

924 # pass a StorageClass, but the constraint dispatches on DatasetType). 

925 # So we pessimistically check if any datastore would need an expanded 

926 # data ID for this transfer mode. 

927 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

928 

929 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

930 # Docstring inherited from the base class. 

931 

932 for datastore in self.datastores: 

933 datastore.import_records(data) 

934 

935 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

936 # Docstring inherited from the base class. 

937 

938 all_records: Dict[str, DatastoreRecordData] = {} 

939 

940 # Merge all sub-datastore records into one structure 

941 for datastore in self.datastores: 

942 sub_records = datastore.export_records(refs) 

943 for name, record_data in sub_records.items(): 

944 # All datastore names must be unique in a chain. 

945 if name in all_records:    945 ↛ 946 (condition on line 945 was never true)

946 raise ValueError("Non-unique datastore name found in datastore {datastore}") 

947 all_records[name] = record_data 

948 

949 return all_records 

950 

951 def export( 

952 self, 

953 refs: Iterable[DatasetRef], 

954 *, 

955 directory: Optional[ResourcePathExpression] = None, 

956 transfer: Optional[str] = "auto", 

957 ) -> Iterable[FileDataset]: 

958 # Docstring inherited from Datastore.export. 

959 if transfer == "auto" and directory is None: 

960 transfer = None 

961 

962 if transfer is not None and directory is None: 

963 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

964 

965 if transfer == "move": 

966 raise TypeError("Can not export by moving files out of datastore.") 

967 

968 # Exporting from a chain has the potential for a dataset to be 

969 # in one or more of the datastores in the chain. We only need one 

970 # of them since we assume the datasets are the same in all (but 

971 # the file format could be different of course since that is a 

972 # per-datastore configuration). 

973 # We also do not know whether any of the datastores in the chain 

974 # support file export. 

975 

976 # Ensure we have an ordered sequence that is not an iterator or set. 

977 if not isinstance(refs, Sequence): 

978 refs = list(refs) 

979 

980 # If any of the datasets are missing entirely we need to raise early 

981 # before we try to run the export. This can be a little messy but is 

982 better than exporting files from the first datastore only to find 

983 that a dataset missing from it is absent from the second datastore as well. 

984 known = [datastore.knows_these(refs) for datastore in self.datastores] 

985 refs_known: set[DatasetRef] = set() 

986 for known_to_this in known: 

987 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

988 missing_count = len(refs) - len(refs_known) 

989 if missing_count: 

990 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

991 

992 # To allow us to slot each result into the right place after 

993 # asking each datastore, create a dict with the index. 

994 ref_positions = {ref: i for i, ref in enumerate(refs)} 

995 

996 # Presize the final export list. 

997 exported: list[FileDataset | None] = [None] * len(refs) 

998 

999 # The order of the returned datasets has to match the order of the 

1000 # given refs, even if they are all from different datastores. 

1001 for i, datastore in enumerate(self.datastores): 

1002 known_to_this = known[i] 

1003 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1004 

1005 try: 

1006 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1007 except NotImplementedError: 

1008 # Try the next datastore. 

1009 continue 

1010 

1011 for ref, export in zip(filtered, this_export): 

1012 # Get the position and also delete it from the list. 

1013 exported[ref_positions.pop(ref)] = export 

1014 

1015 # Every dataset should be accounted for because of the earlier checks 

1016 # but make sure that we did fill all the slots to appease mypy. 

1017 for i, dataset in enumerate(exported): 

1018 if dataset is None:    1018 ↛ 1019 (condition on line 1018 was never true)

1019 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1020 yield dataset 

1021 
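
The position bookkeeping in `export` above (a presized output list plus a ref-to-index map) is what lets the results come back in the caller's order even when different children provide different files. A sketch of just that merging step, with hypothetical names:

    from __future__ import annotations

    from typing import Mapping, Sequence


    def merge_in_input_order(
        refs: Sequence[str],
        per_child_results: Sequence[Mapping[str, object]],
    ) -> list[object]:
        """Merge per-child results back into the order of ``refs``.

        Each ref is assumed to be answered by exactly one child mapping.
        """
        positions = {ref: index for index, ref in enumerate(refs)}
        merged: list[object] = [None] * len(refs)
        for results in per_child_results:
            for ref, value in results.items():
                # pop() ensures a ref can only be slotted once.
                merged[positions.pop(ref)] = value
        if positions:
            raise LookupError(f"No child provided results for: {set(positions)}")
        return merged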

1022 def transfer_from( 

1023 self, 

1024 source_datastore: Datastore, 

1025 refs: Iterable[DatasetRef], 

1026 transfer: str = "auto", 

1027 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1028 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1029 # Docstring inherited 

1030 # mypy does not understand "type(self) is not type(source)" 

1031 if isinstance(source_datastore, ChainedDatastore): 

1032 # Both the source and destination are chained datastores. 

1033 source_datastores = tuple(source_datastore.datastores) 

1034 else: 

1035 # The source datastore is different, forward everything to the 

1036 # child datastores. 

1037 source_datastores = tuple([source_datastore]) 

1038 

1039 # Need to know the set of all possible refs that could be transferred. 

1040 remaining_refs = set(refs) 

1041 

1042 missing_from_source: set[DatasetRef] | None = None 

1043 all_accepted = set() 

1044 nsuccess = 0 

1045 for source_child in source_datastores: 

1046 # If we are reading from a chained datastore, it's possible that 

1047 # only a subset of the datastores know about the dataset. We can't 

1048 # ask the receiving datastore to copy it when it doesn't exist 

1049 # so we have to filter again based on what the source datastore 

1050 # understands. 

1051 known_to_source = source_child.knows_these([ref for ref in refs]) 

1052 

1053 # Need to know that there is a possibility that some of these 

1054 # datasets exist but are unknown to the source datastore if 

1055 # trust is enabled. 

1056 if getattr(source_child, "trustGetRequest", False): 

1057 unknown = [ref for ref, known in known_to_source.items() if not known] 

1058 existence = source_child.mexists(unknown, artifact_existence) 

1059 for ref, exists in existence.items(): 

1060 known_to_source[ref] = exists 

1061 

1062 missing = {ref for ref, known in known_to_source.items() if not known} 

1063 if missing: 

1064 if missing_from_source is None: 

1065 missing_from_source = missing 

1066 else: 

1067 missing_from_source &= missing 

1068 

1069 # Try to transfer from each source datastore to each child 

1070 # datastore. Have to make sure we don't transfer something 

1071 # we've already transferred to this destination on later passes. 

1072 

1073 # Filter the initial list based on the datasets we have 

1074 # not yet transferred. 

1075 these_refs = [] 

1076 for ref in refs: 

1077 if ref in remaining_refs and known_to_source[ref]: 

1078 these_refs.append(ref) 

1079 

1080 if not these_refs: 

1081 # Already transferred all datasets known to this datastore. 

1082 continue 

1083 

1084 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

1085 if constraints is not None:    1085 ↛ 1093 (condition on line 1085 was never false)

1086 filtered_refs = [] 

1087 for ref in these_refs: 

1088 if constraints.isAcceptable(ref): 

1089 filtered_refs.append(ref) 

1090 else: 

1091 log.debug("Rejecting ref by constraints: %s", ref) 

1092 else: 

1093 filtered_refs = [ref for ref in these_refs] 

1094 try: 

1095 accepted, _ = datastore.transfer_from( 

1096 source_child, filtered_refs, transfer, artifact_existence 

1097 ) 

1098 except (TypeError, NotImplementedError): 

1099 # The datastores were incompatible. 

1100 continue 

1101 else: 

1102 nsuccess += 1 

1103 

1104 # Remove the accepted datasets from those remaining. 

1105 remaining_refs = remaining_refs - accepted 

1106 

1107 # Keep track of everything we have accepted. 

1108 all_accepted.update(accepted) 

1109 

1110 if missing_from_source: 

1111 for ref in missing_from_source: 

1112 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1113 

1114 if nsuccess == 0:    1114 ↛ 1115 (condition on line 1114 was never true)

1115 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1116 

1117 return all_accepted, remaining_refs