Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 86%

467 statements  

coverage.py v7.4.1, created at 2024-02-01 11:19 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Chained datastore.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("ChainedDatastore",) 

33 

34import itertools 

35import logging 

36import time 

37import warnings 

38from collections.abc import Collection, Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any 

40 

41from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, FileDataset 

42from lsst.daf.butler.datastore import ( 

43 DatasetRefURIs, 

44 Datastore, 

45 DatastoreConfig, 

46 DatastoreOpaqueTable, 

47 DatastoreValidationError, 

48) 

49from lsst.daf.butler.datastore.constraints import Constraints 

50from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

51from lsst.resources import ResourcePath 

52from lsst.utils import doImportType 

53 

54if TYPE_CHECKING: 

55 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

56 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

57 from lsst.resources import ResourcePathExpression 

58 

59log = logging.getLogger(__name__) 

60 

61 

62class _IngestPrepData(Datastore.IngestPrepData): 

63 """Helper class for ChainedDatastore ingest implementation. 

64 

65 Parameters 

66 ---------- 

67 children : `list` of `tuple` 

68 Triples of `Datastore`, `IngestPrepData` and the set of source `ResourcePath` objects for each child datastore.

69 """ 

70 

71 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

72 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

73 self.children = children 

74 

75 

76class ChainedDatastore(Datastore): 

77 """Chained Datastores to allow read and writes from multiple datastores. 

78 

79 A ChainedDatastore is configured with multiple datastore configurations. 

80 A ``put()`` is sent to every child datastore that accepts the dataset

81 (subject to any per-datastore constraints). A ``get()`` operation is sent

82 to each datastore in turn and the first datastore to return a valid dataset is used.

83 

84 Parameters 

85 ---------- 

86 config : `DatastoreConfig` or `str` 

87 Configuration. This configuration must include a ``datastores`` field 

88 as a sequence of datastore configurations. The order in this sequence 

89 indicates the order to use for read operations. 

90 bridgeManager : `DatastoreRegistryBridgeManager` 

91 Object that manages the interface between `Registry` and datastores. 

92 datastores : `list` [`Datastore`] 

93 All the child datastores known to this datastore. 

94 

95 Notes 

96 ----- 

97 ChainedDatastore never supports `None` as an `ingest` transfer mode. It

98 supports `"move"` (internally copying and then deleting the source files when

99 more than one child accepts them), and `"copy"`, `"symlink"`, `"relsymlink"` and `"hardlink"` if and only if all its child datastores do.

100 """ 
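    # Illustrative sketch only (not from the daf_butler documentation): the
    # names ``mem_store``, ``file_store``, ``in_memory_dataset`` and ``ref``
    # below are hypothetical. A put() fans out to every child whose
    # constraints accept the ref; a get() falls through the chain and returns
    # the first successful read.
    #
    #     chained = ChainedDatastore(config, bridgeManager, [mem_store, file_store])
    #     chained.put(in_memory_dataset, ref)   # written to both children (constraints permitting)
    #     obj = chained.get(ref)                # served by mem_store, the first child in the chain
    #     assert chained.exists(ref)            # True if any child holds the artifact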

101 

102 defaultConfigFile = "datastores/chainedDatastore.yaml" 

103 """Path to configuration defaults. Accessed within the ``configs`` resource 

104 or relative to a search path. Can be `None` if no defaults are specified. 

105 """ 

106 

107 containerKey = "datastores" 

108 """Key to specify where child datastores are configured.""" 

109 

110 datastores: list[Datastore] 

111 """All the child datastores known to this datastore.""" 

112 

113 datastoreConstraints: Sequence[Constraints | None] 

114 """Constraints to be applied to each of the child datastores.""" 

115 

116 @classmethod 

117 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

118 """Set any filesystem-dependent config options for child Datastores to 

119 be appropriate for a new empty repository with the given root. 

120 

121 Parameters 

122 ---------- 

123 root : `str` 

124 Filesystem path to the root of the data repository. 

125 config : `Config` 

126 A `Config` to update. Only the subset understood by 

127 this component will be updated. Will not expand 

128 defaults. 

129 full : `Config` 

130 A complete config with all defaults expanded that can be 

131 converted to a `DatastoreConfig`. Read-only and will not be 

132 modified by this method. 

133 Repository-specific options that should not be obtained 

134 from defaults when Butler instances are constructed 

135 should be copied from ``full`` to ``config``. 

136 overwrite : `bool`, optional 

137 If `False`, do not modify a value in ``config`` if the value 

138 already exists. Default is always to overwrite with the provided 

139 ``root``. 

140 

141 Notes 

142 ----- 

143 If a keyword is explicitly defined in the supplied ``config`` it 

144 will not be overridden by this method if ``overwrite`` is `False`. 

145 This allows explicit values set in external configs to be retained. 

146 """ 

147 # Extract the part of the config we care about updating 

148 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

149 

150 # And the subset of the full config that we can use for reference. 

151 # Do not bother with defaults because we are told this already has 

152 # them. 

153 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

154 

155 # Loop over each datastore config and pass the subsets to the 

156 # child datastores to process. 

157 

158 containerKey = cls.containerKey 

159 for idx, (child, fullChild) in enumerate( 

160 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True) 

161 ): 

162 childConfig = DatastoreConfig(child, mergeDefaults=False) 

163 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

164 datastoreClass = doImportType(fullChildConfig["cls"]) 

165 if not issubclass(datastoreClass, Datastore): 165 ↛ 166: line 165 didn't jump to line 166, because the condition on line 165 was never true

166 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

167 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

168 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

169 

170 # Reattach to parent 

171 datastoreConfig[containerKey, idx] = childConfig 

172 

173 # Reattach modified datastore config to parent 

174 # If this has a datastore key we attach there, otherwise we assume 

175 # this information goes at the top of the config hierarchy. 

176 if DatastoreConfig.component in config: 

177 config[DatastoreConfig.component] = datastoreConfig 

178 else: 

179 config.update(datastoreConfig) 

180 

181 return 
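    # For example (a sketch, assuming root="/repo" and two FileDatastore
    # children): the loop above derives per-child roots "/repo/FileDatastore_0"
    # and "/repo/FileDatastore_1" before handing each child its own
    # setConfigRoot() call.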

182 

183 def __init__( 

184 self, 

185 config: DatastoreConfig, 

186 bridgeManager: DatastoreRegistryBridgeManager, 

187 datastores: list[Datastore], 

188 ): 

189 super().__init__(config, bridgeManager) 

190 

191 self.datastores = list(datastores) 

192 

193 # Name ourself based on our children 

194 if self.datastores: 194 ↛ 199: line 194 didn't jump to line 199, because the condition on line 194 was never false

195 # We must set the names explicitly 

196 self._names = [d.name for d in self.datastores] 

197 childNames = ",".join(self.names) 

198 else: 

199 childNames = f"(empty@{time.time()})" 

200 self._names = [childNames] 

201 self.name = f"{type(self).__qualname__}[{childNames}]" 

202 

203 # We declare we are ephemeral if all our child datastores declare 

204 # they are ephemeral 

205 self.isEphemeral = all(d.isEphemeral for d in self.datastores) 

206 

207 # per-datastore override constraints 

208 if "datastore_constraints" in self.config: 

209 overrides = self.config["datastore_constraints"] 

210 

211 if len(overrides) != len(self.datastores): 211 ↛ 212: line 211 didn't jump to line 212, because the condition on line 211 was never true

212 raise DatastoreValidationError( 

213 f"Number of registered datastores ({len(self.datastores)})" 

214 " differs from number of constraints overrides" 

215 f" {len(overrides)}" 

216 ) 

217 

218 self.datastoreConstraints = [ 

219 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

220 ] 

221 

222 else: 

223 self.datastoreConstraints = (None,) * len(self.datastores) 

224 

225 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

226 

227 @classmethod 

228 def _create_from_config( 

229 cls, 

230 config: DatastoreConfig, 

231 bridgeManager: DatastoreRegistryBridgeManager, 

232 butlerRoot: ResourcePathExpression | None, 

233 ) -> ChainedDatastore: 

234 # Scan for child datastores and instantiate them with the same registry 

235 datastores = [] 

236 for c in config["datastores"]: 

237 c = DatastoreConfig(c) 

238 datastoreType = doImportType(c["cls"]) 

239 if not issubclass(datastoreType, Datastore): 239 ↛ 240: line 239 didn't jump to line 240, because the condition on line 239 was never true

240 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

241 datastore = datastoreType._create_from_config(c, bridgeManager, butlerRoot=butlerRoot) 

242 log.debug("Creating child datastore %s", datastore.name) 

243 datastores.append(datastore) 

244 

245 return ChainedDatastore(config, bridgeManager, datastores) 

246 

247 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore: 

248 datastores = [ds.clone(bridgeManager) for ds in self.datastores] 

249 return ChainedDatastore(self.config, bridgeManager, datastores) 

250 

251 @property 

252 def names(self) -> tuple[str, ...]: 

253 return tuple(self._names) 

254 

255 @property 

256 def roots(self) -> dict[str, ResourcePath | None]: 

257 # Docstring inherited. 

258 roots = {} 

259 for datastore in self.datastores: 

260 roots.update(datastore.roots) 

261 return roots 

262 

263 def __str__(self) -> str: 

264 chainName = ", ".join(str(ds) for ds in self.datastores) 

265 return chainName 

266 

267 def _set_trust_mode(self, mode: bool) -> None: 

268 for datastore in self.datastores: 

269 datastore._set_trust_mode(mode) 

270 

271 def knows(self, ref: DatasetRef) -> bool: 

272 """Check if the dataset is known to any of the datastores. 

273 

274 Does not check for existence of any artifact. 

275 

276 Parameters 

277 ---------- 

278 ref : `DatasetRef` 

279 Reference to the required dataset. 

280 

281 Returns 

282 ------- 

283 exists : `bool` 

284 `True` if the dataset is known to the datastore. 

285 """ 

286 for datastore in self.datastores: 

287 if datastore.knows(ref): 

288 log.debug("%s known to datastore %s", ref, datastore.name) 

289 return True 

290 return False 

291 

292 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

293 # Docstring inherited from the base class. 

294 refs_known: dict[DatasetRef, bool] = {} 

295 for datastore in self.datastores: 

296 refs_known.update(datastore.knows_these(refs)) 

297 

298 # No need to check in next datastore for refs that are known. 

299 # We only update entries that were initially False. 

300 refs = [ref for ref, known in refs_known.items() if not known] 

301 

302 return refs_known 

303 

304 def mexists( 

305 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

306 ) -> dict[DatasetRef, bool]: 

307 """Check the existence of multiple datasets at once. 

308 

309 Parameters 

310 ---------- 

311 refs : iterable of `DatasetRef` 

312 The datasets to be checked. 

313 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

314 Optional mapping of datastore artifact to existence. Updated by 

315 this method with details of all artifacts tested. Can be `None` 

316 if the caller is not interested. 

317 

318 Returns 

319 ------- 

320 existence : `dict` [`DatasetRef`, `bool`] 

321 Mapping from dataset to boolean indicating existence in any 

322 of the child datastores. 

323 """ 

324 dataset_existence: dict[DatasetRef, bool] = {} 

325 for datastore in self.datastores: 

326 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

327 

328 # For next datastore no point asking about ones we know 

329 # exist already. No special exemption for ephemeral datastores. 

330 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

331 

332 return dataset_existence 
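    # Illustrative sketch (``chained`` and ``refs`` are hypothetical): a shared
    # artifact-existence cache can be passed in so repeated calls do not
    # re-probe the same artifacts.
    #
    #     artifact_cache: dict[ResourcePath, bool] = {}
    #     existence = chained.mexists(refs, artifact_existence=artifact_cache)
    #     missing = [ref for ref, found in existence.items() if not found]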

333 

334 def exists(self, ref: DatasetRef) -> bool: 

335 """Check if the dataset exists in one of the datastores. 

336 

337 Parameters 

338 ---------- 

339 ref : `DatasetRef` 

340 Reference to the required dataset. 

341 

342 Returns 

343 ------- 

344 exists : `bool` 

345 `True` if the entity exists in one of the child datastores. 

346 """ 

347 for datastore in self.datastores: 

348 if datastore.exists(ref): 

349 log.debug("Found %s in datastore %s", ref, datastore.name) 

350 return True 

351 return False 

352 

353 def get( 

354 self, 

355 ref: DatasetRef, 

356 parameters: Mapping[str, Any] | None = None, 

357 storageClass: StorageClass | str | None = None, 

358 ) -> Any: 

359 """Load an InMemoryDataset from the store. 

360 

361 The dataset is returned from the first datastore that has 

362 the dataset. 

363 

364 Parameters 

365 ---------- 

366 ref : `DatasetRef` 

367 Reference to the required Dataset. 

368 parameters : `dict` 

369 `StorageClass`-specific parameters that specify, for example, 

370 a slice of the dataset to be loaded. 

371 storageClass : `StorageClass` or `str`, optional 

372 The storage class to be used to override the Python type 

373 returned by this method. By default the returned type matches 

374 the dataset type definition for this dataset. Specifying a 

375 read `StorageClass` can force a different type to be returned. 

376 This type must be compatible with the original type. 

377 

378 Returns 

379 ------- 

380 inMemoryDataset : `object` 

381 Requested dataset or slice thereof as an InMemoryDataset. 

382 

383 Raises 

384 ------ 

385 FileNotFoundError 

386 Requested dataset can not be retrieved. 

387 TypeError 

388 Return value from formatter has unexpected type. 

389 ValueError 

390 Formatter failed to process the dataset. 

391 """ 

392 for datastore in self.datastores: 

393 try: 

394 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

395 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

396 return inMemoryObject 

397 except FileNotFoundError: 

398 pass 

399 

400 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

401 

402 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

403 """Write a InMemoryDataset with a given `DatasetRef` to each 

404 datastore. 

405 

406 The put() to child datastores can fail with 

407 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

408 deemed to have succeeded so long as at least one child datastore 

409 accepted the inMemoryDataset. 

410 

411 Parameters 

412 ---------- 

413 inMemoryDataset : `object` 

414 The dataset to store. 

415 ref : `DatasetRef` 

416 Reference to the associated Dataset. 

417 

418 Raises 

419 ------ 

420 TypeError 

421 Supplied object and storage class are inconsistent. 

422 DatasetTypeNotSupportedError 

423 All datastores reported `DatasetTypeNotSupportedError`. 

424 """ 

425 log.debug("Put %s", ref) 

426 

427 # Confirm that we can accept this dataset 

428 if not self.constraints.isAcceptable(ref): 

429 # Raise rather than use boolean return value. 

430 raise DatasetTypeNotSupportedError( 

431 f"Dataset {ref} has been rejected by this datastore via configuration." 

432 ) 

433 

434 isPermanent = False 

435 nsuccess = 0 

436 npermanent = 0 

437 nephemeral = 0 

438 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

439 if ( 

440 constraints is not None and not constraints.isAcceptable(ref) 

441 ) or not datastore.constraints.isAcceptable(ref): 

442 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

443 continue 

444 

445 if datastore.isEphemeral: 

446 nephemeral += 1 

447 else: 

448 npermanent += 1 

449 try: 

450 datastore.put(inMemoryDataset, ref) 

451 nsuccess += 1 

452 if not datastore.isEphemeral: 

453 isPermanent = True 

454 except DatasetTypeNotSupportedError: 

455 pass 

456 

457 if nsuccess == 0: 

458 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

459 

460 if not isPermanent and npermanent > 0: 460 ↛ 461: line 460 didn't jump to line 461, because the condition on line 460 was never true

461 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

462 

463 if self._transaction is not None: 

464 self._transaction.registerUndo("put", self.remove, ref) 
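    # Illustrative sketch (hypothetical ``chained`` and ``ref``): the chained
    # put succeeds as long as at least one child accepts the dataset; a
    # chain-wide rejection surfaces as DatasetTypeNotSupportedError.
    #
    #     try:
    #         chained.put(in_memory_dataset, ref)
    #     except DatasetTypeNotSupportedError:
    #         ...  # every child (or the chain's own constraints) rejected the ref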

465 

466 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

467 # Docstring inherited from base class. 

468 log.debug("Put %s", ref) 

469 

470 # Confirm that we can accept this dataset 

471 if not self.constraints.isAcceptable(ref): 

472 # Raise rather than use boolean return value. 

473 raise DatasetTypeNotSupportedError( 

474 f"Dataset {ref} has been rejected by this datastore via configuration." 

475 ) 

476 

477 isPermanent = False 

478 nsuccess = 0 

479 npermanent = 0 

480 nephemeral = 0 

481 stored_refs: dict[str, DatasetRef] = {} 

482 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

483 if ( 

484 constraints is not None and not constraints.isAcceptable(ref) 

485 ) or not datastore.constraints.isAcceptable(ref): 

486 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

487 continue 

488 

489 if datastore.isEphemeral: 

490 nephemeral += 1 

491 else: 

492 npermanent += 1 

493 try: 

494 stored_ref_map = datastore.put_new(in_memory_dataset, ref) 

495 stored_refs.update(stored_ref_map) 

496 nsuccess += 1 

497 if not datastore.isEphemeral: 

498 isPermanent = True 

499 except DatasetTypeNotSupportedError: 

500 pass 

501 

502 if nsuccess == 0: 

503 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

504 

505 if not isPermanent and npermanent > 0: 

506 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

507 

508 if self._transaction is not None: 

509 self._transaction.registerUndo("put", self.remove, ref) 

510 

511 return stored_refs 

512 

513 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

514 # Docstring inherited from base class. 

515 if transfer != "auto": 

516 return transfer 

517 # Ask each datastore what they think auto means 

518 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

519 

520 # Remove any untranslated "auto" values 

521 transfers.discard(transfer) 

522 

523 if len(transfers) == 1: 523 ↛ 524: line 523 didn't jump to line 524, because the condition on line 523 was never true

524 return transfers.pop() 

525 if not transfers: 525 ↛ 529: line 525 didn't jump to line 529, because the condition on line 525 was never false

526 # Everything reported "auto" 

527 return transfer 

528 

529 raise RuntimeError( 

530 "Chained datastore does not yet support different transfer modes" 

531 f" from 'auto' in each child datastore (wanted {transfers})" 

532 ) 

533 

534 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

535 # Docstring inherited from Datastore._prepIngest. 

536 if transfer is None: 

537 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

538 

539 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

540 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

541 if not acceptable: 

542 log.debug( 

543 "Datastore %s skipping ingest via configuration for refs %s", 

544 name, 

545 ", ".join(str(ref) for ref in dataset.refs), 

546 ) 

547 return False 

548 else: 

549 return True 

550 

551 # Filter down to just datasets the chained datastore's own 

552 # configuration accepts. 

553 okForParent: list[FileDataset] = [ 

554 dataset 

555 for dataset in datasets 

556 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

557 ] 

558 

559 # Iterate over nested datastores and call _prepIngest on each. 

560 # Save the results to a list: 

561 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

562 # ...and remember whether all of the failures are due to 

563 # NotImplementedError being raised. 

564 allFailuresAreNotImplementedError = True 

565 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

566 okForChild: list[FileDataset] 

567 if constraints is not None: 

568 okForChild = [ 

569 dataset 

570 for dataset in okForParent 

571 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

572 ] 

573 else: 

574 okForChild = okForParent 

575 try: 

576 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

577 except NotImplementedError: 

578 log.debug( 

579 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

580 datastore.name, 

581 transfer, 

582 ) 

583 continue 

584 allFailuresAreNotImplementedError = False 

585 if okForChild: 

586 # Do not store for later if a datastore has rejected 

587 # everything. 

588 # Include the source paths if this is a "move". It's clearer 

589 # to find the paths now rather than try to infer how 

590 # each datastore has stored them in the internal prep class. 

591 paths = ( 

592 {ResourcePath(dataset.path, forceDirectory=False) for dataset in okForChild} 

593 if transfer == "move" 

594 else set() 

595 ) 

596 children.append((datastore, prepDataForChild, paths)) 

597 if allFailuresAreNotImplementedError: 

598 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

599 return _IngestPrepData(children=children) 

600 

601 def _finishIngest( 

602 self, 

603 prepData: _IngestPrepData, 

604 *, 

605 transfer: str | None = None, 

606 record_validation_info: bool = True, 

607 ) -> None: 

608 # Docstring inherited from Datastore._finishIngest. 

609 # For "move" we must use "copy" and then delete the input 

610 # data at the end. This has no rollback option if the ingest 

611 # subsequently fails. If there is only one active datastore 

612 # accepting any files we can leave it as "move" 

613 actual_transfer: str | None 

614 if transfer == "move" and len(prepData.children) > 1: 

615 actual_transfer = "copy" 

616 else: 

617 actual_transfer = transfer 

618 to_be_deleted: set[ResourcePath] = set() 

619 for datastore, prepDataForChild, paths in prepData.children: 

620 datastore._finishIngest( 

621 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

622 ) 

623 to_be_deleted.update(paths) 

624 if actual_transfer != transfer: 

625 # These datasets were copied but now need to be deleted. 

626 # This can not be rolled back. 

627 for uri in to_be_deleted: 

628 uri.remove() 

629 

630 def getManyURIs( 

631 self, 

632 refs: Iterable[DatasetRef], 

633 predict: bool = False, 

634 allow_missing: bool = False, 

635 ) -> dict[DatasetRef, DatasetRefURIs]: 

636 # Docstring inherited 

637 

638 uris: dict[DatasetRef, DatasetRefURIs] = {} 

639 missing_refs = set(refs) 

640 

641 # If predict is True we don't want to predict a dataset in the first 

642 # datastore if it actually exists in a later datastore, so in that 

643 # case check all datastores with predict=False first, and then try 

644 # again with predict=True. 

645 for p in (False, True) if predict else (False,): 

646 if not missing_refs: 

647 break 

648 for datastore in self.datastores: 

649 try: 

650 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

651 except NotImplementedError: 

652 # some datastores may not implement generating URIs 

653 continue 

654 missing_refs -= got_uris.keys() 

655 uris.update(got_uris) 

656 if not missing_refs: 

657 break 

658 

659 if missing_refs and not allow_missing: 

660 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

661 

662 return uris 

663 

664 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

665 """Return URIs associated with dataset. 

666 

667 Parameters 

668 ---------- 

669 ref : `DatasetRef` 

670 Reference to the required dataset. 

671 predict : `bool`, optional 

672 If the datastore does not know about the dataset, controls whether 

673 it should return a predicted URI or not. 

674 

675 Returns 

676 ------- 

677 uris : `DatasetRefURIs` 

678 The URI to the primary artifact associated with this dataset (if 

679 the dataset was disassembled within the datastore this may be 

680 `None`), and the URIs to any components associated with the dataset 

681 artifact (can be empty if there are no components). 

682 

683 Notes 

684 ----- 

685 The returned URI is from the first datastore in the list that has 

686 the dataset with preference given to the first dataset coming from 

687 a permanent datastore. If no datastores have the dataset and prediction 

688 is allowed, the predicted URI for the first datastore in the list will 

689 be returned. 

690 """ 

691 log.debug("Requesting URIs for %s", ref) 

692 predictedUri: DatasetRefURIs | None = None 

693 predictedEphemeralUri: DatasetRefURIs | None = None 

694 firstEphemeralUri: DatasetRefURIs | None = None 

695 for datastore in self.datastores: 

696 if datastore.exists(ref): 

697 if not datastore.isEphemeral: 

698 uri = datastore.getURIs(ref) 

699 log.debug("Retrieved non-ephemeral URI: %s", uri) 

700 return uri 

701 elif not firstEphemeralUri: 

702 firstEphemeralUri = datastore.getURIs(ref) 

703 elif predict: 

704 if not predictedUri and not datastore.isEphemeral: 

705 predictedUri = datastore.getURIs(ref, predict) 

706 elif not predictedEphemeralUri and datastore.isEphemeral: 

707 predictedEphemeralUri = datastore.getURIs(ref, predict) 

708 

709 if firstEphemeralUri: 

710 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

711 return firstEphemeralUri 

712 

713 if predictedUri: 

714 log.debug("Retrieved predicted URI: %s", predictedUri) 

715 return predictedUri 

716 

717 if predictedEphemeralUri: 

718 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

719 return predictedEphemeralUri 

720 

721 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 

722 

723 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

724 """URI to the Dataset. 

725 

726 The returned URI is from the first datastore in the list that has 

727 the dataset with preference given to the first dataset coming from 

728 a permanent datastore. If no datastores have the dataset and prediction 

729 is allowed, the predicted URI for the first datastore in the list will 

730 be returned. 

731 

732 Parameters 

733 ---------- 

734 ref : `DatasetRef` 

735 Reference to the required Dataset. 

736 predict : `bool` 

737 If `True`, allow URIs to be returned of datasets that have not 

738 been written. 

739 

740 Returns 

741 ------- 

742 uri : `lsst.resources.ResourcePath` 

743 URI pointing to the dataset within the datastore. If the 

744 dataset does not exist in the datastore, and if ``predict`` is 

745 `True`, the URI will be a prediction and will include a URI 

746 fragment "#predicted". 

747 

748 Notes 

749 ----- 

750 If the datastore does not have entities that relate well 

751 to the concept of a URI the returned URI string will be 

752 descriptive. The returned URI is not guaranteed to be obtainable. 

753 

754 Raises 

755 ------ 

756 FileNotFoundError 

757 A URI has been requested for a dataset that does not exist and 

758 guessing is not allowed. 

759 RuntimeError 

760 Raised if a request is made for a single URI but multiple URIs 

761 are associated with this dataset. 

762 """ 

763 log.debug("Requesting URI for %s", ref) 

764 primary, components = self.getURIs(ref, predict) 

765 if primary is None or components: 765 ↛ 766: line 765 didn't jump to line 766, because the condition on line 765 was never true

766 raise RuntimeError( 

767 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

768 ) 

769 return primary 
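    # Illustrative sketch (hypothetical ``chained`` and ``ref``): a predicted
    # URI carries the "#predicted" fragment described above; disassembled
    # datasets with per-component URIs must use getURIs() instead.
    #
    #     uri = chained.getURI(ref, predict=True)
    #     if str(uri).endswith("#predicted"):
    #         ...  # dataset has not been written yet; the URI is a prediction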

770 

771 def retrieveArtifacts( 

772 self, 

773 refs: Iterable[DatasetRef], 

774 destination: ResourcePath, 

775 transfer: str = "auto", 

776 preserve_path: bool = True, 

777 overwrite: bool = False, 

778 ) -> list[ResourcePath]: 

779 """Retrieve the file artifacts associated with the supplied refs. 

780 

781 Parameters 

782 ---------- 

783 refs : iterable of `DatasetRef` 

784 The datasets for which file artifacts are to be retrieved. 

785 A single ref can result in multiple files. The refs must 

786 be resolved. 

787 destination : `lsst.resources.ResourcePath` 

788 Location to write the file artifacts. 

789 transfer : `str`, optional 

790 Method to use to transfer the artifacts. Must be one of the options 

791 supported by `lsst.resources.ResourcePath.transfer_from()`. 

792 "move" is not allowed. 

793 preserve_path : `bool`, optional 

794 If `True` the full path of the file artifact within the datastore 

795 is preserved. If `False` the final file component of the path 

796 is used. 

797 overwrite : `bool`, optional 

798 If `True` allow transfers to overwrite existing files at the 

799 destination. 

800 

801 Returns 

802 ------- 

803 targets : `list` of `lsst.resources.ResourcePath` 

804 URIs of file artifacts in destination location. Order is not 

805 preserved. 

806 """ 

807 if not destination.isdir(): 807 ↛ 808: line 807 didn't jump to line 808, because the condition on line 807 was never true

808 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

809 

810 # Using getURIs is not feasible since it becomes difficult to 

811 # determine the path within the datastore later on. For now 

812 # follow getURIs implementation approach. 

813 

814 pending = set(refs) 

815 

816 # There is a question as to whether an exception should be raised 

817 # early if some of the refs are missing, or whether files should be 

818 # transferred until a problem is hit. Prefer to complain up front. 

819 # Use the datastore integer as primary key. 

820 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

821 

822 for number, datastore in enumerate(self.datastores): 

823 if datastore.isEphemeral: 

824 # In the future we will want to distinguish in-memory from 

825 # caching datastore since using an on-disk local 

826 # cache is exactly what we should be doing. 

827 continue 

828 try: 

829 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

830 except NotImplementedError: 

831 # Some datastores may not support retrieving artifacts 

832 continue 

833 

834 if datastore_refs: 

835 grouped_by_datastore[number] = datastore_refs 

836 

837 # Remove these from the pending list so that we do not bother 

838 # looking for them any more. 

839 pending = pending - datastore_refs 

840 

841 if pending: 841 ↛ 842: line 841 didn't jump to line 842, because the condition on line 841 was never true

842 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

843 

844 # Now do the transfer. 

845 targets: list[ResourcePath] = [] 

846 for number, datastore_refs in grouped_by_datastore.items(): 

847 targets.extend( 

848 self.datastores[number].retrieveArtifacts( 

849 datastore_refs, 

850 destination, 

851 transfer=transfer, 

852 preserve_path=preserve_path, 

853 overwrite=overwrite, 

854 ) 

855 ) 

856 

857 return targets 
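    # Illustrative sketch (hypothetical ``chained`` and ``refs``): artifacts
    # are copied out of whichever non-ephemeral child holds them, optionally
    # preserving the datastore-internal paths.
    #
    #     dest = ResourcePath("/tmp/export_dir/", forceDirectory=True)
    #     files = chained.retrieveArtifacts(refs, dest, transfer="copy", preserve_path=True)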

858 

859 def remove(self, ref: DatasetRef) -> None: 

860 """Indicate to the datastore that a dataset can be removed. 

861 

862 The dataset will be removed from each datastore. The dataset is 

863 not required to exist in every child datastore. 

864 

865 Parameters 

866 ---------- 

867 ref : `DatasetRef` 

868 Reference to the required dataset. 

869 

870 Raises 

871 ------ 

872 FileNotFoundError 

873 Attempt to remove a dataset that does not exist. Raised if none 

874 of the child datastores removed the dataset. 

875 """ 

876 log.debug("Removing %s", ref) 

877 self.trash(ref, ignore_errors=False) 

878 self.emptyTrash(ignore_errors=False) 

879 

880 def forget(self, refs: Iterable[DatasetRef]) -> None: 

881 for datastore in tuple(self.datastores): 

882 datastore.forget(refs) 

883 

884 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

885 if isinstance(ref, DatasetRef): 

886 ref_label = str(ref) 

887 else: 

888 ref_label = "bulk datasets" 

889 

890 log.debug("Trashing %s", ref_label) 

891 

892 counter = 0 

893 for datastore in self.datastores: 

894 try: 

895 datastore.trash(ref, ignore_errors=ignore_errors) 

896 counter += 1 

897 except FileNotFoundError: 

898 pass 

899 

900 if counter == 0: 

901 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

902 if ignore_errors: 902 ↛ 903: line 902 didn't jump to line 903, because the condition on line 902 was never true

903 log.warning(err_msg) 

904 else: 

905 raise FileNotFoundError(err_msg) 

906 

907 def emptyTrash(self, ignore_errors: bool = True) -> None: 

908 for datastore in self.datastores: 

909 datastore.emptyTrash(ignore_errors=ignore_errors) 

910 

911 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

912 """Retrieve a dataset from an input `Datastore`, 

913 and store the result in this `Datastore`. 

914 

915 Parameters 

916 ---------- 

917 inputDatastore : `Datastore` 

918 The external `Datastore` from which to retrieve the Dataset. 

919 ref : `DatasetRef` 

920 Reference to the required dataset in the input data store. 

921 

922 Notes 

923 ----- 

924 Nothing is returned; the dataset read from ``inputDatastore`` is 

925 stored with ``put()``, which forwards it to every child datastore 

926 that accepts it. 

927 """ 

928 assert inputDatastore is not self # unless we want it for renames? 

929 inMemoryDataset = inputDatastore.get(ref) 

930 self.put(inMemoryDataset, ref) 

931 

932 def validateConfiguration( 

933 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

934 ) -> None: 

935 """Validate some of the configuration for this datastore. 

936 

937 Parameters 

938 ---------- 

939 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

940 Entities to test against this configuration. Can be differing 

941 types. 

942 logFailures : `bool`, optional 

943 If `True`, output a log message for every validation error 

944 detected. 

945 

946 Raises 

947 ------ 

948 DatastoreValidationError 

949 Raised if there is a validation problem with a configuration. 

950 All the problems are reported in a single exception. 

951 

952 Notes 

953 ----- 

954 This method checks each datastore in turn. 

955 """ 

956 # Need to catch each of the datastore outputs and ensure that 

957 # all are tested. 

958 failures = [] 

959 for datastore in self.datastores: 

960 try: 

961 datastore.validateConfiguration(entities, logFailures=logFailures) 

962 except DatastoreValidationError as e: 

963 if logFailures: 963 ↛ 965: line 963 didn't jump to line 965, because the condition on line 963 was never false

964 log.critical("Datastore %s failed validation", datastore.name) 

965 failures.append(f"Datastore {self.name}: {e}") 

966 

967 if failures: 

968 msg = ";\n".join(failures) 

969 raise DatastoreValidationError(msg) 

970 

971 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

972 # Docstring is inherited from base class 

973 failures = [] 

974 for datastore in self.datastores: 

975 try: 

976 datastore.validateKey(lookupKey, entity) 

977 except DatastoreValidationError as e: 

978 failures.append(f"Datastore {self.name}: {e}") 

979 

980 if failures: 

981 msg = ";\n".join(failures) 

982 raise DatastoreValidationError(msg) 

983 

984 def getLookupKeys(self) -> set[LookupKey]: 

985 # Docstring is inherited from base class 

986 keys = set() 

987 for datastore in self.datastores: 

988 keys.update(datastore.getLookupKeys()) 

989 

990 keys.update(self.constraints.getLookupKeys()) 

991 for p in self.datastoreConstraints: 

992 if p is not None: 992 ↛ 991: line 992 didn't jump to line 991, because the condition on line 992 was never false

993 keys.update(p.getLookupKeys()) 

994 

995 return keys 

996 

997 def needs_expanded_data_ids( 

998 self, 

999 transfer: str | None, 

1000 entity: DatasetRef | DatasetType | StorageClass | None = None, 

1001 ) -> bool: 

1002 # Docstring inherited. 

1003 # We can't safely use `self.datastoreConstraints` with `entity` to 

1004 # check whether a child datastore would even want to ingest this 

1005 # dataset, because we don't want to filter out datastores that might 

1006 need an expanded data ID based on incomplete information (e.g. we 

1007 # pass a StorageClass, but the constraint dispatches on DatasetType). 

1008 # So we pessimistically check if any datastore would need an expanded 

1009 # data ID for this transfer mode. 

1010 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

1011 

1012 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

1013 # Docstring inherited from the base class. 

1014 

1015 for datastore in self.datastores: 

1016 datastore.import_records(data) 

1017 

1018 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

1019 # Docstring inherited from the base class. 

1020 

1021 all_records: dict[str, DatastoreRecordData] = {} 

1022 

1023 # Merge all sub-datastore records into one structure 

1024 for datastore in self.datastores: 

1025 sub_records = datastore.export_records(refs) 

1026 for name, record_data in sub_records.items(): 

1027 # All datastore names must be unique in a chain. 

1028 if name in all_records: 1028 ↛ 1029: line 1028 didn't jump to line 1029, because the condition on line 1028 was never true

1029 raise ValueError(f"Non-unique datastore name found in datastore {datastore}") 

1030 all_records[name] = record_data 

1031 

1032 return all_records 

1033 

1034 def export( 

1035 self, 

1036 refs: Iterable[DatasetRef], 

1037 *, 

1038 directory: ResourcePathExpression | None = None, 

1039 transfer: str | None = "auto", 

1040 ) -> Iterable[FileDataset]: 

1041 # Docstring inherited from Datastore.export. 

1042 if transfer == "auto" and directory is None: 

1043 transfer = None 

1044 

1045 if transfer is not None and directory is None: 

1046 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

1047 

1048 if transfer == "move": 

1049 raise TypeError("Can not export by moving files out of datastore.") 

1050 

1051 # Exporting from a chain has the potential for a dataset to be 

1052 # in one or more of the datastores in the chain. We only need one 

1053 # of them since we assume the datasets are the same in all (but 

1054 # the file format could be different of course since that is a 

1055 # per-datastore configuration). 

1056 # We also do not know whether any of the datastores in the chain 

1057 # support file export. 

1058 

1059 # Ensure we have an ordered sequence that is not an iterator or set. 

1060 if not isinstance(refs, Sequence): 

1061 refs = list(refs) 

1062 

1063 # If any of the datasets are missing entirely we need to raise early 

1064 # before we try to run the export. This can be a little messy but is 

1065 # better than exporting files from the first datastore and then finding 

1066 that a dataset missing from it is not in the second datastore either. 

1067 known = [datastore.knows_these(refs) for datastore in self.datastores] 

1068 refs_known: set[DatasetRef] = set() 

1069 for known_to_this in known: 

1070 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

1071 missing_count = len(refs) - len(refs_known) 

1072 if missing_count: 

1073 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1074 

1075 # To allow us to slot each result into the right place after 

1076 # asking each datastore, create a dict with the index. 

1077 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1078 

1079 # Presize the final export list. 

1080 exported: list[FileDataset | None] = [None] * len(refs) 

1081 

1082 # The order of the returned dataset has to match the order of the 

1083 # given refs, even if they are all from different datastores. 

1084 for i, datastore in enumerate(self.datastores): 

1085 known_to_this = known[i] 

1086 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1087 

1088 try: 

1089 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1090 except NotImplementedError: 

1091 # Try the next datastore. 

1092 continue 

1093 

1094 for ref, export in zip(filtered, this_export, strict=True): 

1095 # Get the position and also delete it from the list. 

1096 exported[ref_positions.pop(ref)] = export 

1097 

1098 # Every dataset should be accounted for because of the earlier checks 

1099 # but make sure that we did fill all the slots to appease mypy. 

1100 for i, dataset in enumerate(exported): 

1101 if dataset is None: 1101 ↛ 1102: line 1101 didn't jump to line 1102, because the condition on line 1101 was never true

1102 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1103 yield dataset 

1104 

1105 def transfer_from( 

1106 self, 

1107 source_datastore: Datastore, 

1108 refs: Collection[DatasetRef], 

1109 transfer: str = "auto", 

1110 artifact_existence: dict[ResourcePath, bool] | None = None, 

1111 dry_run: bool = False, 

1112 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1113 # Docstring inherited 

1114 # mypy does not understand "type(self) is not type(source)" 

1115 if isinstance(source_datastore, ChainedDatastore): 

1116 # Both the source and destination are chained datastores. 

1117 source_datastores = tuple(source_datastore.datastores) 

1118 else: 

1119 # The source datastore is different, forward everything to the 

1120 # child datastores. 

1121 source_datastores = (source_datastore,) 

1122 

1123 if not refs: 1123 ↛ 1125: line 1123 didn't jump to line 1125, because the condition on line 1123 was never true

1124 # Nothing to transfer. 

1125 return set(), set() 

1126 

1127 # Need to know the set of all possible refs that could be transferred. 

1128 remaining_refs = set(refs) 

1129 

1130 missing_from_source: set[DatasetRef] | None = None 

1131 all_accepted = set() 

1132 nsuccess = 0 

1133 for source_child in source_datastores: 

1134 # If we are reading from a chained datastore, it's possible that 

1135 # only a subset of the datastores know about the dataset. We can't 

1136 # ask the receiving datastore to copy it when it doesn't exist 

1137 # so we have to filter again based on what the source datastore 

1138 # understands. 

1139 known_to_source = source_child.knows_these(list(refs)) 

1140 

1141 # Need to know that there is a possibility that some of these 

1142 # datasets exist but are unknown to the source datastore if 

1143 # trust is enabled. 

1144 if getattr(source_child, "trustGetRequest", False): 

1145 unknown = [ref for ref, known in known_to_source.items() if not known] 

1146 existence = source_child.mexists(unknown, artifact_existence) 

1147 for ref, exists in existence.items(): 

1148 known_to_source[ref] = exists 

1149 

1150 missing = {ref for ref, known in known_to_source.items() if not known} 

1151 if missing: 

1152 if missing_from_source is None: 

1153 missing_from_source = missing 

1154 else: 

1155 missing_from_source &= missing 

1156 

1157 # Try to transfer from each source datastore to each child 

1158 # datastore. Have to make sure we don't transfer something 

1159 # we've already transferred to this destination on later passes. 

1160 

1161 # Filter the initial list based on the datasets we have 

1162 # not yet transferred. 

1163 these_refs = [] 

1164 for ref in refs: 

1165 if ref in remaining_refs and known_to_source[ref]: 

1166 these_refs.append(ref) 

1167 

1168 if not these_refs: 

1169 # Already transferred all datasets known to this datastore. 

1170 continue 

1171 

1172 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

1173 if constraints is not None: 1173 ↛ 1181: line 1173 didn't jump to line 1181, because the condition on line 1173 was never false

1174 filtered_refs = [] 

1175 for ref in these_refs: 

1176 if constraints.isAcceptable(ref): 

1177 filtered_refs.append(ref) 

1178 else: 

1179 log.debug("Rejecting ref by constraints: %s", ref) 

1180 else: 

1181 filtered_refs = list(these_refs) 

1182 try: 

1183 accepted, _ = datastore.transfer_from( 

1184 source_child, 

1185 filtered_refs, 

1186 transfer, 

1187 artifact_existence, 

1188 dry_run=dry_run, 

1189 ) 

1190 except (TypeError, NotImplementedError): 

1191 # The datastores were incompatible. 

1192 continue 

1193 else: 

1194 nsuccess += 1 

1195 

1196 # Remove the accepted datasets from those remaining. 

1197 remaining_refs = remaining_refs - accepted 

1198 

1199 # Keep track of everything we have accepted. 

1200 all_accepted.update(accepted) 

1201 

1202 if missing_from_source: 

1203 for ref in missing_from_source: 

1204 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1205 

1206 if nsuccess == 0: 1206 ↛ 1207: line 1206 didn't jump to line 1207, because the condition on line 1206 was never true

1207 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1208 

1209 return all_accepted, remaining_refs 
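    # Illustrative sketch (hypothetical ``chained``, ``source_store`` and
    # ``refs``): the two returned sets separate refs accepted by at least one
    # child from refs that no child could transfer.
    #
    #     accepted, not_transferred = chained.transfer_from(source_store, refs, transfer="copy")
    #     if not_transferred:
    #         log.warning("%d datasets were not transferred", len(not_transferred))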

1210 

1211 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

1212 # Docstring inherited from the base class. 

1213 tables: dict[str, DatastoreOpaqueTable] = {} 

1214 for datastore in self.datastores: 

1215 tables.update(datastore.get_opaque_table_definitions()) 

1216 return tables