Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 86%

477 statements  

coverage.py v7.4.4, created at 2024-04-19 03:43 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Chained datastore.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("ChainedDatastore",) 

33 

34import itertools 

35import logging 

36import time 

37import warnings 

38from collections.abc import Collection, Iterable, Mapping, Sequence 

39from typing import TYPE_CHECKING, Any 

40 

41from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, FileDataset 

42from lsst.daf.butler.datastore import ( 

43 DatasetRefURIs, 

44 Datastore, 

45 DatastoreConfig, 

46 DatastoreOpaqueTable, 

47 DatastoreValidationError, 

48) 

49from lsst.daf.butler.datastore.constraints import Constraints 

50from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

51from lsst.resources import ResourcePath 

52from lsst.utils import doImportType 

53 

54if TYPE_CHECKING: 

55 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

56 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

57 from lsst.resources import ResourcePathExpression 

58 

59log = logging.getLogger(__name__) 

60 

61 

62class _IngestPrepData(Datastore.IngestPrepData): 

63 """Helper class for ChainedDatastore ingest implementation. 

64 

65 Parameters 

66 ---------- 

67 children : `list` of `tuple` 

68 Triples of `Datastore`, `IngestPrepData`, and the set of source `ResourcePath` objects for each child datastore. 

69 """ 

70 

71 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

72 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

73 self.children = children 

74 

75 

76class ChainedDatastore(Datastore): 

77 """Chained Datastores to allow read and writes from multiple datastores. 

78 

79 A ChainedDatastore is configured with multiple datastore configurations. 

80 A ``put()`` is always sent to each datastore. A ``get()`` 

81 operation is sent to each datastore in turn and the first datastore 

82 to return a valid dataset is used. 

83 

84 Parameters 

85 ---------- 

86 config : `DatastoreConfig` or `str` 

87 Configuration. This configuration must include a ``datastores`` field 

88 as a sequence of datastore configurations. The order in this sequence 

89 indicates the order to use for read operations. 

90 bridgeManager : `DatastoreRegistryBridgeManager` 

91 Object that manages the interface between `Registry` and datastores. 

92 datastores : `list` [`Datastore`] 

93 All the child datastores known to this datastore. 

94 

95 Notes 

96 ----- 

97 ChainedDatastore never supports `None` as an `ingest` transfer mode. It 

98 supports `"copy"`, `"symlink"`, `"relsymlink"` and `"hardlink"` if and only 

99 if all its child datastores do; `"move"` is handled by copying to each accepting child and then deleting the source files. 

100 """ 
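
As a rough illustration of the configuration shape this class consumes (a ``datastores`` list read in order, plus optional per-child ``datastore_constraints``), the sketch below uses illustrative child class names and constraint values rather than a recommended setup:

# Sketch of a possible YAML layout (assumed example, not a tested configuration):
#
#   datastore:
#     cls: lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore
#     datastores:                # list order is the read order
#       - cls: lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore
#       - cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
#         root: <butlerRoot>
#     datastore_constraints:     # optional; must have one entry per child datastore
#       - constraints:
#           reject:
#             - all
#       - {}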

101 

102 defaultConfigFile = "datastores/chainedDatastore.yaml" 

103 """Path to configuration defaults. Accessed within the ``configs`` resource 

104 or relative to a search path. Can be None if no defaults specified. 

105 """ 

106 

107 containerKey = "datastores" 

108 """Key to specify where child datastores are configured.""" 

109 

110 datastores: list[Datastore] 

111 """All the child datastores known to this datastore.""" 

112 

113 datastoreConstraints: Sequence[Constraints | None] 

114 """Constraints to be applied to each of the child datastores.""" 

115 

116 @classmethod 

117 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

118 """Set any filesystem-dependent config options for child Datastores to 

119 be appropriate for a new empty repository with the given root. 

120 

121 Parameters 

122 ---------- 

123 root : `str` 

124 Filesystem path to the root of the data repository. 

125 config : `Config` 

126 A `Config` to update. Only the subset understood by 

127 this component will be updated. Will not expand 

128 defaults. 

129 full : `Config` 

130 A complete config with all defaults expanded that can be 

131 converted to a `DatastoreConfig`. Read-only and will not be 

132 modified by this method. 

133 Repository-specific options that should not be obtained 

134 from defaults when Butler instances are constructed 

135 should be copied from ``full`` to ``config``. 

136 overwrite : `bool`, optional 

137 If `False`, do not modify a value in ``config`` if the value 

138 already exists. Default is always to overwrite with the provided 

139 ``root``. 

140 

141 Notes 

142 ----- 

143 If a keyword is explicitly defined in the supplied ``config`` it 

144 will not be overridden by this method if ``overwrite`` is `False`. 

145 This allows explicit values set in external configs to be retained. 

146 """ 

147 # Extract the part of the config we care about updating 

148 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

149 

150 # And the subset of the full config that we can use for reference. 

151 # Do not bother with defaults because we are told this already has 

152 # them. 

153 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

154 

155 # Loop over each datastore config and pass the subsets to the 

156 # child datastores to process. 

157 

158 containerKey = cls.containerKey 

159 for idx, (child, fullChild) in enumerate( 

160 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True) 

161 ): 

162 childConfig = DatastoreConfig(child, mergeDefaults=False) 

163 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

164 datastoreClass = doImportType(fullChildConfig["cls"]) 

165 if not issubclass(datastoreClass, Datastore):  165 ↛ 166 (line 165 didn't jump to line 166, because the condition on line 165 was never true) 

166 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

167 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

168 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

169 

170 # Reattach to parent 

171 datastoreConfig[containerKey, idx] = childConfig 

172 

173 # Reattach modified datastore config to parent 

174 # If this has a datastore key we attach there, otherwise we assume 

175 # this information goes at the top of the config hierarchy. 

176 if DatastoreConfig.component in config: 

177 config[DatastoreConfig.component] = datastoreConfig 

178 else: 

179 config.update(datastoreConfig) 

180 

181 return 

182 

183 def __init__( 

184 self, 

185 config: DatastoreConfig, 

186 bridgeManager: DatastoreRegistryBridgeManager, 

187 datastores: list[Datastore], 

188 ): 

189 super().__init__(config, bridgeManager) 

190 

191 self.datastores = list(datastores) 

192 

193 # Name ourself based on our children 

194 if self.datastores:  194 ↛ 199 (line 194 didn't jump to line 199, because the condition on line 194 was never false) 

195 # We must set the names explicitly 

196 self._names = [d.name for d in self.datastores] 

197 childNames = ",".join(self.names) 

198 else: 

199 childNames = f"(empty@{time.time()})" 

200 self._names = [childNames] 

201 self.name = f"{type(self).__qualname__}[{childNames}]" 

202 

203 # We declare we are ephemeral if all our child datastores declare 

204 # they are ephemeral 

205 self.isEphemeral = all(d.isEphemeral for d in self.datastores) 

206 

207 # per-datastore override constraints 

208 if "datastore_constraints" in self.config: 

209 overrides = self.config["datastore_constraints"] 

210 

211 if len(overrides) != len(self.datastores):  211 ↛ 212 (line 211 didn't jump to line 212, because the condition on line 211 was never true) 

212 raise DatastoreValidationError( 

213 f"Number of registered datastores ({len(self.datastores)})" 

214 " differs from number of constraints overrides" 

215 f" {len(overrides)}" 

216 ) 

217 

218 self.datastoreConstraints = [ 

219 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

220 ] 

221 

222 else: 

223 self.datastoreConstraints = (None,) * len(self.datastores) 

224 

225 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

226 

227 @classmethod 

228 def _create_from_config( 

229 cls, 

230 config: DatastoreConfig, 

231 bridgeManager: DatastoreRegistryBridgeManager, 

232 butlerRoot: ResourcePathExpression | None, 

233 ) -> ChainedDatastore: 

234 # Scan for child datastores and instantiate them with the same registry 

235 datastores = [] 

236 for c in config["datastores"]: 

237 c = DatastoreConfig(c) 

238 datastoreType = doImportType(c["cls"]) 

239 if not issubclass(datastoreType, Datastore):  239 ↛ 240 (line 239 didn't jump to line 240, because the condition on line 239 was never true) 

240 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

241 datastore = datastoreType._create_from_config(c, bridgeManager, butlerRoot=butlerRoot) 

242 log.debug("Creating child datastore %s", datastore.name) 

243 datastores.append(datastore) 

244 

245 return ChainedDatastore(config, bridgeManager, datastores) 

246 

247 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore: 

248 datastores = [ds.clone(bridgeManager) for ds in self.datastores] 

249 return ChainedDatastore(self.config, bridgeManager, datastores) 

250 

251 @property 

252 def names(self) -> tuple[str, ...]: 

253 return tuple(self._names) 

254 

255 @property 

256 def roots(self) -> dict[str, ResourcePath | None]: 

257 # Docstring inherited. 

258 roots = {} 

259 for datastore in self.datastores: 

260 roots.update(datastore.roots) 

261 return roots 

262 

263 def __str__(self) -> str: 

264 chainName = ", ".join(str(ds) for ds in self.datastores) 

265 return chainName 

266 

267 def _set_trust_mode(self, mode: bool) -> None: 

268 for datastore in self.datastores: 

269 datastore._set_trust_mode(mode) 

270 

271 def knows(self, ref: DatasetRef) -> bool: 

272 """Check if the dataset is known to any of the datastores. 

273 

274 Does not check for existence of any artifact. 

275 

276 Parameters 

277 ---------- 

278 ref : `DatasetRef` 

279 Reference to the required dataset. 

280 

281 Returns 

282 ------- 

283 exists : `bool` 

284 `True` if the dataset is known to the datastore. 

285 """ 

286 for datastore in self.datastores: 

287 if datastore.knows(ref): 

288 log.debug("%s known to datastore %s", ref, datastore.name) 

289 return True 

290 return False 

291 

292 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

293 # Docstring inherited from the base class. 

294 refs_known: dict[DatasetRef, bool] = {} 

295 for datastore in self.datastores: 

296 refs_known.update(datastore.knows_these(refs)) 

297 

298 # No need to check in next datastore for refs that are known. 

299 # We only update entries that were initially False. 

300 refs = [ref for ref, known in refs_known.items() if not known] 

301 

302 return refs_known 

303 

304 def mexists( 

305 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

306 ) -> dict[DatasetRef, bool]: 

307 """Check the existence of multiple datasets at once. 

308 

309 Parameters 

310 ---------- 

311 refs : iterable of `DatasetRef` 

312 The datasets to be checked. 

313 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

314 Optional mapping of datastore artifact to existence. Updated by 

315 this method with details of all artifacts tested. Can be `None` 

316 if the caller is not interested. 

317 

318 Returns 

319 ------- 

320 existence : `dict` of [`DatasetRef`, `bool`] 

321 Mapping from dataset to boolean indicating existence in any 

322 of the child datastores. 

323 """ 

324 dataset_existence: dict[DatasetRef, bool] = {} 

325 for datastore in self.datastores: 

326 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

327 

328 # For next datastore no point asking about ones we know 

329 # exist already. No special exemption for ephemeral datastores. 

330 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

331 

332 return dataset_existence 

333 
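
A brief usage sketch of the bulk existence check above; ``chain`` and ``refs`` are hypothetical, and the optional cache dictionary is filled with per-artifact results as each child is queried:

# Hypothetical names: `chain` is a ChainedDatastore, `refs` are resolved DatasetRefs.
artifact_cache: dict = {}  # maps ResourcePath -> bool, updated by the call
existence = chain.mexists(refs, artifact_existence=artifact_cache)
missing = [ref for ref, exists in existence.items() if not exists]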

334 def exists(self, ref: DatasetRef) -> bool: 

335 """Check if the dataset exists in one of the datastores. 

336 

337 Parameters 

338 ---------- 

339 ref : `DatasetRef` 

340 Reference to the required dataset. 

341 

342 Returns 

343 ------- 

344 exists : `bool` 

345 `True` if the entity exists in one of the child datastores. 

346 """ 

347 for datastore in self.datastores: 

348 if datastore.exists(ref): 

349 log.debug("Found %s in datastore %s", ref, datastore.name) 

350 return True 

351 return False 

352 

353 def get( 

354 self, 

355 ref: DatasetRef, 

356 parameters: Mapping[str, Any] | None = None, 

357 storageClass: StorageClass | str | None = None, 

358 ) -> Any: 

359 """Load an InMemoryDataset from the store. 

360 

361 The dataset is returned from the first datastore that has 

362 the dataset. 

363 

364 Parameters 

365 ---------- 

366 ref : `DatasetRef` 

367 Reference to the required Dataset. 

368 parameters : `dict` 

369 `StorageClass`-specific parameters that specify, for example, 

370 a slice of the dataset to be loaded. 

371 storageClass : `StorageClass` or `str`, optional 

372 The storage class to be used to override the Python type 

373 returned by this method. By default the returned type matches 

374 the dataset type definition for this dataset. Specifying a 

375 read `StorageClass` can force a different type to be returned. 

376 This type must be compatible with the original type. 

377 

378 Returns 

379 ------- 

380 inMemoryDataset : `object` 

381 Requested dataset or slice thereof as an InMemoryDataset. 

382 

383 Raises 

384 ------ 

385 FileNotFoundError 

386 Requested dataset can not be retrieved. 

387 TypeError 

388 Return value from formatter has unexpected type. 

389 ValueError 

390 Formatter failed to process the dataset. 

391 """ 

392 for datastore in self.datastores: 

393 try: 

394 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

395 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

396 return inMemoryObject 

397 except FileNotFoundError: 

398 pass 

399 

400 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

401 
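
A minimal read sketch of the fall-through behaviour documented above, assuming hypothetical ``chain`` (a ChainedDatastore) and ``ref`` (a resolved DatasetRef):

# Hypothetical names: `chain` is a ChainedDatastore, `ref` a resolved DatasetRef.
try:
    # Children are tried in configuration order; the first one that does not
    # raise FileNotFoundError supplies the in-memory dataset.
    dataset = chain.get(ref)
except FileNotFoundError:
    dataset = None  # no child datastore holds the dataset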

402 def prepare_get_for_external_client(self, ref: DatasetRef) -> object | None: 

403 datastore = self._get_matching_datastore(ref) 

404 if datastore is None:  404 ↛ 407 (line 404 didn't jump to line 407, because the condition on line 404 was never false) 

405 return None 

406 

407 return datastore.prepare_get_for_external_client(ref) 

408 

409 def _get_matching_datastore(self, ref: DatasetRef) -> Datastore | None: 

410 """Return the first child datastore that owns the specified dataset.""" 

411 for datastore in self.datastores: 

412 if datastore.knows(ref):  412 ↛ 413 (line 412 didn't jump to line 413, because the condition on line 412 was never true) 

413 return datastore 

414 

415 return None 

416 

417 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

418 """Write a InMemoryDataset with a given `DatasetRef` to each 

419 datastore. 

420 

421 The put() to child datastores can fail with 

422 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

423 deemed to have succeeded so long as at least one child datastore 

424 accepted the inMemoryDataset. 

425 

426 Parameters 

427 ---------- 

428 inMemoryDataset : `object` 

429 The dataset to store. 

430 ref : `DatasetRef` 

431 Reference to the associated Dataset. 

432 

433 Raises 

434 ------ 

435 TypeError 

436 Supplied object and storage class are inconsistent. 

437 DatasetTypeNotSupportedError 

438 All datastores reported `DatasetTypeNotSupportedError`. 

439 """ 

440 log.debug("Put %s", ref) 

441 

442 # Confirm that we can accept this dataset 

443 if not self.constraints.isAcceptable(ref): 

444 # Raise rather than use boolean return value. 

445 raise DatasetTypeNotSupportedError( 

446 f"Dataset {ref} has been rejected by this datastore via configuration." 

447 ) 

448 

449 isPermanent = False 

450 nsuccess = 0 

451 npermanent = 0 

452 nephemeral = 0 

453 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

454 if ( 

455 constraints is not None and not constraints.isAcceptable(ref) 

456 ) or not datastore.constraints.isAcceptable(ref): 

457 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

458 continue 

459 

460 if datastore.isEphemeral: 

461 nephemeral += 1 

462 else: 

463 npermanent += 1 

464 try: 

465 datastore.put(inMemoryDataset, ref) 

466 nsuccess += 1 

467 if not datastore.isEphemeral: 

468 isPermanent = True 

469 except DatasetTypeNotSupportedError: 

470 pass 

471 

472 if nsuccess == 0: 

473 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

474 

475 if not isPermanent and npermanent > 0:  475 ↛ 476 (line 475 didn't jump to line 476, because the condition on line 475 was never true) 

476 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

477 

478 if self._transaction is not None: 

479 self._transaction.registerUndo("put", self.remove, ref) 

480 
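
A write sketch for the semantics above (success requires at least one accepting child); ``chain``, ``in_memory`` and ``ref`` are hypothetical names:

from lsst.daf.butler import DatasetTypeNotSupportedError

# Hypothetical names: `chain` is a ChainedDatastore, `in_memory` an object of the
# dataset type's storage class, `ref` a resolved DatasetRef.
try:
    chain.put(in_memory, ref)  # forwarded to every child whose constraints accept `ref`
except DatasetTypeNotSupportedError:
    pass  # raised only when no child datastore accepted the dataset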

481 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

482 # Docstring inherited from base class. 

483 log.debug("Put %s", ref) 

484 

485 # Confirm that we can accept this dataset 

486 if not self.constraints.isAcceptable(ref): 

487 # Raise rather than use boolean return value. 

488 raise DatasetTypeNotSupportedError( 

489 f"Dataset {ref} has been rejected by this datastore via configuration." 

490 ) 

491 

492 isPermanent = False 

493 nsuccess = 0 

494 npermanent = 0 

495 nephemeral = 0 

496 stored_refs: dict[str, DatasetRef] = {} 

497 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

498 if ( 

499 constraints is not None and not constraints.isAcceptable(ref) 

500 ) or not datastore.constraints.isAcceptable(ref): 

501 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

502 continue 

503 

504 if datastore.isEphemeral: 

505 nephemeral += 1 

506 else: 

507 npermanent += 1 

508 try: 

509 stored_ref_map = datastore.put_new(in_memory_dataset, ref) 

510 stored_refs.update(stored_ref_map) 

511 nsuccess += 1 

512 if not datastore.isEphemeral: 

513 isPermanent = True 

514 except DatasetTypeNotSupportedError: 

515 pass 

516 

517 if nsuccess == 0: 

518 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

519 

520 if not isPermanent and npermanent > 0: 

521 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

522 

523 if self._transaction is not None: 

524 self._transaction.registerUndo("put", self.remove, ref) 

525 

526 return stored_refs 

527 

528 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

529 # Docstring inherited from base class. 

530 if transfer != "auto": 

531 return transfer 

532 # Ask each datastore what they think auto means 

533 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

534 

535 # Remove any untranslated "auto" values 

536 transfers.discard(transfer) 

537 

538 if len(transfers) == 1:  538 ↛ 539 (line 538 didn't jump to line 539, because the condition on line 538 was never true) 

539 return transfers.pop() 

540 if not transfers:  540 ↛ 544 (line 540 didn't jump to line 544, because the condition on line 540 was never false) 

541 # Everything reported "auto" 

542 return transfer 

543 

544 raise RuntimeError( 

545 "Chained datastore does not yet support different transfer modes" 

546 f" from 'auto' in each child datastore (wanted {transfers})" 

547 ) 

548 

549 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

550 # Docstring inherited from Datastore._prepIngest. 

551 if transfer is None: 

552 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

553 

554 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

555 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

556 if not acceptable: 

557 log.debug( 

558 "Datastore %s skipping ingest via configuration for refs %s", 

559 name, 

560 ", ".join(str(ref) for ref in dataset.refs), 

561 ) 

562 return False 

563 else: 

564 return True 

565 

566 # Filter down to just datasets the chained datastore's own 

567 # configuration accepts. 

568 okForParent: list[FileDataset] = [ 

569 dataset 

570 for dataset in datasets 

571 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

572 ] 

573 

574 # Iterate over nested datastores and call _prepIngest on each. 

575 # Save the results to a list: 

576 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

577 # ...and remember whether all of the failures are due to 

578 # NotImplementedError being raised. 

579 allFailuresAreNotImplementedError = True 

580 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

581 okForChild: list[FileDataset] 

582 if constraints is not None: 

583 okForChild = [ 

584 dataset 

585 for dataset in okForParent 

586 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

587 ] 

588 else: 

589 okForChild = okForParent 

590 try: 

591 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

592 except NotImplementedError: 

593 log.debug( 

594 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

595 datastore.name, 

596 transfer, 

597 ) 

598 continue 

599 allFailuresAreNotImplementedError = False 

600 if okForChild: 

601 # Do not store for later if a datastore has rejected 

602 # everything. 

603 # Include the source paths if this is a "move". It's clearer 

604 # to find the paths now rather than try to infer how 

605 # each datastore has stored them in the internal prep class. 

606 paths = ( 

607 {ResourcePath(dataset.path, forceDirectory=False) for dataset in okForChild} 

608 if transfer == "move" 

609 else set() 

610 ) 

611 children.append((datastore, prepDataForChild, paths)) 

612 if allFailuresAreNotImplementedError: 

613 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

614 return _IngestPrepData(children=children) 

615 

616 def _finishIngest( 

617 self, 

618 prepData: _IngestPrepData, 

619 *, 

620 transfer: str | None = None, 

621 record_validation_info: bool = True, 

622 ) -> None: 

623 # Docstring inherited from Datastore._finishIngest. 

624 # For "move" we must use "copy" and then delete the input 

625 # data at the end. This has no rollback option if the ingest 

626 # subsequently fails. If there is only one active datastore 

627 # accepting any files we can leave it as "move" 

628 actual_transfer: str | None 

629 if transfer == "move" and len(prepData.children) > 1: 

630 actual_transfer = "copy" 

631 else: 

632 actual_transfer = transfer 

633 to_be_deleted: set[ResourcePath] = set() 

634 for datastore, prepDataForChild, paths in prepData.children: 

635 datastore._finishIngest( 

636 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

637 ) 

638 to_be_deleted.update(paths) 

639 if actual_transfer != transfer: 

640 # These datasets were copied but now need to be deleted. 

641 # This can not be rolled back. 

642 for uri in to_be_deleted: 

643 uri.remove() 

644 
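
A possible ingest call exercising the "move" handling above, assuming the inherited ``Datastore.ingest()`` entry point and hypothetical ``chain``, ``ref`` and file path; with more than one accepting child the chain copies to each and deletes the source afterwards:

from lsst.daf.butler import FileDataset

# Hypothetical names: `chain` is a ChainedDatastore, `ref` a resolved DatasetRef.
dataset = FileDataset(path="/tmp/example.fits", refs=[ref])
chain.ingest(dataset, transfer="move")  # one copy per child, then the source is removed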

645 def getManyURIs( 

646 self, 

647 refs: Iterable[DatasetRef], 

648 predict: bool = False, 

649 allow_missing: bool = False, 

650 ) -> dict[DatasetRef, DatasetRefURIs]: 

651 # Docstring inherited 

652 

653 uris: dict[DatasetRef, DatasetRefURIs] = {} 

654 missing_refs = set(refs) 

655 

656 # If predict is True we don't want to predict a dataset in the first 

657 # datastore if it actually exists in a later datastore, so in that 

658 # case check all datastores with predict=False first, and then try 

659 # again with predict=True. 

660 for p in (False, True) if predict else (False,): 

661 if not missing_refs: 

662 break 

663 for datastore in self.datastores: 

664 try: 

665 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

666 except NotImplementedError: 

667 # some datastores may not implement generating URIs 

668 continue 

669 missing_refs -= got_uris.keys() 

670 uris.update(got_uris) 

671 if not missing_refs: 

672 break 

673 

674 if missing_refs and not allow_missing: 

675 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

676 

677 return uris 

678 

679 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

680 """Return URIs associated with dataset. 

681 

682 Parameters 

683 ---------- 

684 ref : `DatasetRef` 

685 Reference to the required dataset. 

686 predict : `bool`, optional 

687 If the datastore does not know about the dataset, controls whether 

688 it should return a predicted URI or not. 

689 

690 Returns 

691 ------- 

692 uris : `DatasetRefURIs` 

693 The URI to the primary artifact associated with this dataset (if 

694 the dataset was disassembled within the datastore this may be 

695 `None`), and the URIs to any components associated with the dataset 

696 artifact. (can be empty if there are no components). 

697 

698 Notes 

699 ----- 

700 The returned URI is from the first datastore in the list that has 

701 the dataset with preference given to the first dataset coming from 

702 a permanent datastore. If no datastores have the dataset and prediction 

703 is allowed, the predicted URI for the first datastore in the list will 

704 be returned. 

705 """ 

706 log.debug("Requesting URIs for %s", ref) 

707 predictedUri: DatasetRefURIs | None = None 

708 predictedEphemeralUri: DatasetRefURIs | None = None 

709 firstEphemeralUri: DatasetRefURIs | None = None 

710 for datastore in self.datastores: 

711 if datastore.exists(ref): 

712 if not datastore.isEphemeral: 

713 uri = datastore.getURIs(ref) 

714 log.debug("Retrieved non-ephemeral URI: %s", uri) 

715 return uri 

716 elif not firstEphemeralUri: 

717 firstEphemeralUri = datastore.getURIs(ref) 

718 elif predict: 

719 if not predictedUri and not datastore.isEphemeral: 

720 predictedUri = datastore.getURIs(ref, predict) 

721 elif not predictedEphemeralUri and datastore.isEphemeral: 

722 predictedEphemeralUri = datastore.getURIs(ref, predict) 

723 

724 if firstEphemeralUri: 

725 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

726 return firstEphemeralUri 

727 

728 if predictedUri: 

729 log.debug("Retrieved predicted URI: %s", predictedUri) 

730 return predictedUri 

731 

732 if predictedEphemeralUri: 

733 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

734 return predictedEphemeralUri 

735 

736 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 

737 
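
The tuple returned above can be unpacked the same way ``getURI()`` does below; ``chain`` and ``ref`` are hypothetical names:

# Hypothetical names: `chain` is a ChainedDatastore, `ref` a resolved DatasetRef.
primary, components = chain.getURIs(ref, predict=True)
if primary is not None:
    print("primary artifact:", primary)  # predicted URIs carry a "#predicted" fragment
for name, uri in components.items():
    print("component", name, "->", uri)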

738 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

739 """URI to the Dataset. 

740 

741 The returned URI is from the first datastore in the list that has 

742 the dataset with preference given to the first dataset coming from 

743 a permanent datastore. If no datastores have the dataset and prediction 

744 is allowed, the predicted URI for the first datastore in the list will 

745 be returned. 

746 

747 Parameters 

748 ---------- 

749 ref : `DatasetRef` 

750 Reference to the required Dataset. 

751 predict : `bool` 

752 If `True`, allow URIs to be returned of datasets that have not 

753 been written. 

754 

755 Returns 

756 ------- 

757 uri : `lsst.resources.ResourcePath` 

758 URI pointing to the dataset within the datastore. If the 

759 dataset does not exist in the datastore, and if ``predict`` is 

760 `True`, the URI will be a prediction and will include a URI 

761 fragment "#predicted". 

762 

763 Notes 

764 ----- 

765 If the datastore does not have entities that relate well 

766 to the concept of a URI the returned URI string will be 

767 descriptive. The returned URI is not guaranteed to be obtainable. 

768 

769 Raises 

770 ------ 

771 FileNotFoundError 

772 A URI has been requested for a dataset that does not exist and 

773 guessing is not allowed. 

774 RuntimeError 

775 Raised if a request is made for a single URI but multiple URIs 

776 are associated with this dataset. 

777 """ 

778 log.debug("Requesting URI for %s", ref) 

779 primary, components = self.getURIs(ref, predict) 

780 if primary is None or components:  780 ↛ 781 (line 780 didn't jump to line 781, because the condition on line 780 was never true) 

781 raise RuntimeError( 

782 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

783 ) 

784 return primary 

785 

786 def retrieveArtifacts( 

787 self, 

788 refs: Iterable[DatasetRef], 

789 destination: ResourcePath, 

790 transfer: str = "auto", 

791 preserve_path: bool = True, 

792 overwrite: bool = False, 

793 ) -> list[ResourcePath]: 

794 """Retrieve the file artifacts associated with the supplied refs. 

795 

796 Parameters 

797 ---------- 

798 refs : iterable of `DatasetRef` 

799 The datasets for which file artifacts are to be retrieved. 

800 A single ref can result in multiple files. The refs must 

801 be resolved. 

802 destination : `lsst.resources.ResourcePath` 

803 Location to write the file artifacts. 

804 transfer : `str`, optional 

805 Method to use to transfer the artifacts. Must be one of the options 

806 supported by `lsst.resources.ResourcePath.transfer_from()`. 

807 "move" is not allowed. 

808 preserve_path : `bool`, optional 

809 If `True` the full path of the file artifact within the datastore 

810 is preserved. If `False` the final file component of the path 

811 is used. 

812 overwrite : `bool`, optional 

813 If `True` allow transfers to overwrite existing files at the 

814 destination. 

815 

816 Returns 

817 ------- 

818 targets : `list` of `lsst.resources.ResourcePath` 

819 URIs of file artifacts in destination location. Order is not 

820 preserved. 

821 """ 

822 if not destination.isdir(): 

823 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

824 

825 # Using getURIs is not feasible since it becomes difficult to 

826 # determine the path within the datastore later on. For now 

827 # follow the getURIs implementation approach. 

828 

829 pending = set(refs) 

830 

831 # There is a question as to whether an exception should be raised 

832 # early if some of the refs are missing, or whether files should be 

833 # transferred until a problem is hit. Prefer to complain up front. 

834 # Use the datastore integer as primary key. 

835 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

836 

837 for number, datastore in enumerate(self.datastores): 

838 if datastore.isEphemeral: 

839 # In the future we will want to distinguish in-memory from 

840 # caching datastore since using an on-disk local 

841 # cache is exactly what we should be doing. 

842 continue 

843 try: 

844 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

845 except NotImplementedError: 

846 # Some datastores may not support retrieving artifacts 

847 continue 

848 

849 if datastore_refs: 

850 grouped_by_datastore[number] = datastore_refs 

851 

852 # Remove these from the pending list so that we do not bother 

853 # looking for them any more. 

854 pending = pending - datastore_refs 

855 

856 if pending:  856 ↛ 857 (line 856 didn't jump to line 857, because the condition on line 856 was never true) 

857 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

858 

859 # Now do the transfer. 

860 targets: list[ResourcePath] = [] 

861 for number, datastore_refs in grouped_by_datastore.items(): 

862 targets.extend( 

863 self.datastores[number].retrieveArtifacts( 

864 datastore_refs, 

865 destination, 

866 transfer=transfer, 

867 preserve_path=preserve_path, 

868 overwrite=overwrite, 

869 ) 

870 ) 

871 

872 return targets 

873 
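
A retrieval sketch for the method above; ``chain`` and ``refs`` are hypothetical, and the destination must be expressible as a directory:

from lsst.resources import ResourcePath

# Hypothetical names: `chain` is a ChainedDatastore, `refs` are resolved DatasetRefs.
destination = ResourcePath("/tmp/artifact-dump/", forceDirectory=True)
targets = chain.retrieveArtifacts(refs, destination, transfer="copy", preserve_path=True)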

874 def remove(self, ref: DatasetRef) -> None: 

875 """Indicate to the datastore that a dataset can be removed. 

876 

877 The dataset will be removed from each datastore. The dataset is 

878 not required to exist in every child datastore. 

879 

880 Parameters 

881 ---------- 

882 ref : `DatasetRef` 

883 Reference to the required dataset. 

884 

885 Raises 

886 ------ 

887 FileNotFoundError 

888 Attempt to remove a dataset that does not exist. Raised if none 

889 of the child datastores removed the dataset. 

890 """ 

891 log.debug("Removing %s", ref) 

892 self.trash(ref, ignore_errors=False) 

893 self.emptyTrash(ignore_errors=False) 

894 

895 def forget(self, refs: Iterable[DatasetRef]) -> None: 

896 for datastore in tuple(self.datastores): 

897 datastore.forget(refs) 

898 

899 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

900 if isinstance(ref, DatasetRef): 

901 ref_label = str(ref) 

902 else: 

903 ref_label = "bulk datasets" 

904 

905 log.debug("Trashing %s", ref_label) 

906 

907 counter = 0 

908 for datastore in self.datastores: 

909 try: 

910 datastore.trash(ref, ignore_errors=ignore_errors) 

911 counter += 1 

912 except FileNotFoundError: 

913 pass 

914 

915 if counter == 0: 

916 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

917 if ignore_errors:  917 ↛ 918 (line 917 didn't jump to line 918, because the condition on line 917 was never true) 

918 log.warning(err_msg) 

919 else: 

920 raise FileNotFoundError(err_msg) 

921 

922 def emptyTrash(self, ignore_errors: bool = True) -> None: 

923 for datastore in self.datastores: 

924 datastore.emptyTrash(ignore_errors=ignore_errors) 

925 

926 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

927 """Retrieve a dataset from an input `Datastore`, 

928 and store the result in this `Datastore`. 

929 

930 Parameters 

931 ---------- 

932 inputDatastore : `Datastore` 

933 The external `Datastore` from which to retrieve the Dataset. 

934 ref : `DatasetRef` 

935 Reference to the required dataset in the input data store. 

936 

937 Notes 

938 ----- 

939 The dataset is read from ``inputDatastore`` and written with 

940 ``put()`` to every accepting child datastore; this method returns 

941 `None`. 

942 """ 

943 assert inputDatastore is not self # unless we want it for renames? 

944 inMemoryDataset = inputDatastore.get(ref) 

945 self.put(inMemoryDataset, ref) 

946 

947 def validateConfiguration( 

948 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

949 ) -> None: 

950 """Validate some of the configuration for this datastore. 

951 

952 Parameters 

953 ---------- 

954 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

955 Entities to test against this configuration. Can be differing 

956 types. 

957 logFailures : `bool`, optional 

958 If `True`, output a log message for every validation error 

959 detected. 

960 

961 Raises 

962 ------ 

963 DatastoreValidationError 

964 Raised if there is a validation problem with a configuration. 

965 All the problems are reported in a single exception. 

966 

967 Notes 

968 ----- 

969 This method checks each datastore in turn. 

970 """ 

971 # Need to catch each of the datastore outputs and ensure that 

972 # all are tested. 

973 failures = [] 

974 for datastore in self.datastores: 

975 try: 

976 datastore.validateConfiguration(entities, logFailures=logFailures) 

977 except DatastoreValidationError as e: 

978 if logFailures:  978 ↛ 980 (line 978 didn't jump to line 980, because the condition on line 978 was never false) 

979 log.critical("Datastore %s failed validation", datastore.name) 

980 failures.append(f"Datastore {self.name}: {e}") 

981 

982 if failures: 

983 msg = ";\n".join(failures) 

984 raise DatastoreValidationError(msg) 

985 

986 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

987 # Docstring is inherited from base class 

988 failures = [] 

989 for datastore in self.datastores: 

990 try: 

991 datastore.validateKey(lookupKey, entity) 

992 except DatastoreValidationError as e: 

993 failures.append(f"Datastore {self.name}: {e}") 

994 

995 if failures: 

996 msg = ";\n".join(failures) 

997 raise DatastoreValidationError(msg) 

998 

999 def getLookupKeys(self) -> set[LookupKey]: 

1000 # Docstring is inherited from base class 

1001 keys = set() 

1002 for datastore in self.datastores: 

1003 keys.update(datastore.getLookupKeys()) 

1004 

1005 keys.update(self.constraints.getLookupKeys()) 

1006 for p in self.datastoreConstraints: 

1007 if p is not None:  1007 ↛ 1006 (line 1007 didn't jump to line 1006, because the condition on line 1007 was never false) 

1008 keys.update(p.getLookupKeys()) 

1009 

1010 return keys 

1011 

1012 def needs_expanded_data_ids( 

1013 self, 

1014 transfer: str | None, 

1015 entity: DatasetRef | DatasetType | StorageClass | None = None, 

1016 ) -> bool: 

1017 # Docstring inherited. 

1018 # We can't safely use `self.datastoreConstraints` with `entity` to 

1019 # check whether a child datastore would even want to ingest this 

1020 # dataset, because we don't want to filter out datastores that might 

1021 need an expanded data ID based on incomplete information (e.g. we 

1022 # pass a StorageClass, but the constraint dispatches on DatasetType). 

1023 # So we pessimistically check if any datastore would need an expanded 

1024 # data ID for this transfer mode. 

1025 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

1026 

1027 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

1028 # Docstring inherited from the base class. 

1029 

1030 for datastore in self.datastores: 

1031 datastore.import_records(data) 

1032 

1033 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

1034 # Docstring inherited from the base class. 

1035 

1036 all_records: dict[str, DatastoreRecordData] = {} 

1037 

1038 # Merge all sub-datastore records into one structure 

1039 for datastore in self.datastores: 

1040 sub_records = datastore.export_records(refs) 

1041 for name, record_data in sub_records.items(): 

1042 # All datastore names must be unique in a chain. 

1043 if name in all_records:  1043 ↛ 1044 (line 1043 didn't jump to line 1044, because the condition on line 1043 was never true) 

1044 raise ValueError(f"Non-unique datastore name found in datastore {datastore}") 

1045 all_records[name] = record_data 

1046 

1047 return all_records 

1048 

1049 def export( 

1050 self, 

1051 refs: Iterable[DatasetRef], 

1052 *, 

1053 directory: ResourcePathExpression | None = None, 

1054 transfer: str | None = "auto", 

1055 ) -> Iterable[FileDataset]: 

1056 # Docstring inherited from Datastore.export. 

1057 if transfer == "auto" and directory is None: 

1058 transfer = None 

1059 

1060 if transfer is not None and directory is None: 

1061 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

1062 

1063 if transfer == "move": 

1064 raise TypeError("Can not export by moving files out of datastore.") 

1065 

1066 # Exporting from a chain has the potential for a dataset to be 

1067 # in one or more of the datastores in the chain. We only need one 

1068 # of them since we assume the datasets are the same in all (but 

1069 # the file format could be different of course since that is a 

1070 # per-datastore configuration). 

1071 # We also do not know whether any of the datastores in the chain 

1072 # support file export. 

1073 

1074 # Ensure we have an ordered sequence that is not an iterator or set. 

1075 if not isinstance(refs, Sequence): 

1076 refs = list(refs) 

1077 

1078 # If any of the datasets are missing entirely we need to raise early 

1079 # before we try to run the export. This can be a little messy but is 

1080 # better than exporting files from the first datastore and then finding 

1081 that one is missing and is not in the second datastore either. 

1082 known = [datastore.knows_these(refs) for datastore in self.datastores] 

1083 refs_known: set[DatasetRef] = set() 

1084 for known_to_this in known: 

1085 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

1086 missing_count = len(refs) - len(refs_known) 

1087 if missing_count: 

1088 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1089 

1090 # To allow us to slot each result into the right place after 

1091 # asking each datastore, create a dict with the index. 

1092 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1093 

1094 # Presize the final export list. 

1095 exported: list[FileDataset | None] = [None] * len(refs) 

1096 

1097 # The order of the returned dataset has to match the order of the 

1098 # given refs, even if they are all from different datastores. 

1099 for i, datastore in enumerate(self.datastores): 

1100 known_to_this = known[i] 

1101 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1102 

1103 try: 

1104 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1105 except NotImplementedError: 

1106 # Try the next datastore. 

1107 continue 

1108 

1109 for ref, export in zip(filtered, this_export, strict=True): 

1110 # Get the position and also delete it from the list. 

1111 exported[ref_positions.pop(ref)] = export 

1112 

1113 # Every dataset should be accounted for because of the earlier checks 

1114 # but make sure that we did fill all the slots to appease mypy. 

1115 for i, dataset in enumerate(exported): 

1116 if dataset is None:  1116 ↛ 1117 (line 1116 didn't jump to line 1117, because the condition on line 1116 was never true) 

1117 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1118 yield dataset 

1119 
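
An export sketch for the generator above; ``chain`` and ``refs`` are hypothetical names, and the results come back in the same order as ``refs`` even when different children supply them:

# Hypothetical names: `chain` is a ChainedDatastore, `refs` are resolved DatasetRefs.
file_datasets = list(chain.export(refs, directory="/tmp/export", transfer="copy"))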

1120 def transfer_from( 

1121 self, 

1122 source_datastore: Datastore, 

1123 refs: Collection[DatasetRef], 

1124 transfer: str = "auto", 

1125 artifact_existence: dict[ResourcePath, bool] | None = None, 

1126 dry_run: bool = False, 

1127 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1128 # Docstring inherited 

1129 # mypy does not understand "type(self) is not type(source)" 

1130 if isinstance(source_datastore, ChainedDatastore): 

1131 # Both the source and destination are chained datastores. 

1132 source_datastores = tuple(source_datastore.datastores) 

1133 else: 

1134 # The source datastore is different, forward everything to the 

1135 # child datastores. 

1136 source_datastores = (source_datastore,) 

1137 

1138 if not refs:  1138 ↛ 1140 (line 1138 didn't jump to line 1140, because the condition on line 1138 was never true) 

1139 # Nothing to transfer. 

1140 return set(), set() 

1141 

1142 # Need to know the set of all possible refs that could be transferred. 

1143 remaining_refs = set(refs) 

1144 

1145 missing_from_source: set[DatasetRef] | None = None 

1146 all_accepted = set() 

1147 nsuccess = 0 

1148 for source_child in source_datastores: 

1149 # If we are reading from a chained datastore, it's possible that 

1150 # only a subset of the datastores know about the dataset. We can't 

1151 # ask the receiving datastore to copy it when it doesn't exist 

1152 # so we have to filter again based on what the source datastore 

1153 # understands. 

1154 known_to_source = source_child.knows_these(list(refs)) 

1155 

1156 # Need to know that there is a possibility that some of these 

1157 # datasets exist but are unknown to the source datastore if 

1158 # trust is enabled. 

1159 if getattr(source_child, "trustGetRequest", False): 

1160 unknown = [ref for ref, known in known_to_source.items() if not known] 

1161 existence = source_child.mexists(unknown, artifact_existence) 

1162 for ref, exists in existence.items(): 

1163 known_to_source[ref] = exists 

1164 

1165 missing = {ref for ref, known in known_to_source.items() if not known} 

1166 if missing: 

1167 if missing_from_source is None: 

1168 missing_from_source = missing 

1169 else: 

1170 missing_from_source &= missing 

1171 

1172 # Try to transfer from each source datastore to each child 

1173 # datastore. Have to make sure we don't transfer something 

1174 # we've already transferred to this destination on later passes. 

1175 

1176 # Filter the initial list based on the datasets we have 

1177 # not yet transferred. 

1178 these_refs = [] 

1179 for ref in refs: 

1180 if ref in remaining_refs and known_to_source[ref]: 

1181 these_refs.append(ref) 

1182 

1183 if not these_refs: 

1184 # Already transferred all datasets known to this datastore. 

1185 continue 

1186 

1187 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

1188 if constraints is not None:  1188 ↛ 1196 (line 1188 didn't jump to line 1196, because the condition on line 1188 was never false) 

1189 filtered_refs = [] 

1190 for ref in these_refs: 

1191 if constraints.isAcceptable(ref): 

1192 filtered_refs.append(ref) 

1193 else: 

1194 log.debug("Rejecting ref by constraints: %s", ref) 

1195 else: 

1196 filtered_refs = list(these_refs) 

1197 try: 

1198 accepted, _ = datastore.transfer_from( 

1199 source_child, 

1200 filtered_refs, 

1201 transfer, 

1202 artifact_existence, 

1203 dry_run=dry_run, 

1204 ) 

1205 except (TypeError, NotImplementedError): 

1206 # The datastores were incompatible. 

1207 continue 

1208 else: 

1209 nsuccess += 1 

1210 

1211 # Remove the accepted datasets from those remaining. 

1212 remaining_refs = remaining_refs - accepted 

1213 

1214 # Keep track of everything we have accepted. 

1215 all_accepted.update(accepted) 

1216 

1217 if missing_from_source: 

1218 for ref in missing_from_source: 

1219 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1220 

1221 if nsuccess == 0:  1221 ↛ 1222 (line 1221 didn't jump to line 1222, because the condition on line 1221 was never true) 

1222 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1223 

1224 return all_accepted, remaining_refs 

1225 
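
A transfer sketch for the method above; ``source_store``, ``chain`` and ``refs`` are hypothetical names, and the two returned sets separate accepted refs from those no child could take:

# Hypothetical names: `source_store` is another Datastore, `chain` this
# ChainedDatastore, `refs` a collection of resolved DatasetRefs.
accepted, remaining = chain.transfer_from(source_store, refs, transfer="copy")
if remaining:
    print(f"{len(remaining)} dataset(s) were not transferred to any child datastore")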

1226 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

1227 # Docstring inherited from the base class. 

1228 tables: dict[str, DatastoreOpaqueTable] = {} 

1229 for datastore in self.datastores: 

1230 tables.update(datastore.get_opaque_table_definitions()) 

1231 return tables