Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 86%

474 statements  

coverage.py v7.4.1, created at 2024-02-13 10:56 +0000

1 # This file is part of daf_butler.

2 #

3 # Developed for the LSST Data Management System.

4 # This product includes software developed by the LSST Project

5 # (http://www.lsst.org).

6 # See the COPYRIGHT file at the top-level directory of this distribution

7 # for details of code ownership.

8 #

9 # This software is dual licensed under the GNU General Public License and also

10 # under a 3-clause BSD license. Recipients may choose which of these licenses

11 # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,

12 # respectively. If you choose the GPL option then the following text applies

13 # (but note that there is still no warranty even if you opt for BSD instead):

14 #

15 # This program is free software: you can redistribute it and/or modify

16 # it under the terms of the GNU General Public License as published by

17 # the Free Software Foundation, either version 3 of the License, or

18 # (at your option) any later version.

19 #

20 # This program is distributed in the hope that it will be useful,

21 # but WITHOUT ANY WARRANTY; without even the implied warranty of

22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

23 # GNU General Public License for more details.

24 #

25 # You should have received a copy of the GNU General Public License

26 # along with this program. If not, see <http://www.gnu.org/licenses/>.

27 

28"""Chained datastore.""" 

29 

30 from __future__ import annotations

31 

32 __all__ = ("ChainedDatastore",)

33 

34 import itertools

35 import logging

36 import time

37 import warnings

38 from collections.abc import Collection, Iterable, Mapping, Sequence

39 from typing import TYPE_CHECKING, Any

40 

41 from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, FileDataset

42 from lsst.daf.butler.datastore import (

43 DatasetRefURIs, 

44 Datastore, 

45 DatastoreConfig, 

46 DatastoreOpaqueTable, 

47 DatastoreValidationError, 

48 )

49 from lsst.daf.butler.datastore.constraints import Constraints

50 from lsst.daf.butler.datastore.record_data import DatastoreRecordData

51 from lsst.resources import ResourcePath

52 from lsst.utils import doImportType

53 

54 if TYPE_CHECKING:

55 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

56 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

57 from lsst.resources import ResourcePathExpression 

58 

59 log = logging.getLogger(__name__)

60 

61 

62 class _IngestPrepData(Datastore.IngestPrepData):

63 """Helper class for ChainedDatastore ingest implementation. 

64 

65 Parameters 

66 ---------- 

67 children : `list` of `tuple` 

68 Triples of `Datastore`, `IngestPrepData`, and the `set` of source `ResourcePath` objects for all child datastores.

69 """ 

70 

71 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]): 

72 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children)) 

73 self.children = children 

74 

75 

76 class ChainedDatastore(Datastore):

77 """Chained Datastores to allow read and writes from multiple datastores. 

78 

79 A ChainedDatastore is configured with multiple datastore configurations. 

80 A ``put()`` is always sent to each datastore. A ``get()`` 

81 operation is sent to each datastore in turn and the first datastore 

82 to return a valid dataset is used. 

83 

84 Parameters 

85 ---------- 

86 config : `DatastoreConfig` or `str` 

87 Configuration. This configuration must include a ``datastores`` field 

88 as a sequence of datastore configurations. The order in this sequence 

89 indicates the order to use for read operations. 

90 bridgeManager : `DatastoreRegistryBridgeManager` 

91 Object that manages the interface between `Registry` and datastores. 

92 datastores : `list` [`Datastore`] 

93 All the child datastores known to this datastore. 

94 

95 Notes 

96 ----- 

97 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer 

98 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` 

99 and `"hardlink"` if and only if all its child datastores do. 

100 """ 

101 
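A minimal sketch of the configuration shape described above, assuming two children. The top-level ``cls`` and ``datastores`` keys are the ones this class actually reads; the child class paths and the ``root`` key are illustrative assumptions, and real repositories normally express this in YAML rather than a Python dict:

    chained_config = {
        "cls": "lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore",
        "datastores": [
            # Read order: get() consults the first child first.
            {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"},
            {"cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore", "root": "<butlerRoot>"},
        ],
    }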

102 defaultConfigFile = "datastores/chainedDatastore.yaml" 

103 """Path to configuration defaults. Accessed within the ``configs`` resource 

104 or relative to a search path. Can be None if no defaults specified. 

105 """ 

106 

107 containerKey = "datastores" 

108 """Key to specify where child datastores are configured.""" 

109 

110 datastores: list[Datastore] 

111 """All the child datastores known to this datastore.""" 

112 

113 datastoreConstraints: Sequence[Constraints | None] 

114 """Constraints to be applied to each of the child datastores.""" 

115 

116 @classmethod 

117 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

118 """Set any filesystem-dependent config options for child Datastores to 

119 be appropriate for a new empty repository with the given root. 

120 

121 Parameters 

122 ---------- 

123 root : `str` 

124 Filesystem path to the root of the data repository. 

125 config : `Config` 

126 A `Config` to update. Only the subset understood by 

127 this component will be updated. Will not expand 

128 defaults. 

129 full : `Config` 

130 A complete config with all defaults expanded that can be 

131 converted to a `DatastoreConfig`. Read-only and will not be 

132 modified by this method. 

133 Repository-specific options that should not be obtained 

134 from defaults when Butler instances are constructed 

135 should be copied from ``full`` to ``config``. 

136 overwrite : `bool`, optional 

137 If `False`, do not modify a value in ``config`` if the value 

138 already exists. Default is always to overwrite with the provided 

139 ``root``. 

140 

141 Notes 

142 ----- 

143 If a keyword is explicitly defined in the supplied ``config`` it 

144 will not be overridden by this method if ``overwrite`` is `False`. 

145 This allows explicit values set in external configs to be retained. 

146 """ 

147 # Extract the part of the config we care about updating 

148 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

149 

150 # And the subset of the full config that we can use for reference. 

151 # Do not bother with defaults because we are told this already has 

152 # them. 

153 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

154 

155 # Loop over each datastore config and pass the subsets to the 

156 # child datastores to process. 

157 

158 containerKey = cls.containerKey 

159 for idx, (child, fullChild) in enumerate( 

160 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True) 

161 ): 

162 childConfig = DatastoreConfig(child, mergeDefaults=False) 

163 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

164 datastoreClass = doImportType(fullChildConfig["cls"]) 

165 if not issubclass(datastoreClass, Datastore):    [165 ↛ 166: the condition on line 165 was never true]

166 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore") 

167 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}" 

168 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

169 

170 # Reattach to parent 

171 datastoreConfig[containerKey, idx] = childConfig 

172 

173 # Reattach modified datastore config to parent 

174 # If this has a datastore key we attach there, otherwise we assume 

175 # this information goes at the top of the config hierarchy. 

176 if DatastoreConfig.component in config: 

177 config[DatastoreConfig.component] = datastoreConfig 

178 else: 

179 config.update(datastoreConfig) 

180 

181 return 

182 
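For illustration, the ``newroot`` pattern used above expands like this (a standalone sketch; the child class names are hypothetical):

    root = "/repo"
    child_classes = ["FileDatastore", "FileDatastore"]  # hypothetical children
    newroots = [f"{root}/{name}_{idx}" for idx, name in enumerate(child_classes)]
    # newroots == ["/repo/FileDatastore_0", "/repo/FileDatastore_1"]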

183 def __init__( 

184 self, 

185 config: DatastoreConfig, 

186 bridgeManager: DatastoreRegistryBridgeManager, 

187 datastores: list[Datastore], 

188 ): 

189 super().__init__(config, bridgeManager) 

190 

191 self.datastores = list(datastores) 

192 

193 # Name ourself based on our children 

194 if self.datastores:    [194 ↛ 199: the condition on line 194 was never false]

195 # We must set the names explicitly 

196 self._names = [d.name for d in self.datastores] 

197 childNames = ",".join(self.names) 

198 else: 

199 childNames = f"(empty@{time.time()})" 

200 self._names = [childNames] 

201 self.name = f"{type(self).__qualname__}[{childNames}]" 

202 

203 # We declare we are ephemeral if all our child datastores declare 

204 # they are ephemeral 

205 self.isEphemeral = all(d.isEphemeral for d in self.datastores) 

206 

207 # per-datastore override constraints 

208 if "datastore_constraints" in self.config: 

209 overrides = self.config["datastore_constraints"] 

210 

211 if len(overrides) != len(self.datastores):    [211 ↛ 212: the condition on line 211 was never true]

212 raise DatastoreValidationError( 

213 f"Number of registered datastores ({len(self.datastores)})" 

214 " differs from number of constraints overrides" 

215 f" {len(overrides)}" 

216 ) 

217 

218 self.datastoreConstraints = [ 

219 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides 

220 ] 

221 

222 else: 

223 self.datastoreConstraints = (None,) * len(self.datastores) 

224 

225 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

226 
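The ``datastore_constraints`` override consumed in the constructor above is a sequence with one entry per child, in the same order as ``datastores``; a mismatched length raises `DatastoreValidationError`. A rough sketch of the shape, with the actual constraints payloads elided:

    config_fragment = {
        "datastore_constraints": [
            {"constraints": ...},  # override applied to the first child datastore
            {"constraints": ...},  # override applied to the second child datastore
        ],
    }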

227 @classmethod 

228 def _create_from_config( 

229 cls, 

230 config: DatastoreConfig, 

231 bridgeManager: DatastoreRegistryBridgeManager, 

232 butlerRoot: ResourcePathExpression | None, 

233 ) -> ChainedDatastore: 

234 # Scan for child datastores and instantiate them with the same registry 

235 datastores = [] 

236 for c in config["datastores"]: 

237 c = DatastoreConfig(c) 

238 datastoreType = doImportType(c["cls"]) 

239 if not issubclass(datastoreType, Datastore):    [239 ↛ 240: the condition on line 239 was never true]

240 raise TypeError(f"Imported child class {c['cls']} is not a Datastore") 

241 datastore = datastoreType._create_from_config(c, bridgeManager, butlerRoot=butlerRoot) 

242 log.debug("Creating child datastore %s", datastore.name) 

243 datastores.append(datastore) 

244 

245 return ChainedDatastore(config, bridgeManager, datastores) 

246 

247 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore: 

248 datastores = [ds.clone(bridgeManager) for ds in self.datastores] 

249 return ChainedDatastore(self.config, bridgeManager, datastores) 

250 

251 @property 

252 def names(self) -> tuple[str, ...]: 

253 return tuple(self._names) 

254 

255 @property 

256 def roots(self) -> dict[str, ResourcePath | None]: 

257 # Docstring inherited. 

258 roots = {} 

259 for datastore in self.datastores: 

260 roots.update(datastore.roots) 

261 return roots 

262 

263 def __str__(self) -> str: 

264 chainName = ", ".join(str(ds) for ds in self.datastores) 

265 return chainName 

266 

267 def _set_trust_mode(self, mode: bool) -> None: 

268 for datastore in self.datastores: 

269 datastore._set_trust_mode(mode) 

270 

271 def knows(self, ref: DatasetRef) -> bool: 

272 """Check if the dataset is known to any of the datastores. 

273 

274 Does not check for existence of any artifact. 

275 

276 Parameters 

277 ---------- 

278 ref : `DatasetRef` 

279 Reference to the required dataset. 

280 

281 Returns 

282 ------- 

283 exists : `bool` 

284 `True` if the dataset is known to the datastore. 

285 """ 

286 for datastore in self.datastores: 

287 if datastore.knows(ref): 

288 log.debug("%s known to datastore %s", ref, datastore.name) 

289 return True 

290 return False 

291 

292 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

293 # Docstring inherited from the base class. 

294 refs_known: dict[DatasetRef, bool] = {} 

295 for datastore in self.datastores: 

296 refs_known.update(datastore.knows_these(refs)) 

297 

298 # No need to check in next datastore for refs that are known. 

299 # We only update entries that were initially False. 

300 refs = [ref for ref, known in refs_known.items() if not known] 

301 

302 return refs_known 

303 

304 def mexists( 

305 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

306 ) -> dict[DatasetRef, bool]: 

307 """Check the existence of multiple datasets at once. 

308 

309 Parameters 

310 ---------- 

311 refs : iterable of `DatasetRef` 

312 The datasets to be checked. 

313 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

314 Optional mapping of datastore artifact to existence. Updated by 

315 this method with details of all artifacts tested. Can be `None` 

316 if the caller is not interested. 

317 

318 Returns 

319 ------- 

320 existence : `dict` [`DatasetRef`, `bool`]

321 Mapping from dataset to boolean indicating existence in any 

322 of the child datastores. 

323 """ 

324 dataset_existence: dict[DatasetRef, bool] = {} 

325 for datastore in self.datastores: 

326 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

327 

328 # For next datastore no point asking about ones we know 

329 # exist already. No special exemption for ephemeral datastores. 

330 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

331 

332 return dataset_existence 

333 
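A short usage sketch, assuming ``chained`` is an instance of this class and ``refs`` holds resolved `DatasetRef` objects; sharing one ``artifact_existence`` cache across calls avoids re-checking the same artifacts:

    artifact_cache = {}  # maps ResourcePath -> bool, filled in by mexists()
    existence = chained.mexists(refs, artifact_existence=artifact_cache)
    missing = [ref for ref, found in existence.items() if not found]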

334 def exists(self, ref: DatasetRef) -> bool: 

335 """Check if the dataset exists in one of the datastores. 

336 

337 Parameters 

338 ---------- 

339 ref : `DatasetRef` 

340 Reference to the required dataset. 

341 

342 Returns 

343 ------- 

344 exists : `bool` 

345 `True` if the entity exists in one of the child datastores. 

346 """ 

347 for datastore in self.datastores: 

348 if datastore.exists(ref): 

349 log.debug("Found %s in datastore %s", ref, datastore.name) 

350 return True 

351 return False 

352 

353 def get( 

354 self, 

355 ref: DatasetRef, 

356 parameters: Mapping[str, Any] | None = None, 

357 storageClass: StorageClass | str | None = None, 

358 ) -> Any: 

359 """Load an InMemoryDataset from the store. 

360 

361 The dataset is returned from the first datastore that has 

362 the dataset. 

363 

364 Parameters 

365 ---------- 

366 ref : `DatasetRef` 

367 Reference to the required Dataset. 

368 parameters : `dict` 

369 `StorageClass`-specific parameters that specify, for example, 

370 a slice of the dataset to be loaded. 

371 storageClass : `StorageClass` or `str`, optional 

372 The storage class to be used to override the Python type 

373 returned by this method. By default the returned type matches 

374 the dataset type definition for this dataset. Specifying a 

375 read `StorageClass` can force a different type to be returned. 

376 This type must be compatible with the original type. 

377 

378 Returns 

379 ------- 

380 inMemoryDataset : `object` 

381 Requested dataset or slice thereof as an InMemoryDataset. 

382 

383 Raises 

384 ------ 

385 FileNotFoundError 

386 Requested dataset can not be retrieved. 

387 TypeError 

388 Return value from formatter has unexpected type. 

389 ValueError 

390 Formatter failed to process the dataset. 

391 """ 

392 for datastore in self.datastores: 

393 try: 

394 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass) 

395 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

396 return inMemoryObject 

397 except FileNotFoundError: 

398 pass 

399 

400 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

401 

402 def prepare_get_for_external_client(self, ref: DatasetRef) -> object: 

403 return self._get_matching_datastore(ref).prepare_get_for_external_client(ref) 

404 

405 def _get_matching_datastore(self, ref: DatasetRef) -> Datastore: 

406 """Return the first child datastore that owns the specified dataset.""" 

407 for datastore in self.datastores: 

408 if datastore.knows(ref):    [408 ↛ 409: the condition on line 408 was never true]

409 return datastore 

410 

411 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores") 

412 

413 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

414 """Write a InMemoryDataset with a given `DatasetRef` to each 

415 datastore. 

416 

417 The put() to child datastores can fail with 

418 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

419 deemed to have succeeded so long as at least one child datastore 

420 accepted the inMemoryDataset. 

421 

422 Parameters 

423 ---------- 

424 inMemoryDataset : `object` 

425 The dataset to store. 

426 ref : `DatasetRef` 

427 Reference to the associated Dataset. 

428 

429 Raises 

430 ------ 

431 TypeError 

432 Supplied object and storage class are inconsistent. 

433 DatasetTypeNotSupportedError 

434 All datastores reported `DatasetTypeNotSupportedError`. 

435 """ 

436 log.debug("Put %s", ref) 

437 

438 # Confirm that we can accept this dataset 

439 if not self.constraints.isAcceptable(ref): 

440 # Raise rather than use boolean return value. 

441 raise DatasetTypeNotSupportedError( 

442 f"Dataset {ref} has been rejected by this datastore via configuration." 

443 ) 

444 

445 isPermanent = False 

446 nsuccess = 0 

447 npermanent = 0 

448 nephemeral = 0 

449 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

450 if ( 

451 constraints is not None and not constraints.isAcceptable(ref) 

452 ) or not datastore.constraints.isAcceptable(ref): 

453 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

454 continue 

455 

456 if datastore.isEphemeral: 

457 nephemeral += 1 

458 else: 

459 npermanent += 1 

460 try: 

461 datastore.put(inMemoryDataset, ref) 

462 nsuccess += 1 

463 if not datastore.isEphemeral: 

464 isPermanent = True 

465 except DatasetTypeNotSupportedError: 

466 pass 

467 

468 if nsuccess == 0: 

469 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

470 

471 if not isPermanent and npermanent > 0:    [471 ↛ 472: the condition on line 471 was never true]

472 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

473 

474 if self._transaction is not None: 

475 self._transaction.registerUndo("put", self.remove, ref) 

476 
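A usage sketch of the semantics described above, assuming ``chained``, ``obj`` and ``ref`` already exist; the put only fails outright when nothing accepts the dataset:

    from lsst.daf.butler import DatasetTypeNotSupportedError

    try:
        chained.put(obj, ref)  # fanned out to every child whose constraints accept `ref`
    except DatasetTypeNotSupportedError:
        # Raised when the chain itself, or every child datastore, rejected the dataset.
        ...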

477 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

478 # Docstring inherited from base class. 

479 log.debug("Put %s", ref) 

480 

481 # Confirm that we can accept this dataset 

482 if not self.constraints.isAcceptable(ref): 

483 # Raise rather than use boolean return value. 

484 raise DatasetTypeNotSupportedError( 

485 f"Dataset {ref} has been rejected by this datastore via configuration." 

486 ) 

487 

488 isPermanent = False 

489 nsuccess = 0 

490 npermanent = 0 

491 nephemeral = 0 

492 stored_refs: dict[str, DatasetRef] = {} 

493 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

494 if ( 

495 constraints is not None and not constraints.isAcceptable(ref) 

496 ) or not datastore.constraints.isAcceptable(ref): 

497 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref) 

498 continue 

499 

500 if datastore.isEphemeral: 

501 nephemeral += 1 

502 else: 

503 npermanent += 1 

504 try: 

505 stored_ref_map = datastore.put_new(in_memory_dataset, ref) 

506 stored_refs.update(stored_ref_map) 

507 nsuccess += 1 

508 if not datastore.isEphemeral: 

509 isPermanent = True 

510 except DatasetTypeNotSupportedError: 

511 pass 

512 

513 if nsuccess == 0: 

514 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

515 

516 if not isPermanent and npermanent > 0: 

517 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

518 

519 if self._transaction is not None: 

520 self._transaction.registerUndo("put", self.remove, ref) 

521 

522 return stored_refs 

523 

524 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

525 # Docstring inherited from base class. 

526 if transfer != "auto": 

527 return transfer 

528 # Ask each datastore what they think auto means 

529 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

530 

531 # Remove any untranslated "auto" values 

532 transfers.discard(transfer) 

533 

534 if len(transfers) == 1:    [534 ↛ 535: the condition on line 534 was never true]

535 return transfers.pop() 

536 if not transfers:    [536 ↛ 540: the condition on line 536 was never false]

537 # Everything reported "auto" 

538 return transfer 

539 

540 raise RuntimeError( 

541 "Chained datastore does not yet support different transfer modes" 

542 f" from 'auto' in each child datastore (wanted {transfers})" 

543 ) 

544 

545 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

546 # Docstring inherited from Datastore._prepIngest. 

547 if transfer is None: 

548 raise NotImplementedError("ChainedDatastore does not support transfer=None.") 

549 

550 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

551 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

552 if not acceptable: 

553 log.debug( 

554 "Datastore %s skipping ingest via configuration for refs %s", 

555 name, 

556 ", ".join(str(ref) for ref in dataset.refs), 

557 ) 

558 return False 

559 else: 

560 return True 

561 

562 # Filter down to just datasets the chained datastore's own 

563 # configuration accepts. 

564 okForParent: list[FileDataset] = [ 

565 dataset 

566 for dataset in datasets 

567 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints) 

568 ] 

569 

570 # Iterate over nested datastores and call _prepIngest on each. 

571 # Save the results to a list: 

572 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = [] 

573 # ...and remember whether all of the failures are due to 

574 # NotImplementedError being raised. 

575 allFailuresAreNotImplementedError = True 

576 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

577 okForChild: list[FileDataset] 

578 if constraints is not None: 

579 okForChild = [ 

580 dataset 

581 for dataset in okForParent 

582 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints) 

583 ] 

584 else: 

585 okForChild = okForParent 

586 try: 

587 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

588 except NotImplementedError: 

589 log.debug( 

590 "Skipping ingest for datastore %s because transfer mode %s is not supported.", 

591 datastore.name, 

592 transfer, 

593 ) 

594 continue 

595 allFailuresAreNotImplementedError = False 

596 if okForChild: 

597 # Do not store for later if a datastore has rejected 

598 # everything. 

599 # Include the source paths if this is a "move". It's clearer 

600 # to find the paths now rather than try to infer how 

601 # each datastore has stored them in the internal prep class. 

602 paths = ( 

603 {ResourcePath(dataset.path, forceDirectory=False) for dataset in okForChild} 

604 if transfer == "move" 

605 else set() 

606 ) 

607 children.append((datastore, prepDataForChild, paths)) 

608 if allFailuresAreNotImplementedError: 

609 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

610 return _IngestPrepData(children=children) 

611 

612 def _finishIngest( 

613 self, 

614 prepData: _IngestPrepData, 

615 *, 

616 transfer: str | None = None, 

617 record_validation_info: bool = True, 

618 ) -> None: 

619 # Docstring inherited from Datastore._finishIngest. 

620 # For "move" we must use "copy" and then delete the input 

621 # data at the end. This has no rollback option if the ingest 

622 # subsequently fails. If there is only one active datastore 

623 # accepting any files we can leave it as "move" 

624 actual_transfer: str | None 

625 if transfer == "move" and len(prepData.children) > 1: 

626 actual_transfer = "copy" 

627 else: 

628 actual_transfer = transfer 

629 to_be_deleted: set[ResourcePath] = set() 

630 for datastore, prepDataForChild, paths in prepData.children: 

631 datastore._finishIngest( 

632 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info 

633 ) 

634 to_be_deleted.update(paths) 

635 if actual_transfer != transfer: 

636 # These datasets were copied but now need to be deleted. 

637 # This can not be rolled back. 

638 for uri in to_be_deleted: 

639 uri.remove() 

640 

641 def getManyURIs( 

642 self, 

643 refs: Iterable[DatasetRef], 

644 predict: bool = False, 

645 allow_missing: bool = False, 

646 ) -> dict[DatasetRef, DatasetRefURIs]: 

647 # Docstring inherited 

648 

649 uris: dict[DatasetRef, DatasetRefURIs] = {} 

650 missing_refs = set(refs) 

651 

652 # If predict is True we don't want to predict a dataset in the first 

653 # datastore if it actually exists in a later datastore, so in that 

654 # case check all datastores with predict=False first, and then try 

655 # again with predict=True. 

656 for p in (False, True) if predict else (False,): 

657 if not missing_refs: 

658 break 

659 for datastore in self.datastores: 

660 try: 

661 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True) 

662 except NotImplementedError: 

663 # some datastores may not implement generating URIs 

664 continue 

665 missing_refs -= got_uris.keys() 

666 uris.update(got_uris) 

667 if not missing_refs: 

668 break 

669 

670 if missing_refs and not allow_missing: 

671 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.") 

672 

673 return uris 

674 
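A usage sketch, assuming ``chained`` and ``refs`` as before: existing locations always win over predictions, so with ``predict=True`` only datasets absent from every child come back with predicted URIs:

    uris = chained.getManyURIs(refs, predict=True, allow_missing=False)
    for ref, location in uris.items():
        primary, components = location  # a DatasetRefURIs unpacks to (primary, components)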

675 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

676 """Return URIs associated with dataset. 

677 

678 Parameters 

679 ---------- 

680 ref : `DatasetRef` 

681 Reference to the required dataset. 

682 predict : `bool`, optional 

683 If the datastore does not know about the dataset, controls whether 

684 it should return a predicted URI or not. 

685 

686 Returns 

687 ------- 

688 uris : `DatasetRefURIs` 

689 The URI to the primary artifact associated with this dataset (if 

690 the dataset was disassembled within the datastore this may be 

691 `None`), and the URIs to any components associated with the dataset 

692 artifact (can be empty if there are no components).

693 

694 Notes 

695 ----- 

696 The returned URI is from the first datastore in the list that has 

697 the dataset with preference given to the first dataset coming from 

698 a permanent datastore. If no datastores have the dataset and prediction 

699 is allowed, the predicted URI for the first datastore in the list will 

700 be returned. 

701 """ 

702 log.debug("Requesting URIs for %s", ref) 

703 predictedUri: DatasetRefURIs | None = None 

704 predictedEphemeralUri: DatasetRefURIs | None = None 

705 firstEphemeralUri: DatasetRefURIs | None = None 

706 for datastore in self.datastores: 

707 if datastore.exists(ref): 

708 if not datastore.isEphemeral: 

709 uri = datastore.getURIs(ref) 

710 log.debug("Retrieved non-ephemeral URI: %s", uri) 

711 return uri 

712 elif not firstEphemeralUri: 

713 firstEphemeralUri = datastore.getURIs(ref) 

714 elif predict: 

715 if not predictedUri and not datastore.isEphemeral: 

716 predictedUri = datastore.getURIs(ref, predict) 

717 elif not predictedEphemeralUri and datastore.isEphemeral: 

718 predictedEphemeralUri = datastore.getURIs(ref, predict) 

719 

720 if firstEphemeralUri: 

721 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

722 return firstEphemeralUri 

723 

724 if predictedUri: 

725 log.debug("Retrieved predicted URI: %s", predictedUri) 

726 return predictedUri 

727 

728 if predictedEphemeralUri: 

729 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

730 return predictedEphemeralUri 

731 

732 raise FileNotFoundError(f"Dataset {ref} not in any datastore") 

733 

734 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

735 """URI to the Dataset. 

736 

737 The returned URI is from the first datastore in the list that has 

738 the dataset with preference given to the first dataset coming from 

739 a permanent datastore. If no datastores have the dataset and prediction 

740 is allowed, the predicted URI for the first datastore in the list will 

741 be returned. 

742 

743 Parameters 

744 ---------- 

745 ref : `DatasetRef` 

746 Reference to the required Dataset. 

747 predict : `bool` 

748 If `True`, allow URIs to be returned of datasets that have not 

749 been written. 

750 

751 Returns 

752 ------- 

753 uri : `lsst.resources.ResourcePath` 

754 URI pointing to the dataset within the datastore. If the 

755 dataset does not exist in the datastore, and if ``predict`` is 

756 `True`, the URI will be a prediction and will include a URI 

757 fragment "#predicted". 

758 

759 Notes 

760 ----- 

761 If the datastore does not have entities that relate well 

762 to the concept of a URI the returned URI string will be 

763 descriptive. The returned URI is not guaranteed to be obtainable. 

764 

765 Raises 

766 ------ 

767 FileNotFoundError 

768 A URI has been requested for a dataset that does not exist and 

769 guessing is not allowed. 

770 RuntimeError 

771 Raised if a request is made for a single URI but multiple URIs 

772 are associated with this dataset. 

773 """ 

774 log.debug("Requesting URI for %s", ref) 

775 primary, components = self.getURIs(ref, predict) 

776 if primary is None or components:    [776 ↛ 777: the condition on line 776 was never true]

777 raise RuntimeError( 

778 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

779 ) 

780 return primary 

781 

782 def retrieveArtifacts( 

783 self, 

784 refs: Iterable[DatasetRef], 

785 destination: ResourcePath, 

786 transfer: str = "auto", 

787 preserve_path: bool = True, 

788 overwrite: bool = False, 

789 ) -> list[ResourcePath]: 

790 """Retrieve the file artifacts associated with the supplied refs. 

791 

792 Parameters 

793 ---------- 

794 refs : iterable of `DatasetRef` 

795 The datasets for which file artifacts are to be retrieved. 

796 A single ref can result in multiple files. The refs must 

797 be resolved. 

798 destination : `lsst.resources.ResourcePath` 

799 Location to write the file artifacts. 

800 transfer : `str`, optional 

801 Method to use to transfer the artifacts. Must be one of the options 

802 supported by `lsst.resources.ResourcePath.transfer_from()`. 

803 "move" is not allowed. 

804 preserve_path : `bool`, optional 

805 If `True` the full path of the file artifact within the datastore 

806 is preserved. If `False` the final file component of the path 

807 is used. 

808 overwrite : `bool`, optional 

809 If `True` allow transfers to overwrite existing files at the 

810 destination. 

811 

812 Returns 

813 ------- 

814 targets : `list` of `lsst.resources.ResourcePath` 

815 URIs of file artifacts in destination location. Order is not 

816 preserved. 

817 """ 

818 if not destination.isdir():    [818 ↛ 819: the condition on line 818 was never true]

819 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

820 

821 # Using getURIs is not feasible since it becomes difficult to 

822 # determine the path within the datastore later on. For now 

823 # follow getURIs implementation approach. 

824 

825 pending = set(refs) 

826 

827 # There is a question as to whether an exception should be raised 

828 # early if some of the refs are missing, or whether files should be 

829 # transferred until a problem is hit. Prefer to complain up front. 

830 # Use the datastore integer as primary key. 

831 grouped_by_datastore: dict[int, set[DatasetRef]] = {} 

832 

833 for number, datastore in enumerate(self.datastores): 

834 if datastore.isEphemeral: 

835 # In the future we will want to distinguish in-memory from 

836 # caching datastore since using an on-disk local 

837 # cache is exactly what we should be doing. 

838 continue 

839 try: 

840 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

841 except NotImplementedError: 

842 # Some datastores may not support retrieving artifacts 

843 continue 

844 

845 if datastore_refs: 

846 grouped_by_datastore[number] = datastore_refs 

847 

848 # Remove these from the pending list so that we do not bother 

849 # looking for them any more. 

850 pending = pending - datastore_refs 

851 

852 if pending:    [852 ↛ 853: the condition on line 852 was never true]

853 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

854 

855 # Now do the transfer. 

856 targets: list[ResourcePath] = [] 

857 for number, datastore_refs in grouped_by_datastore.items(): 

858 targets.extend( 

859 self.datastores[number].retrieveArtifacts( 

860 datastore_refs, 

861 destination, 

862 transfer=transfer, 

863 preserve_path=preserve_path, 

864 overwrite=overwrite, 

865 ) 

866 ) 

867 

868 return targets 

869 
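A usage sketch, assuming ``chained`` and ``refs`` as before; the destination must be a directory-like `ResourcePath` and, as noted above, ``"move"`` is not an allowed transfer mode:

    from lsst.resources import ResourcePath

    destination = ResourcePath("/tmp/artifact_export/", forceDirectory=True)
    copied = chained.retrieveArtifacts(refs, destination, transfer="copy", preserve_path=True)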

870 def remove(self, ref: DatasetRef) -> None: 

871 """Indicate to the datastore that a dataset can be removed. 

872 

873 The dataset will be removed from each datastore. The dataset is 

874 not required to exist in every child datastore. 

875 

876 Parameters 

877 ---------- 

878 ref : `DatasetRef` 

879 Reference to the required dataset. 

880 

881 Raises 

882 ------ 

883 FileNotFoundError 

884 Attempt to remove a dataset that does not exist. Raised if none 

885 of the child datastores removed the dataset. 

886 """ 

887 log.debug("Removing %s", ref) 

888 self.trash(ref, ignore_errors=False) 

889 self.emptyTrash(ignore_errors=False) 

890 

891 def forget(self, refs: Iterable[DatasetRef]) -> None: 

892 for datastore in tuple(self.datastores): 

893 datastore.forget(refs) 

894 

895 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

896 if isinstance(ref, DatasetRef): 

897 ref_label = str(ref) 

898 else: 

899 ref_label = "bulk datasets" 

900 

901 log.debug("Trashing %s", ref_label) 

902 

903 counter = 0 

904 for datastore in self.datastores: 

905 try: 

906 datastore.trash(ref, ignore_errors=ignore_errors) 

907 counter += 1 

908 except FileNotFoundError: 

909 pass 

910 

911 if counter == 0: 

912 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

913 if ignore_errors:    [913 ↛ 914: the condition on line 913 was never true]

914 log.warning(err_msg) 

915 else: 

916 raise FileNotFoundError(err_msg) 

917 

918 def emptyTrash(self, ignore_errors: bool = True) -> None: 

919 for datastore in self.datastores: 

920 datastore.emptyTrash(ignore_errors=ignore_errors) 

921 

922 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

923 """Retrieve a dataset from an input `Datastore`, 

924 and store the result in this `Datastore`. 

925 

926 Parameters 

927 ---------- 

928 inputDatastore : `Datastore` 

929 The external `Datastore` from which to retrieve the Dataset.

930 ref : `DatasetRef` 

931 Reference to the required dataset in the input data store. 

932 

933 Returns 

934 ------- 

935 None

936 The dataset is retrieved from ``inputDatastore`` and stored in

937 each accepting child datastore via ``put()``.

938 """ 

939 assert inputDatastore is not self # unless we want it for renames? 

940 inMemoryDataset = inputDatastore.get(ref) 

941 self.put(inMemoryDataset, ref) 

942 

943 def validateConfiguration( 

944 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

945 ) -> None: 

946 """Validate some of the configuration for this datastore. 

947 

948 Parameters 

949 ---------- 

950 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

951 Entities to test against this configuration. Can be differing 

952 types. 

953 logFailures : `bool`, optional 

954 If `True`, output a log message for every validation error 

955 detected. 

956 

957 Raises 

958 ------ 

959 DatastoreValidationError 

960 Raised if there is a validation problem with a configuration. 

961 All the problems are reported in a single exception. 

962 

963 Notes 

964 ----- 

965 This method checks each datastore in turn. 

966 """ 

967 # Need to catch each of the datastore outputs and ensure that 

968 # all are tested. 

969 failures = [] 

970 for datastore in self.datastores: 

971 try: 

972 datastore.validateConfiguration(entities, logFailures=logFailures) 

973 except DatastoreValidationError as e: 

974 if logFailures:    [974 ↛ 976: the condition on line 974 was never false]

975 log.critical("Datastore %s failed validation", datastore.name) 

976 failures.append(f"Datastore {self.name}: {e}") 

977 

978 if failures: 

979 msg = ";\n".join(failures) 

980 raise DatastoreValidationError(msg) 

981 

982 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

983 # Docstring is inherited from base class 

984 failures = [] 

985 for datastore in self.datastores: 

986 try: 

987 datastore.validateKey(lookupKey, entity) 

988 except DatastoreValidationError as e: 

989 failures.append(f"Datastore {self.name}: {e}") 

990 

991 if failures: 

992 msg = ";\n".join(failures) 

993 raise DatastoreValidationError(msg) 

994 

995 def getLookupKeys(self) -> set[LookupKey]: 

996 # Docstring is inherited from base class 

997 keys = set() 

998 for datastore in self.datastores: 

999 keys.update(datastore.getLookupKeys()) 

1000 

1001 keys.update(self.constraints.getLookupKeys()) 

1002 for p in self.datastoreConstraints: 

1003 if p is not None:    [1003 ↛ 1002: the condition on line 1003 was never false]

1004 keys.update(p.getLookupKeys()) 

1005 

1006 return keys 

1007 

1008 def needs_expanded_data_ids( 

1009 self, 

1010 transfer: str | None, 

1011 entity: DatasetRef | DatasetType | StorageClass | None = None, 

1012 ) -> bool: 

1013 # Docstring inherited. 

1014 # We can't safely use `self.datastoreConstraints` with `entity` to 

1015 # check whether a child datastore would even want to ingest this 

1016 # dataset, because we don't want to filter out datastores that might 

1017 need an expanded data ID based on incomplete information (e.g. we

1018 # pass a StorageClass, but the constraint dispatches on DatasetType). 

1019 # So we pessimistically check if any datastore would need an expanded 

1020 # data ID for this transfer mode. 

1021 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores) 

1022 

1023 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

1024 # Docstring inherited from the base class. 

1025 

1026 for datastore in self.datastores: 

1027 datastore.import_records(data) 

1028 

1029 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

1030 # Docstring inherited from the base class. 

1031 

1032 all_records: dict[str, DatastoreRecordData] = {} 

1033 

1034 # Merge all sub-datastore records into one structure 

1035 for datastore in self.datastores: 

1036 sub_records = datastore.export_records(refs) 

1037 for name, record_data in sub_records.items(): 

1038 # All datastore names must be unique in a chain. 

1039 if name in all_records:    [1039 ↛ 1040: the condition on line 1039 was never true]

1040 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")

1041 all_records[name] = record_data 

1042 

1043 return all_records 

1044 

1045 def export( 

1046 self, 

1047 refs: Iterable[DatasetRef], 

1048 *, 

1049 directory: ResourcePathExpression | None = None, 

1050 transfer: str | None = "auto", 

1051 ) -> Iterable[FileDataset]: 

1052 # Docstring inherited from Datastore.export. 

1053 if transfer == "auto" and directory is None: 

1054 transfer = None 

1055 

1056 if transfer is not None and directory is None: 

1057 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

1058 

1059 if transfer == "move": 

1060 raise TypeError("Can not export by moving files out of datastore.") 

1061 

1062 # Exporting from a chain has the potential for a dataset to be 

1063 # in one or more of the datastores in the chain. We only need one 

1064 # of them since we assume the datasets are the same in all (but 

1065 # the file format could be different of course since that is a 

1066 # per-datastore configuration). 

1067 # We also do not know whether any of the datastores in the chain 

1068 # support file export. 

1069 

1070 # Ensure we have an ordered sequence that is not an iterator or set. 

1071 if not isinstance(refs, Sequence): 

1072 refs = list(refs) 

1073 

1074 # If any of the datasets are missing entirely we need to raise early 

1075 # before we try to run the export. This can be a little messy but is 

1076 better than exporting files from the first datastore and only then

1077 discovering that a missing dataset is not in the second datastore either.

1078 known = [datastore.knows_these(refs) for datastore in self.datastores] 

1079 refs_known: set[DatasetRef] = set() 

1080 for known_to_this in known: 

1081 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this}) 

1082 missing_count = len(refs) - len(refs_known) 

1083 if missing_count: 

1084 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}") 

1085 

1086 # To allow us to slot each result into the right place after 

1087 # asking each datastore, create a dict with the index. 

1088 ref_positions = {ref: i for i, ref in enumerate(refs)} 

1089 

1090 # Presize the final export list. 

1091 exported: list[FileDataset | None] = [None] * len(refs) 

1092 

1093 # The order of the returned dataset has to match the order of the 

1094 # given refs, even if they are all from different datastores. 

1095 for i, datastore in enumerate(self.datastores): 

1096 known_to_this = known[i] 

1097 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions] 

1098 

1099 try: 

1100 this_export = datastore.export(filtered, directory=directory, transfer=transfer) 

1101 except NotImplementedError: 

1102 # Try the next datastore. 

1103 continue 

1104 

1105 for ref, export in zip(filtered, this_export, strict=True): 

1106 # Get the position and also delete it from the list. 

1107 exported[ref_positions.pop(ref)] = export 

1108 

1109 # Every dataset should be accounted for because of the earlier checks 

1110 # but make sure that we did fill all the slots to appease mypy. 

1111 for i, dataset in enumerate(exported): 

1112 if dataset is None:    [1112 ↛ 1113: the condition on line 1112 was never true]

1113 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.") 

1114 yield dataset 

1115 
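A usage sketch, assuming ``chained`` and ``refs`` (a list of resolved `DatasetRef` objects) as before; the yielded `FileDataset` entries keep the order of ``refs`` even when different children supply different datasets:

    file_datasets = list(chained.export(refs, directory="/tmp/export", transfer="copy"))
    # One FileDataset per input ref, in the same order as `refs`.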

1116 def transfer_from( 

1117 self, 

1118 source_datastore: Datastore, 

1119 refs: Collection[DatasetRef], 

1120 transfer: str = "auto", 

1121 artifact_existence: dict[ResourcePath, bool] | None = None, 

1122 dry_run: bool = False, 

1123 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

1124 # Docstring inherited 

1125 # mypy does not understand "type(self) is not type(source)" 

1126 if isinstance(source_datastore, ChainedDatastore): 

1127 # Both the source and destination are chained datastores. 

1128 source_datastores = tuple(source_datastore.datastores) 

1129 else: 

1130 # The source datastore is different, forward everything to the 

1131 # child datastores. 

1132 source_datastores = (source_datastore,) 

1133 

1134 if not refs:    [1134 ↛ 1136: the condition on line 1134 was never true]

1135 # Nothing to transfer. 

1136 return set(), set() 

1137 

1138 # Need to know the set of all possible refs that could be transferred. 

1139 remaining_refs = set(refs) 

1140 

1141 missing_from_source: set[DatasetRef] | None = None 

1142 all_accepted = set() 

1143 nsuccess = 0 

1144 for source_child in source_datastores: 

1145 # If we are reading from a chained datastore, it's possible that 

1146 # only a subset of the datastores know about the dataset. We can't 

1147 # ask the receiving datastore to copy it when it doesn't exist 

1148 # so we have to filter again based on what the source datastore 

1149 # understands. 

1150 known_to_source = source_child.knows_these(list(refs)) 

1151 

1152 # Need to know that there is a possibility that some of these 

1153 # datasets exist but are unknown to the source datastore if 

1154 # trust is enabled. 

1155 if getattr(source_child, "trustGetRequest", False): 

1156 unknown = [ref for ref, known in known_to_source.items() if not known] 

1157 existence = source_child.mexists(unknown, artifact_existence) 

1158 for ref, exists in existence.items(): 

1159 known_to_source[ref] = exists 

1160 

1161 missing = {ref for ref, known in known_to_source.items() if not known} 

1162 if missing: 

1163 if missing_from_source is None: 

1164 missing_from_source = missing 

1165 else: 

1166 missing_from_source &= missing 

1167 

1168 # Try to transfer from each source datastore to each child 

1169 # datastore. Have to make sure we don't transfer something 

1170 # we've already transferred to this destination on later passes. 

1171 

1172 # Filter the initial list based on the datasets we have 

1173 # not yet transferred. 

1174 these_refs = [] 

1175 for ref in refs: 

1176 if ref in remaining_refs and known_to_source[ref]: 

1177 these_refs.append(ref) 

1178 

1179 if not these_refs: 

1180 # Already transferred all datasets known to this datastore. 

1181 continue 

1182 

1183 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True): 

1184 if constraints is not None:    [1184 ↛ 1192: the condition on line 1184 was never false]

1185 filtered_refs = [] 

1186 for ref in these_refs: 

1187 if constraints.isAcceptable(ref): 

1188 filtered_refs.append(ref) 

1189 else: 

1190 log.debug("Rejecting ref by constraints: %s", ref) 

1191 else: 

1192 filtered_refs = list(these_refs) 

1193 try: 

1194 accepted, _ = datastore.transfer_from( 

1195 source_child, 

1196 filtered_refs, 

1197 transfer, 

1198 artifact_existence, 

1199 dry_run=dry_run, 

1200 ) 

1201 except (TypeError, NotImplementedError): 

1202 # The datastores were incompatible. 

1203 continue 

1204 else: 

1205 nsuccess += 1 

1206 

1207 # Remove the accepted datasets from those remaining. 

1208 remaining_refs = remaining_refs - accepted 

1209 

1210 # Keep track of everything we have accepted. 

1211 all_accepted.update(accepted) 

1212 

1213 if missing_from_source: 

1214 for ref in missing_from_source: 

1215 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref) 

1216 

1217 if nsuccess == 0:    [1217 ↛ 1218: the condition on line 1217 was never true]

1218 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}") 

1219 

1220 return all_accepted, remaining_refs 

1221 
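A usage sketch, assuming ``source`` is any other `Datastore` (possibly another chain) and ``chained`` is the destination; the two returned sets let the caller confirm nothing was silently dropped:

    accepted, not_transferred = chained.transfer_from(
        source, refs, transfer="auto", artifact_existence={}
    )
    if not_transferred:
        raise RuntimeError(f"{len(not_transferred)} dataset(s) could not be transferred")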

1222 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

1223 # Docstring inherited from the base class. 

1224 tables: dict[str, DatastoreOpaqueTable] = {} 

1225 for datastore in self.datastores: 

1226 tables.update(datastore.get_opaque_table_definitions()) 

1227 return tables