Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 91%

283 statements  

coverage.py v6.5.0, created at 2022-12-01 19:54 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Chained datastore.""" 

25 

26__all__ = ("ChainedDatastore",) 

27 

28import time 

29import logging 

30import warnings 

31import itertools 

32from typing import ( 

33 TYPE_CHECKING, 

34 Any, 

35 Dict, 

36 List, 

37 Iterable, 

38 Mapping, 

39 Optional, 

40 Sequence, 

41 Set, 

42 Tuple, 

43 Union, 

44) 

45 

46from lsst.utils import doImport 

47from lsst.daf.butler import ButlerURI, Datastore, DatastoreConfig, DatasetTypeNotSupportedError, \ 

48 DatastoreValidationError, Constraints, FileDataset, DatasetRef 

49 

50if TYPE_CHECKING:  # coverage: 50 ↛ 51 (condition on line 50 was never true)

51 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass 

52 from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager 

53 

54log = logging.getLogger(__name__) 

55 

56 

57class _IngestPrepData(Datastore.IngestPrepData): 

58 """Helper class for ChainedDatastore ingest implementation. 

59 

60 Parameters 

61 ---------- 

62 children : `list` of `tuple` 

63 Pairs of `Datastore`, `IngestPrepData` for all child datastores. 

64 """ 

65 def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]): 

66 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children)) 

67 self.children = children 

68 

69 

70class ChainedDatastore(Datastore): 

71 """Chained Datastores to allow read and writes from multiple datastores. 

72 

73 A ChainedDatastore is configured with multiple datastore configurations. 

74 A ``put()`` is sent to each child datastore that accepts the dataset. A ``get()`` 

75 operation is sent to each datastore in turn and the first datastore 

76 to return a valid dataset is used. 

77 

78 Parameters 

79 ---------- 

80 config : `DatastoreConfig` or `str` 

81 Configuration. This configuration must include a ``datastores`` field 

82 as a sequence of datastore configurations. The order in this sequence 

83 indicates the order to use for read operations. 

84 bridgeManager : `DatastoreRegistryBridgeManager` 

85 Object that manages the interface between `Registry` and datastores. 

86 butlerRoot : `str`, optional 

87 New datastore root to use to override the configuration value. This 

88 root is sent to each child datastore. 

89 

90 Notes 

91 ----- 

92 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer 

93 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` 

94 and `"hardlink"` if and only if all its child datastores do. 

95 """ 

96 
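A minimal, self-contained sketch of the read/write semantics described above: a put fans out to every child, while a get walks the chain in configuration order and returns the first hit. ``FakeChild`` and the ref string are made-up stand-ins, not daf_butler classes.

    class FakeChild:
        """Stand-in for a child datastore: a dict keyed by dataset ref."""

        def __init__(self, name):
            self.name = name
            self.store = {}

        def put(self, obj, ref):
            self.store[ref] = obj

        def get(self, ref):
            if ref not in self.store:
                raise FileNotFoundError(ref)
            return self.store[ref]


    children = [FakeChild("cache"), FakeChild("disk")]

    # A put is sent to every child in the chain.
    for child in children:
        child.put({"pixels": []}, "calexp/visit=1")

    # A get tries each child in order and returns the first success.
    for child in children:
        try:
            value = child.get("calexp/visit=1")
            print("found in", child.name)
            break
        except FileNotFoundError:
            continue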

97 defaultConfigFile = "datastores/chainedDatastore.yaml" 

98 """Path to configuration defaults. Accessed within the ``configs`` resource 

99 or relative to a search path. Can be None if no defaults specified. 

100 """ 

101 

102 containerKey = "datastores" 

103 """Key to specify where child datastores are configured.""" 

104 

105 datastores: List[Datastore] 

106 """All the child datastores known to this datastore.""" 

107 

108 datastoreConstraints: Sequence[Optional[Constraints]] 

109 """Constraints to be applied to each of the child datastores.""" 

110 

111 @classmethod 

112 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

113 """Set any filesystem-dependent config options for child Datastores to 

114 be appropriate for a new empty repository with the given root. 

115 

116 Parameters 

117 ---------- 

118 root : `str` 

119 Filesystem path to the root of the data repository. 

120 config : `Config` 

121 A `Config` to update. Only the subset understood by 

122 this component will be updated. Will not expand 

123 defaults. 

124 full : `Config` 

125 A complete config with all defaults expanded that can be 

126 converted to a `DatastoreConfig`. Read-only and will not be 

127 modified by this method. 

128 Repository-specific options that should not be obtained 

129 from defaults when Butler instances are constructed 

130 should be copied from ``full`` to ``config``. 

131 overwrite : `bool`, optional 

132 If `False`, do not modify a value in ``config`` if the value 

133 already exists. Default is always to overwrite with the provided 

134 ``root``. 

135 

136 Notes 

137 ----- 

138 If a keyword is explicitly defined in the supplied ``config`` it 

139 will not be overridden by this method if ``overwrite`` is `False`. 

140 This allows explicit values set in external configs to be retained. 

141 """ 

142 

143 # Extract the part of the config we care about updating 

144 datastoreConfig = DatastoreConfig(config, mergeDefaults=False) 

145 

146 # And the subset of the full config that we can use for reference. 

147 # Do not bother with defaults because we are told this already has 

148 # them. 

149 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False) 

150 

151 # Loop over each datastore config and pass the subsets to the 

152 # child datastores to process. 

153 

154 containerKey = cls.containerKey 

155 for idx, (child, fullChild) in enumerate(zip(datastoreConfig[containerKey], 

156 fullDatastoreConfig[containerKey])): 

157 childConfig = DatastoreConfig(child, mergeDefaults=False) 

158 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False) 

159 datastoreClass = doImport(fullChildConfig["cls"]) 

160 newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx) 

161 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite) 

162 

163 # Reattach to parent 

164 datastoreConfig[containerKey, idx] = childConfig 

165 

166 # Reattach modified datastore config to parent 

167 # If this has a datastore key we attach there, otherwise we assume 

168 # this information goes at the top of the config hierarchy. 

169 if DatastoreConfig.component in config: 

170 config[DatastoreConfig.component] = datastoreConfig 

171 else: 

172 config.update(datastoreConfig) 

173 

174 return 

175 
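For illustration, the per-child roots produced by the format string in this method, using an assumed repository root and assumed child class names:

    root = "/repo"                                            # hypothetical repository root
    child_classes = ["PosixDatastore", "InMemoryDatastore"]   # hypothetical children

    for idx, qualname in enumerate(child_classes):
        newroot = "{}/{}_{}".format(root, qualname, idx)
        print(newroot)
    # /repo/PosixDatastore_0
    # /repo/InMemoryDatastore_1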

176 def __init__(self, config: Union[Config, str], bridgeManager: DatastoreRegistryBridgeManager, 

177 butlerRoot: Optional[str] = None): 

178 super().__init__(config, bridgeManager) 

179 

180 # Scan for child datastores and instantiate them with the same registry 

181 self.datastores = [] 

182 for c in self.config["datastores"]: 

183 c = DatastoreConfig(c) 

184 datastoreType = doImport(c["cls"]) 

185 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot) 

186 log.debug("Creating child datastore %s", datastore.name) 

187 self.datastores.append(datastore) 

188 

189 # Name ourself based on our children 

190 if self.datastores:  # coverage: 190 ↛ 195 (condition on line 190 was never false)

191 # We must set the names explicitly 

192 self._names = [d.name for d in self.datastores] 

193 childNames = ",".join(self.names) 

194 else: 

195 childNames = "(empty@{})".format(time.time()) 

196 self._names = [childNames] 

197 self.name = "{}[{}]".format(type(self).__qualname__, childNames) 

198 

199 # We declare we are ephemeral if all our child datastores declare 

200 # they are ephemeral 

201 isEphemeral = True 

202 for d in self.datastores: 

203 if not d.isEphemeral: 

204 isEphemeral = False 

205 break 

206 self.isEphemeral = isEphemeral 

207 

208 # per-datastore override constraints 

209 if "datastore_constraints" in self.config: 

210 overrides = self.config["datastore_constraints"] 

211 

212 if len(overrides) != len(self.datastores):  # coverage: 212 ↛ 213 (condition on line 212 was never true)

213 raise DatastoreValidationError(f"Number of registered datastores ({len(self.datastores)})" 

214 " differs from number of constraints overrides" 

215 f" {len(overrides)}") 

216 

217 self.datastoreConstraints = [Constraints(c.get("constraints"), universe=bridgeManager.universe) 

218 for c in overrides] 

219 

220 else: 

221 self.datastoreConstraints = (None,) * len(self.datastores) 

222 

223 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) 

224 
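A small sketch of the naming scheme assembled in this constructor, with made-up child names:

    child_names = ["FileDatastore@/repo", "InMemoryDatastore@12345"]   # made up
    name = "{}[{}]".format("ChainedDatastore", ",".join(child_names))
    print(name)   # ChainedDatastore[FileDatastore@/repo,InMemoryDatastore@12345]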

225 @property 

226 def names(self) -> Tuple[str, ...]: 

227 return tuple(self._names) 

228 

229 def __str__(self) -> str: 

230 chainName = ", ".join(str(ds) for ds in self.datastores) 

231 return chainName 

232 

233 def knows(self, ref: DatasetRef) -> bool: 

234 """Check if the dataset is known to any of the datastores. 

235 

236 Does not check for existence of any artifact. 

237 

238 Parameters 

239 ---------- 

240 ref : `DatasetRef` 

241 Reference to the required dataset. 

242 

243 Returns 

244 ------- 

245 exists : `bool` 

246 `True` if the dataset is known to the datastore. 

247 """ 

248 for datastore in self.datastores: 

249 if datastore.knows(ref): 

250 log.debug("%s known to datastore %s", ref, datastore.name) 

251 return True 

252 return False 

253 

254 def mexists(self, refs: Iterable[DatasetRef], 

255 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> Dict[DatasetRef, bool]: 

256 """Check the existence of multiple datasets at once. 

257 

258 Parameters 

259 ---------- 

260 refs : iterable of `DatasetRef` 

261 The datasets to be checked. 

262 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional 

263 Mapping of datastore artifact to existence. Updated by this 

264 method with details of all artifacts tested. Can be `None` 

265 if the caller is not interested. 

266 

267 Returns 

268 ------- 

269 existence : `dict` of [`DatasetRef`, `bool`] 

270 Mapping from dataset to boolean indicating existence in any 

271 of the child datastores. 

272 """ 

273 dataset_existence: Dict[DatasetRef, bool] = {} 

274 for datastore in self.datastores: 

275 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence)) 

276 

277 # For next datastore no point asking about ones we know 

278 # exist already. No special exemption for ephemeral datastores. 

279 refs = [ref for ref, exists in dataset_existence.items() if not exists] 

280 

281 return dataset_existence 

282 
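The shrinking-refs loop above, restated as a stand-alone sketch with plain sets in place of child datastores:

    stores = [
        {"ref_a"},            # first child only knows ref_a
        {"ref_a", "ref_b"},   # second child knows both
    ]

    dataset_existence = {}
    refs = ["ref_a", "ref_b"]
    for store in stores:
        dataset_existence.update({ref: ref in store for ref in refs})
        # The next store is only asked about refs that are still missing.
        refs = [ref for ref, exists in dataset_existence.items() if not exists]

    print(dataset_existence)   # {'ref_a': True, 'ref_b': True}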

283 def exists(self, ref: DatasetRef) -> bool: 

284 """Check if the dataset exists in one of the datastores. 

285 

286 Parameters 

287 ---------- 

288 ref : `DatasetRef` 

289 Reference to the required dataset. 

290 

291 Returns 

292 ------- 

293 exists : `bool` 

294 `True` if the entity exists in one of the child datastores. 

295 """ 

296 for datastore in self.datastores: 

297 if datastore.exists(ref): 

298 log.debug("Found %s in datastore %s", ref, datastore.name) 

299 return True 

300 return False 

301 

302 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

303 """Load an InMemoryDataset from the store. 

304 

305 The dataset is returned from the first datastore that has 

306 the dataset. 

307 

308 Parameters 

309 ---------- 

310 ref : `DatasetRef` 

311 Reference to the required Dataset. 

312 parameters : `dict` 

313 `StorageClass`-specific parameters that specify, for example, 

314 a slice of the dataset to be loaded. 

315 

316 Returns 

317 ------- 

318 inMemoryDataset : `object` 

319 Requested dataset or slice thereof as an InMemoryDataset. 

320 

321 Raises 

322 ------ 

323 FileNotFoundError 

324 Requested dataset can not be retrieved. 

325 TypeError 

326 Return value from formatter has unexpected type. 

327 ValueError 

328 Formatter failed to process the dataset. 

329 """ 

330 

331 for datastore in self.datastores: 

332 try: 

333 inMemoryObject = datastore.get(ref, parameters) 

334 log.debug("Found dataset %s in datastore %s", ref, datastore.name) 

335 return inMemoryObject 

336 except FileNotFoundError: 

337 pass 

338 

339 raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref)) 

340 

341 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

342 """Write a InMemoryDataset with a given `DatasetRef` to each 

343 datastore. 

344 

345 The put() to child datastores can fail with 

346 `DatasetTypeNotSupportedError`. The put() for this datastore will be 

347 deemed to have succeeded so long as at least one child datastore 

348 accepted the inMemoryDataset. 

349 

350 Parameters 

351 ---------- 

352 inMemoryDataset : `object` 

353 The dataset to store. 

354 ref : `DatasetRef` 

355 Reference to the associated Dataset. 

356 

357 Raises 

358 ------ 

359 TypeError 

360 Supplied object and storage class are inconsistent. 

361 DatasetTypeNotSupportedError 

362 All datastores reported `DatasetTypeNotSupportedError`. 

363 """ 

364 log.debug("Put %s", ref) 

365 

366 # Confirm that we can accept this dataset 

367 if not self.constraints.isAcceptable(ref): 

368 # Raise rather than use boolean return value. 

369 raise DatasetTypeNotSupportedError(f"Dataset {ref} has been rejected by this datastore via" 

370 " configuration.") 

371 

372 isPermanent = False 

373 nsuccess = 0 

374 npermanent = 0 

375 nephemeral = 0 

376 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

377 if constraints is not None and not constraints.isAcceptable(ref): 

378 log.debug("Datastore %s skipping put via configuration for ref %s", 

379 datastore.name, ref) 

380 continue 

381 

382 if datastore.isEphemeral: 

383 nephemeral += 1 

384 else: 

385 npermanent += 1 

386 try: 

387 datastore.put(inMemoryDataset, ref) 

388 nsuccess += 1 

389 if not datastore.isEphemeral: 

390 isPermanent = True 

391 except DatasetTypeNotSupportedError: 

392 pass 

393 

394 if nsuccess == 0: 

395 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}") 

396 

397 if not isPermanent and npermanent > 0:  # coverage: 397 ↛ 398 (condition on line 397 was never true)

398 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2) 

399 

400 if self._transaction is not None: 

401 self._transaction.registerUndo('put', self.remove, ref) 

402 
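A stand-alone sketch of the bookkeeping in ``put()``: the call succeeds if any child accepted the dataset, and a warning is due when only ephemeral children did. The child outcomes below are invented:

    import warnings

    # (accepted, is_ephemeral) for each hypothetical child datastore.
    children = [(False, False), (True, True)]   # permanent child rejects, ephemeral accepts

    nsuccess = npermanent = 0
    is_permanent = False
    for accepted, ephemeral in children:
        if not ephemeral:
            npermanent += 1
        if accepted:
            nsuccess += 1
            if not ephemeral:
                is_permanent = True

    if nsuccess == 0:
        raise RuntimeError("no child datastore accepted the dataset")
    if not is_permanent and npermanent > 0:
        warnings.warn("put only succeeded in ephemeral datastores")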

403 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]: 

404 # Docstring inherited from base class. 

405 if transfer != "auto": 

406 return transfer 

407 # Ask each datastore what they think auto means 

408 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores} 

409 

410 # Remove any untranslated "auto" values 

411 transfers.discard(transfer) 

412 

413 if len(transfers) == 1:  # coverage: 413 ↛ 414 (condition on line 413 was never true)

414 return transfers.pop() 

415 if not transfers:  # coverage: 415 ↛ 419 (condition on line 415 was never false)

416 # Everything reported "auto" 

417 return transfer 

418 

419 raise RuntimeError("Chained datastore does not yet support different transfer modes" 

420 f" from 'auto' in each child datastore (wanted {transfers})") 

421 
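The resolution of ``transfer="auto"`` reduces to a small set calculation; a sketch with assumed child answers:

    def resolve_auto(child_answers, transfer="auto"):
        """Mimic the set arithmetic above (sketch only)."""
        transfers = set(child_answers)
        transfers.discard(transfer)        # drop children that simply echoed "auto"
        if len(transfers) == 1:
            return transfers.pop()         # every child agreed on one concrete mode
        if not transfers:
            return transfer                # everybody said "auto"
        raise RuntimeError(f"conflicting transfer modes: {transfers}")


    print(resolve_auto(["auto", "copy", "copy"]))   # copy
    print(resolve_auto(["auto", "auto"]))           # auto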

422 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

423 # Docstring inherited from Datastore._prepIngest. 

424 if transfer is None or transfer == "move": 

425 raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.") 

426 

427 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool: 

428 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)] 

429 if not acceptable: 

430 log.debug("Datastore %s skipping ingest via configuration for refs %s", 

431 name, ", ".join(str(ref) for ref in dataset.refs)) 

432 return False 

433 else: 

434 return True 

435 

436 # Filter down to just datasets the chained datastore's own 

437 # configuration accepts. 

438 okForParent: List[FileDataset] = [dataset for dataset in datasets 

439 if isDatasetAcceptable(dataset, name=self.name, 

440 constraints=self.constraints)] 

441 

442 # Iterate over nested datastores and call _prepIngest on each. 

443 # Save the results to a list: 

444 children: List[Tuple[Datastore, Datastore.IngestPrepData]] = [] 

445 # ...and remember whether all of the failures are due to 

446 # NotImplementedError being raised. 

447 allFailuresAreNotImplementedError = True 

448 for datastore, constraints in zip(self.datastores, self.datastoreConstraints): 

449 okForChild: List[FileDataset] 

450 if constraints is not None: 

451 okForChild = [dataset for dataset in okForParent 

452 if isDatasetAcceptable(dataset, name=datastore.name, 

453 constraints=constraints)] 

454 else: 

455 okForChild = okForParent 

456 try: 

457 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer) 

458 except NotImplementedError: 

459 log.debug("Skipping ingest for datastore %s because transfer " 

460 "mode %s is not supported.", datastore.name, transfer) 

461 continue 

462 allFailuresAreNotImplementedError = False 

463 children.append((datastore, prepDataForChild)) 

464 if allFailuresAreNotImplementedError: 

465 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.") 

466 return _IngestPrepData(children=children) 

467 

468 def _finishIngest(self, prepData: _IngestPrepData, *, transfer: Optional[str] = None) -> None: 

469 # Docstring inherited from Datastore._finishIngest. 

470 for datastore, prepDataForChild in prepData.children: 

471 datastore._finishIngest(prepDataForChild, transfer=transfer) 

472 
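A sketch of the ingest dispatch above: datasets are filtered by the parent's constraints, then per child, and a child that raises NotImplementedError for the transfer mode is skipped. All names below are stand-ins:

    def prep_ingest(datasets, children, parent_accepts):
        """Filter per child and skip children that raise NotImplementedError."""
        ok_for_parent = [d for d in datasets if parent_accepts(d)]
        prepared = []
        all_failures_not_implemented = True
        for name, accepts, prep in children:
            ok_for_child = [d for d in ok_for_parent if accepts(d)]
            try:
                prepared.append((name, prep(ok_for_child)))
            except NotImplementedError:
                continue
            all_failures_not_implemented = False
        if all_failures_not_implemented:
            raise NotImplementedError("no child datastore supports this transfer mode")
        return prepared


    def unsupported(datasets):
        raise NotImplementedError


    children = [("files", lambda d: True, list), ("memory", lambda d: True, unsupported)]
    print(prep_ingest(["a", "b"], children, lambda d: d != "b"))   # [('files', ['a'])]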

473 def getURIs(self, ref: DatasetRef, 

474 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

475 """Return URIs associated with dataset. 

476 

477 Parameters 

478 ---------- 

479 ref : `DatasetRef` 

480 Reference to the required dataset. 

481 predict : `bool`, optional 

482 If the datastore does not know about the dataset, should it 

483 return a predicted URI or not? 

484 

485 Returns 

486 ------- 

487 primary : `ButlerURI` 

488 The URI to the primary artifact associated with this dataset. 

489 If the dataset was disassembled within the datastore this 

490 may be `None`. 

491 components : `dict` 

492 URIs to any components associated with the dataset artifact. 

493 Can be empty if there are no components. 

494 

495 Notes 

496 ----- 

497 The returned URI is from the first datastore in the list that has 

498 the dataset with preference given to the first dataset coming from 

499 a permanent datastore. If no datastores have the dataset and prediction 

500 is allowed, the predicted URI for the first datastore in the list will 

501 be returned. 

502 """ 

503 DatastoreURIs = Tuple[Optional[ButlerURI], Dict[str, ButlerURI]] 

504 log.debug("Requesting URIs for %s", ref) 

505 predictedUri: Optional[DatastoreURIs] = None 

506 predictedEphemeralUri: Optional[DatastoreURIs] = None 

507 firstEphemeralUri: Optional[DatastoreURIs] = None 

508 for datastore in self.datastores: 

509 if datastore.exists(ref): 

510 if not datastore.isEphemeral: 

511 uri = datastore.getURIs(ref) 

512 log.debug("Retrieved non-ephemeral URI: %s", uri) 

513 return uri 

514 elif not firstEphemeralUri: 

515 firstEphemeralUri = datastore.getURIs(ref) 

516 elif predict: 

517 if not predictedUri and not datastore.isEphemeral: 

518 predictedUri = datastore.getURIs(ref, predict) 

519 elif not predictedEphemeralUri and datastore.isEphemeral: 

520 predictedEphemeralUri = datastore.getURIs(ref, predict) 

521 

522 if firstEphemeralUri: 

523 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri) 

524 return firstEphemeralUri 

525 

526 if predictedUri: 

527 log.debug("Retrieved predicted URI: %s", predictedUri) 

528 return predictedUri 

529 

530 if predictedEphemeralUri: 

531 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri) 

532 return predictedEphemeralUri 

533 

534 raise FileNotFoundError("Dataset {} not in any datastore".format(ref)) 

535 
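The selection order implemented in ``getURIs()`` (existing permanent, existing ephemeral, predicted permanent, predicted ephemeral) as a stand-alone sketch with invented URIs:

    def pick_uri(candidates, predict=False):
        """Choose a URI the way getURIs() does (sketch; values are made up).

        ``candidates`` holds (exists, is_ephemeral, uri) tuples in
        configuration order.
        """
        first_ephemeral = predicted = predicted_ephemeral = None
        for exists, ephemeral, uri in candidates:
            if exists:
                if not ephemeral:
                    return uri                        # existing permanent copy wins
                first_ephemeral = first_ephemeral or uri
            elif predict:
                if not ephemeral:
                    predicted = predicted or uri
                else:
                    predicted_ephemeral = predicted_ephemeral or uri
        for uri in (first_ephemeral, predicted, predicted_ephemeral):
            if uri:
                return uri
        raise FileNotFoundError("dataset not in any datastore")


    print(pick_uri([(False, False, "file:///repo/a#predicted"),
                    (True, True, "mem://a")], predict=True))      # mem://a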

536 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

537 """URI to the Dataset. 

538 

539 The returned URI is from the first datastore in the list that has 

540 the dataset with preference given to the first dataset coming from 

541 a permanent datastore. If no datastores have the dataset and prediction 

542 is allowed, the predicted URI for the first datastore in the list will 

543 be returned. 

544 

545 Parameters 

546 ---------- 

547 ref : `DatasetRef` 

548 Reference to the required Dataset. 

549 predict : `bool` 

550 If `True`, allow URIs to be returned of datasets that have not 

551 been written. 

552 

553 Returns 

554 ------- 

555 uri : `ButlerURI` 

556 URI pointing to the dataset within the datastore. If the 

557 dataset does not exist in the datastore, and if ``predict`` is 

558 `True`, the URI will be a prediction and will include a URI 

559 fragment "#predicted". 

560 

561 Notes 

562 ----- 

563 If the datastore does not have entities that relate well 

564 to the concept of a URI, the returned URI string will be 

565 descriptive. The returned URI is not guaranteed to be obtainable. 

566 

567 Raises 

568 ------ 

569 FileNotFoundError 

570 A URI has been requested for a dataset that does not exist and 

571 guessing is not allowed. 

572 RuntimeError 

573 Raised if a request is made for a single URI but multiple URIs 

574 are associated with this dataset. 

575 """ 

576 log.debug("Requesting URI for %s", ref) 

577 primary, components = self.getURIs(ref, predict) 

578 if primary is None or components:  # coverage: 578 ↛ 579 (condition on line 578 was never true)

579 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

580 "Use Dataastore.getURIs() instead.") 

581 return primary 

582 

583 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

584 destination: ButlerURI, transfer: str = "auto", 

585 preserve_path: bool = True, 

586 overwrite: bool = False) -> List[ButlerURI]: 

587 """Retrieve the file artifacts associated with the supplied refs. 

588 

589 Parameters 

590 ---------- 

591 refs : iterable of `DatasetRef` 

592 The datasets for which file artifacts are to be retrieved. 

593 A single ref can result in multiple files. The refs must 

594 be resolved. 

595 destination : `ButlerURI` 

596 Location to write the file artifacts. 

597 transfer : `str`, optional 

598 Method to use to transfer the artifacts. Must be one of the options 

599 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

600 preserve_path : `bool`, optional 

601 If `True` the full path of the file artifact within the datastore 

602 is preserved. If `False` the final file component of the path 

603 is used. 

604 overwrite : `bool`, optional 

605 If `True` allow transfers to overwrite existing files at the 

606 destination. 

607 

608 Returns 

609 ------- 

610 targets : `list` of `ButlerURI` 

611 URIs of file artifacts in destination location. Order is not 

612 preserved. 

613 """ 

614 if not destination.isdir():  # coverage: 614 ↛ 615 (condition on line 614 was never true)

615 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

616 

617 # Using getURIs is not feasible since it becomes difficult to 

618 # determine the path within the datastore later on. For now 

619 # follow getURIs implementation approach. 

620 

621 pending = set(refs) 

622 

623 # There is a question as to whether an exception should be raised 

624 # early if some of the refs are missing, or whether files should be 

625 # transferred until a problem is hit. Prefer to complain up front. 

626 # Use the datastore integer as primary key. 

627 grouped_by_datastore: Dict[int, Set[DatasetRef]] = {} 

628 

629 for number, datastore in enumerate(self.datastores): 

630 if datastore.isEphemeral: 

631 # In the future we will want to distinguish in-memory from 

632 # caching datastore since using an on-disk local 

633 # cache is exactly what we should be doing. 

634 continue 

635 datastore_refs = {ref for ref in pending if datastore.exists(ref)} 

636 

637 if datastore_refs: 

638 grouped_by_datastore[number] = datastore_refs 

639 

640 # Remove these from the pending list so that we do not bother 

641 # looking for them any more. 

642 pending = pending - datastore_refs 

643 

644 if pending:  # coverage: 644 ↛ 645 (condition on line 644 was never true)

645 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") 

646 

647 # Now do the transfer. 

648 targets: List[ButlerURI] = [] 

649 for number, datastore_refs in grouped_by_datastore.items(): 

650 targets.extend(self.datastores[number].retrieveArtifacts(datastore_refs, destination, 

651 transfer=transfer, 

652 preserve_path=preserve_path, 

653 overwrite=overwrite)) 

654 

655 return targets 

656 
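A sketch of the grouping step in ``retrieveArtifacts()``: ephemeral children are skipped, each ref is claimed by the first child that has it, and anything left unclaimed raises. Plain sets stand in for datastores:

    stores = [
        ("memory", True, {"ref_a"}),    # ephemeral: skipped even though it has ref_a
        ("disk1", False, {"ref_a"}),
        ("disk2", False, {"ref_b"}),
    ]

    pending = {"ref_a", "ref_b"}
    grouped = {}
    for number, (name, ephemeral, contents) in enumerate(stores):
        if ephemeral:
            continue
        claimed = {ref for ref in pending if ref in contents}
        if claimed:
            grouped[number] = claimed
            pending -= claimed

    if pending:
        raise RuntimeError(f"datasets not found in any datastore: {pending}")
    print(grouped)   # {1: {'ref_a'}, 2: {'ref_b'}}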

657 def remove(self, ref: DatasetRef) -> None: 

658 """Indicate to the datastore that a dataset can be removed. 

659 

660 The dataset will be removed from each datastore. The dataset is 

661 not required to exist in every child datastore. 

662 

663 Parameters 

664 ---------- 

665 ref : `DatasetRef` 

666 Reference to the required dataset. 

667 

668 Raises 

669 ------ 

670 FileNotFoundError 

671 Attempt to remove a dataset that does not exist. Raised if none 

672 of the child datastores removed the dataset. 

673 """ 

674 log.debug("Removing %s", ref) 

675 self.trash(ref, ignore_errors=False) 

676 self.emptyTrash(ignore_errors=False) 

677 

678 def forget(self, refs: Iterable[DatasetRef]) -> None: 

679 for datastore in tuple(self.datastores): 

680 datastore.forget(refs) 

681 

682 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

683 if isinstance(ref, DatasetRef): 

684 ref_label = str(ref) 

685 else: 

686 ref_label = "bulk datasets" 

687 

688 log.debug("Trashing %s", ref_label) 

689 

690 counter = 0 

691 for datastore in self.datastores: 

692 try: 

693 datastore.trash(ref, ignore_errors=ignore_errors) 

694 counter += 1 

695 except FileNotFoundError: 

696 pass 

697 

698 if counter == 0: 

699 err_msg = f"Could not mark for removal from any child datastore: {ref_label}" 

700 if ignore_errors:  # coverage: 700 ↛ 701 (condition on line 700 was never true)

701 log.warning(err_msg) 

702 else: 

703 raise FileNotFoundError(err_msg) 

704 

705 def emptyTrash(self, ignore_errors: bool = True) -> None: 

706 for datastore in self.datastores: 

707 datastore.emptyTrash(ignore_errors=ignore_errors) 

708 

709 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: 

710 """Retrieve a dataset from an input `Datastore`, 

711 and store the result in this `Datastore`. 

712 

713 Parameters 

714 ---------- 

715 inputDatastore : `Datastore` 

716 The external `Datastore` from which to retrieve the Dataset. 

717 ref : `DatasetRef` 

718 Reference to the required dataset in the input data store. 

719 

720 Notes 

721 ----- 

722 The dataset is retrieved from the input datastore with ``get()`` and 

723 then stored with ``put()``, which writes it to every child datastore. 

724 This method returns nothing. 

725 """ 

726 assert inputDatastore is not self # unless we want it for renames? 

727 inMemoryDataset = inputDatastore.get(ref) 

728 self.put(inMemoryDataset, ref) 

729 

730 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

731 logFailures: bool = False) -> None: 

732 """Validate some of the configuration for this datastore. 

733 

734 Parameters 

735 ---------- 

736 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

737 Entities to test against this configuration. Can be differing 

738 types. 

739 logFailures : `bool`, optional 

740 If `True`, output a log message for every validation error 

741 detected. 

742 

743 Raises 

744 ------ 

745 DatastoreValidationError 

746 Raised if there is a validation problem with a configuration. 

747 All the problems are reported in a single exception. 

748 

749 Notes 

750 ----- 

751 This method checks each datastore in turn. 

752 """ 

753 

754 # Need to catch each of the datastore outputs and ensure that 

755 # all are tested. 

756 failures = [] 

757 for datastore in self.datastores: 

758 try: 

759 datastore.validateConfiguration(entities, logFailures=logFailures) 

760 except DatastoreValidationError as e: 

761 if logFailures:  # coverage: 761 ↛ 763 (condition on line 761 was never false)

762 log.critical("Datastore %s failed validation", datastore.name) 

763 failures.append(f"Datastore {self.name}: {e}") 

764 

765 if failures: 

766 msg = ";\n".join(failures) 

767 raise DatastoreValidationError(msg) 

768 

769 def validateKey(self, lookupKey: LookupKey, 

770 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

771 # Docstring is inherited from base class 

772 failures = [] 

773 for datastore in self.datastores: 

774 try: 

775 datastore.validateKey(lookupKey, entity) 

776 except DatastoreValidationError as e: 

777 failures.append(f"Datastore {self.name}: {e}") 

778 

779 if failures: 

780 msg = ";\n".join(failures) 

781 raise DatastoreValidationError(msg) 

782 

783 def getLookupKeys(self) -> Set[LookupKey]: 

784 # Docstring is inherited from base class 

785 keys = set() 

786 for datastore in self.datastores: 

787 keys.update(datastore.getLookupKeys()) 

788 

789 keys.update(self.constraints.getLookupKeys()) 

790 for p in self.datastoreConstraints: 

791 if p is not None:  # coverage: 791 ↛ 792 (condition on line 791 was never true)

792 keys.update(p.getLookupKeys()) 

793 

794 return keys 

795 

796 def needs_expanded_data_ids( 

797 self, 

798 transfer: Optional[str], 

799 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

800 ) -> bool: 

801 # Docstring inherited. 

802 # We can't safely use `self.datastoreConstraints` with `entity` to 

803 # check whether a child datastore would even want to ingest this 

804 # dataset, because we don't want to filter out datastores that might 

805 need an expanded data ID based on incomplete information (e.g. we 

806 # pass a StorageClass, but the constraint dispatches on DatasetType). 

807 # So we pessimistically check if any datastore would need an expanded 

808 # data ID for this transfer mode. 

809 return any(datastore.needs_expanded_data_ids(transfer) for datastore in self.datastores)  # coverage: 809 ↛ exit (the generator expression on line 809 never ran to completion)