# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Chained datastore."""

__all__ = ("ChainedDatastore",)

import time
import logging
import warnings
import itertools
from typing import (
    TYPE_CHECKING,
    Any,
    List,
    Iterable,
    Mapping,
    Optional,
    Sequence,
    Set,
    Tuple,
    Union,
)

from lsst.utils import doImport
from lsst.daf.butler import Datastore, DatastoreConfig, DatasetTypeNotSupportedError, \
    DatastoreValidationError, Constraints, FileDataset

if TYPE_CHECKING:
    from lsst.daf.butler import Config, DatasetRef, DatasetType, LookupKey, StorageClass
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for ChainedDatastore ingest implementation.

    Parameters
    ----------
    children : `list` of `tuple`
        Pairs of `Datastore`, `IngestPrepData` for all child datastores.
    """
    def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]):
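        # Flatten the refs from every child's prep data so the combined prep
        # data covers each dataset that at least one child will ingest.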

        super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children))
        self.children = children


class ChainedDatastore(Datastore):
    """Chained Datastores to allow reads and writes from multiple datastores.

    A ChainedDatastore is configured with multiple datastore configurations.
    A ``put()`` is always sent to each datastore. A ``get()``
    operation is sent to each datastore in turn and the first datastore
    to return a valid dataset is used.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. This configuration must include a ``datastores`` field
        as a sequence of datastore configurations. The order in this sequence
        indicates the order to use for read operations.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value. This
        root is sent to each child datastore.

    Notes
    -----
    ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
    mode. It supports `"copy"`, `"symlink"`, `"relsymlink"`
    and `"hardlink"` if and only if all its child datastores do.
    """
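    # A minimal usage sketch (``chained`` standing in for a configured
    # ChainedDatastore instance and ``ref`` for a resolved DatasetRef; the
    # names are illustrative only):
    #
    #     chained.put(obj, ref)      # offered to every child datastore
    #     obj = chained.get(ref)     # served by the first child that has it
    #     uri = chained.getUri(ref)  # permanent children preferred over ephemeral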

    defaultConfigFile = "datastores/chainedDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    containerKey = "datastores"
    """Key to specify where child datastores are configured."""

    datastores: List[Datastore]
    """All the child datastores known to this datastore."""

    datastoreConstraints: Sequence[Optional[Constraints]]
    """Constraints to be applied to each of the child datastores."""

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for child Datastores to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """

        # Extract the part of the config we care about updating
        datastoreConfig = DatastoreConfig(config, mergeDefaults=False)

        # And the subset of the full config that we can use for reference.
        # Do not bother with defaults because we are told this already has
        # them.
        fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)

        # Loop over each datastore config and pass the subsets to the
        # child datastores to process.

        containerKey = cls.containerKey
        for idx, (child, fullChild) in enumerate(zip(datastoreConfig[containerKey],
                                                     fullDatastoreConfig[containerKey])):
            childConfig = DatastoreConfig(child, mergeDefaults=False)
            fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
            datastoreClass = doImport(fullChildConfig["cls"])
            newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
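            # The child root is derived from the parent root plus the child's
            # class name and position, e.g. "<root>/FileDatastore_0" for a
            # first child of class FileDatastore (class name illustrative).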

            datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)

            # Reattach to parent
            datastoreConfig[containerKey, idx] = childConfig

        # Reattach modified datastore config to parent
        # If this has a datastore key we attach there, otherwise we assume
        # this information goes at the top of the config hierarchy.
        if DatastoreConfig.component in config:
            config[DatastoreConfig.component] = datastoreConfig
        else:
            config.update(datastoreConfig)

        return

    def __init__(self, config: Union[Config, str], bridgeManager: DatastoreRegistryBridgeManager,
                 butlerRoot: str = None):
        super().__init__(config, bridgeManager)

        # Scan for child datastores and instantiate them with the same registry
        self.datastores = []
        for c in self.config["datastores"]:
            c = DatastoreConfig(c)
            datastoreType = doImport(c["cls"])
            datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
            log.debug("Creating child datastore %s", datastore.name)
            self.datastores.append(datastore)

        # Name ourself based on our children
        if self.datastores:
            # We must set the names explicitly
            self._names = [d.name for d in self.datastores]
            childNames = ",".join(self.names)
        else:
            childNames = "(empty@{})".format(time.time())
            self._names = [childNames]
        self.name = "{}[{}]".format(type(self).__qualname__, childNames)

        # We declare we are ephemeral if all our child datastores declare
        # they are ephemeral
        isEphemeral = True
        for d in self.datastores:
            if not d.isEphemeral:
                isEphemeral = False
                break
        self.isEphemeral = isEphemeral

        # per-datastore override constraints
        if "datastore_constraints" in self.config:
            overrides = self.config["datastore_constraints"]
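            # The overrides are expected to parallel the "datastores" list,
            # one entry per child; an entry's "constraints" section, if
            # present, applies only to the child at the same position.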

            if len(overrides) != len(self.datastores):
                raise DatastoreValidationError(f"Number of registered datastores ({len(self.datastores)})"
                                               " differs from number of constraints overrides"
                                               f" {len(overrides)}")

            self.datastoreConstraints = [Constraints(c.get("constraints"), universe=bridgeManager.universe)
                                         for c in overrides]

        else:
            self.datastoreConstraints = (None,) * len(self.datastores)

        log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))

    @property
    def names(self) -> Tuple[str, ...]:
        return tuple(self._names)

    def __str__(self) -> str:
        chainName = ", ".join(str(ds) for ds in self.datastores)
        return chainName

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in one of the datastores.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in one of the child datastores.
        """
        for datastore in self.datastores:
            if datastore.exists(ref):
                log.debug("Found %s in datastore %s", ref, datastore.name)
                return True
        return False

    def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
        """Load an InMemoryDataset from the store.

        The dataset is returned from the first datastore that has
        the dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """

        for datastore in self.datastores:
            try:
                inMemoryObject = datastore.get(ref, parameters)
                log.debug("Found dataset %s in datastore %s", ref, datastore.name)
                return inMemoryObject
            except FileNotFoundError:
                pass

        raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))

    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to each
        datastore.

        The put() to child datastores can fail with
        `DatasetTypeNotSupportedError`. The put() for this datastore will be
        deemed to have succeeded so long as at least one child datastore
        accepted the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            All datastores reported `DatasetTypeNotSupportedError`.
        """
        log.debug("Put %s", ref)

        # Confirm that we can accept this dataset
        if not self.constraints.isAcceptable(ref):
            # Raise rather than use boolean return value.
            raise DatasetTypeNotSupportedError(f"Dataset {ref} has been rejected by this datastore via"
                                               " configuration.")

        isPermanent = False
        nsuccess = 0
        npermanent = 0
        nephemeral = 0
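        # Track how many permanent and ephemeral children were offered the
        # dataset so we can warn below if only ephemeral stores accepted it.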

        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            if constraints is not None and not constraints.isAcceptable(ref):
                log.debug("Datastore %s skipping put via configuration for ref %s",
                          datastore.name, ref)
                continue

            if datastore.isEphemeral:
                nephemeral += 1
            else:
                npermanent += 1
            try:
                datastore.put(inMemoryDataset, ref)
                nsuccess += 1
                if not datastore.isEphemeral:
                    isPermanent = True
            except DatasetTypeNotSupportedError:
                pass

        if nsuccess == 0:
            raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")

        if not isPermanent and npermanent > 0:
            warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

        if self._transaction is not None:
            self._transaction.registerUndo('put', self.remove, ref)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        # Ask each datastore what they think auto means
        transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}

        # Remove any untranslated "auto" values
        transfers.discard(transfer)

        if len(transfers) == 1:
            return transfers.pop()
        if not transfers:
            # Everything reported "auto"
            return transfer

        raise RuntimeError("Chained datastore does not yet support different transfer modes"
                           f" from 'auto' in each child datastore (wanted {transfers})")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        if transfer is None or transfer == "move":
            raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.")

        def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
            acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
            if not acceptable:
                log.debug("Datastore %s skipping ingest via configuration for refs %s",
                          name, ", ".join(str(ref) for ref in dataset.refs))
                return False
            else:
                return True

        # Filter down to just datasets the chained datastore's own
        # configuration accepts.
        okForParent: List[FileDataset] = [dataset for dataset in datasets
                                          if isDatasetAcceptable(dataset, name=self.name,
                                                                 constraints=self.constraints)]

        # Iterate over nested datastores and call _prepIngest on each.
        # Save the results to a list:
        children: List[Tuple[Datastore, Datastore.IngestPrepData]] = []
        # ...and remember whether all of the failures are due to
        # NotImplementedError being raised.
        allFailuresAreNotImplementedError = True
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            okForChild: List[FileDataset]
            if constraints is not None:
                okForChild = [dataset for dataset in okForParent
                              if isDatasetAcceptable(dataset, name=datastore.name,
                                                     constraints=constraints)]
            else:
                okForChild = okForParent
            try:
                prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
            except NotImplementedError:
                log.debug("Skipping ingest for datastore %s because transfer "
                          "mode %s is not supported.", datastore.name, transfer)
                continue
            allFailuresAreNotImplementedError = False
            children.append((datastore, prepDataForChild))
        if allFailuresAreNotImplementedError:
            raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
        return _IngestPrepData(children=children)

    def _finishIngest(self, prepData: _IngestPrepData, *, transfer: Optional[str] = None) -> None:
        # Docstring inherited from Datastore._finishIngest.
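        # Forward each child's prep data; children that could not handle this
        # transfer mode were already skipped in _prepIngest.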

        for datastore, prepDataForChild in prepData.children:
            datastore._finishIngest(prepDataForChild, transfer=transfer)

    def getUri(self, ref: DatasetRef, predict: bool = False) -> str:
        """URI to the Dataset.

        The returned URI comes from the first datastore in the list that has
        the dataset, with preference given to permanent datastores over
        ephemeral ones. If no datastore has the dataset and prediction is
        allowed, a predicted URI is returned, again preferring a permanent
        datastore over an ephemeral one.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".

        Notes
        -----
        If the datastore does not have entities that relate well
        to the concept of a URI the returned URI string will be
        descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """
        log.debug("Requesting URI for %s", ref)
        predictedUri: Optional[str] = None
        predictedEphemeralUri: Optional[str] = None
        firstEphemeralUri: Optional[str] = None
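        # Precedence: an existing dataset in a permanent child wins outright;
        # otherwise fall back to an existing ephemeral copy, then a predicted
        # permanent URI, then a predicted ephemeral URI.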

        for datastore in self.datastores:
            if datastore.exists(ref):
                if not datastore.isEphemeral:
                    uri = datastore.getUri(ref)
                    log.debug("Retrieved non-ephemeral URI: %s", uri)
                    return uri
                elif firstEphemeralUri is None:
                    firstEphemeralUri = datastore.getUri(ref)
            elif predict:
                if predictedUri is None and not datastore.isEphemeral:
                    predictedUri = datastore.getUri(ref, predict)
                elif predictedEphemeralUri is None and datastore.isEphemeral:
                    predictedEphemeralUri = datastore.getUri(ref, predict)

        if firstEphemeralUri is not None:
            log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
            return firstEphemeralUri

        if predictedUri is not None:
            log.debug("Retrieved predicted URI: %s", predictedUri)
            return predictedUri

        if predictedEphemeralUri is not None:
            log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
            return predictedEphemeralUri

        raise FileNotFoundError("Dataset {} not in any datastore".format(ref))

    def remove(self, ref: DatasetRef) -> None:
        """Indicate to the datastore that a dataset can be removed.

        The dataset will be removed from each datastore. The dataset is
        not required to exist in every child datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist. Raised if none
            of the child datastores removed the dataset.
        """
        log.debug(f"Removing {ref}")
        self.trash(ref, ignore_errors=False)
        self.emptyTrash(ignore_errors=False)

    def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
        log.debug("Trashing %s", ref)

        counter = 0
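        # Count the children that accepted the trash request; if none did,
        # the dataset is unknown to every child datastore.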

        for datastore in self.datastores:
            try:
                datastore.trash(ref, ignore_errors=ignore_errors)
                counter += 1
            except FileNotFoundError:
                pass

        if counter == 0:
            err_msg = f"Could not mark for removal from any child datastore: {ref}"
            if ignore_errors:
                log.warning(err_msg)
            else:
                raise FileNotFoundError(err_msg)

    def emptyTrash(self, ignore_errors: bool = True) -> None:
        for datastore in self.datastores:
            datastore.emptyTrash(ignore_errors=ignore_errors)

    def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
        """Retrieve a dataset from an input `Datastore`,
        and store the result in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        ref : `DatasetRef`
            Reference to the required dataset in the input data store.
        """
        assert inputDatastore is not self  # unless we want it for renames?
        inMemoryDataset = inputDatastore.get(ref)
        self.put(inMemoryDataset, ref)

    def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
                              logFailures: bool = False) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks each datastore in turn.
        """

        # Need to catch each of the datastore outputs and ensure that
        # all are tested.
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateConfiguration(entities, logFailures=logFailures)
            except DatastoreValidationError as e:
                if logFailures:
                    log.fatal("Datastore %s failed validation", datastore.name)
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def validateKey(self, lookupKey: LookupKey,
                    entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
        # Docstring is inherited from base class
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateKey(lookupKey, entity)
            except DatastoreValidationError as e:
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self) -> Set[LookupKey]:
        # Docstring is inherited from base class
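        # Combine the lookup keys from every child with this datastore's own
        # constraints and any per-child constraint overrides.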

        keys = set()
        for datastore in self.datastores:
            keys.update(datastore.getLookupKeys())

        keys.update(self.constraints.getLookupKeys())
        for p in self.datastoreConstraints:
            if p is not None:
                keys.update(p.getLookupKeys())

        return keys