# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Chained datastore."""

__all__ = ("ChainedDatastore",)

import time
import logging
import warnings
import itertools
from typing import List, Sequence, Optional, Tuple, Any

from lsst.utils import doImport
from lsst.daf.butler import Datastore, DatastoreConfig, DatasetTypeNotSupportedError, \
    DatastoreValidationError, Constraints, FileDataset

log = logging.getLogger(__name__)


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for ChainedDatastore ingest implementation.

    Parameters
    ----------
    children : `list` of `tuple`
        Pairs of `Datastore`, `IngestPrepData` for all child datastores.
    """
    def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]):
        super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children))
        self.children = children


class ChainedDatastore(Datastore):
    """Chained datastore to allow reads from and writes to multiple
    datastores.

    A ChainedDatastore is configured with multiple datastore configurations.
    A ``put()`` is always sent to each datastore. A ``get()``
    operation is sent to each datastore in turn and the first datastore
    to return a valid dataset is used.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. This configuration must include a ``datastores`` field
        as a sequence of datastore configurations. The order in this sequence
        indicates the order to use for read operations.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value. This
        root is sent to each child datastore.

    Notes
    -----
    ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
    mode. It supports `"copy"`, `"symlink"`, `"relsymlink"`
    and `"hardlink"` if and only if all its child datastores do.
    """

    defaultConfigFile = "datastores/chainedDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    containerKey = "datastores"
    """Key to specify where child datastores are configured."""

    datastores: List[Datastore]
    """All the child datastores known to this datastore."""

    datastoreConstraints: Sequence[Optional[Constraints]]
    """Constraints to be applied to each of the child datastores."""

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for child Datastores to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """

        # Extract the part of the config we care about updating
        datastoreConfig = DatastoreConfig(config, mergeDefaults=False)

        # And the subset of the full config that we can use for reference.
        # Do not bother with defaults because we are told this already has
        # them.
        fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)

        # Loop over each datastore config and pass the subsets to the
        # child datastores to process.

        containerKey = cls.containerKey
        for idx, (child, fullChild) in enumerate(zip(datastoreConfig[containerKey],
                                                     fullDatastoreConfig[containerKey])):
            childConfig = DatastoreConfig(child, mergeDefaults=False)
            fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
            datastoreClass = doImport(fullChildConfig["cls"])
            newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
            datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)

            # Reattach to parent
            datastoreConfig[containerKey, idx] = childConfig

        # Reattach modified datastore config to parent
        # If this has a datastore key we attach there, otherwise we assume
        # this information goes at the top of the config hierarchy.
        if DatastoreConfig.component in config:
            config[DatastoreConfig.component] = datastoreConfig
        else:
            config.update(datastoreConfig)

        return

    def __init__(self, config, bridgeManager, butlerRoot=None):
        super().__init__(config, bridgeManager)

        # Scan for child datastores and instantiate them with the same registry
        self.datastores = []
        for c in self.config["datastores"]:
            c = DatastoreConfig(c)
            datastoreType = doImport(c["cls"])
            datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
            log.debug("Creating child datastore %s", datastore.name)
            self.datastores.append(datastore)

        # Name ourself based on our children
        if self.datastores:
            # We must set the names explicitly
            self._names = [d.name for d in self.datastores]
            childNames = ",".join(self.names)
        else:
            childNames = "(empty@{})".format(time.time())
            self._names = [childNames]
        self.name = "{}[{}]".format(type(self).__qualname__, childNames)

        # We declare we are ephemeral if all our child datastores declare
        # they are ephemeral
        isEphemeral = True
        for d in self.datastores:
            if not d.isEphemeral:
                isEphemeral = False
                break
        self.isEphemeral = isEphemeral

        # per-datastore override constraints
        if "datastore_constraints" in self.config:
            overrides = self.config["datastore_constraints"]

            if len(overrides) != len(self.datastores):
                raise DatastoreValidationError(f"Number of registered datastores ({len(self.datastores)})"
                                               " differs from number of constraints overrides"
                                               f" {len(overrides)}")

            self.datastoreConstraints = [Constraints(c.get("constraints"), universe=bridgeManager.universe)
                                         for c in overrides]

        else:
            self.datastoreConstraints = (None,) * len(self.datastores)

        log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))

    @property
    def names(self):
        return self._names

    def __str__(self):
        chainName = ", ".join(str(ds) for ds in self.datastores)
        return chainName

    def exists(self, ref):
        """Check if the dataset exists in one of the datastores.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in one of the child datastores.
        """
        for datastore in self.datastores:
            if datastore.exists(ref):
                log.debug("Found %s in datastore %s", ref, datastore.name)
                return True
        return False

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        The dataset is returned from the first datastore that has
        the dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """

        for datastore in self.datastores:
            try:
                inMemoryObject = datastore.get(ref, parameters)
                log.debug("Found dataset %s in datastore %s", ref, datastore.name)
                return inMemoryObject
            except FileNotFoundError:
                pass

        raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))

    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to each
        datastore.

        The put() to child datastores can fail with
        `DatasetTypeNotSupportedError`. The put() for this datastore will be
        deemed to have succeeded so long as at least one child datastore
        accepted the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            All datastores reported `DatasetTypeNotSupportedError`.
        """
        log.debug("Put %s", ref)

        # Confirm that we can accept this dataset
        if not self.constraints.isAcceptable(ref):
            # Raise rather than use boolean return value.
            raise DatasetTypeNotSupportedError(f"Dataset {ref} has been rejected by this datastore via"
                                               " configuration.")

        isPermanent = False
        nsuccess = 0
        npermanent = 0
        nephemeral = 0
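        # These counters let us detect, after the loop below, a put that no
        # child accepted at all, and a put that only ephemeral datastores
        # accepted (which merits a warning).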

        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            if constraints is not None and not constraints.isAcceptable(ref):
                log.debug("Datastore %s skipping put via configuration for ref %s",
                          datastore.name, ref)
                continue

            if datastore.isEphemeral:
                nephemeral += 1
            else:
                npermanent += 1
            try:
                datastore.put(inMemoryDataset, ref)
                nsuccess += 1
                if not datastore.isEphemeral:
                    isPermanent = True
            except DatasetTypeNotSupportedError:
                pass

        if nsuccess == 0:
            raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")

        if not isPermanent and npermanent > 0:
            warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

        if self._transaction is not None:
            self._transaction.registerUndo('put', self.remove, ref)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring inherited from base class.
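        # Worked example (editor's comment): if every child resolves "auto"
        # to "copy", transfers == {"copy"} and we return "copy"; if children
        # disagree, e.g. {"copy", "symlink"}, the RuntimeError below fires.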

        if transfer != "auto":
            return transfer
        # Ask each datastore what they think auto means
        transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}

        # Remove any untranslated "auto" values
        transfers.discard(transfer)

        if len(transfers) == 1:
            return transfers.pop()
        if not transfers:
            # Everything reported "auto"
            return transfer

        raise RuntimeError("Chained datastore does not yet support different transfer modes"
                           f" from 'auto' in each child datastore (wanted {transfers})")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        if transfer is None or transfer == "move":
            raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.")

        def isDatasetAcceptable(dataset, *, name, constraints):
            acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
            if not acceptable:
                log.debug("Datastore %s skipping ingest via configuration for refs %s",
                          name, ", ".join(str(ref) for ref in dataset.refs))
                return False
            else:
                return True

        # Filter down to just datasets the chained datastore's own
        # configuration accepts.
        okForParent: List[FileDataset] = [dataset for dataset in datasets
                                          if isDatasetAcceptable(dataset, name=self.name,
                                                                 constraints=self.constraints)]

        # Iterate over nested datastores and call _prepIngest on each.
        # Save the results to a list:
        children: List[Tuple[Datastore, Datastore.IngestPrepData]] = []
        # ...and remember whether all of the failures are due to
        # NotImplementedError being raised.
        allFailuresAreNotImplementedError = True
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            okForChild: List[FileDataset]
            if constraints is not None:
                okForChild = [dataset for dataset in okForParent
                              if isDatasetAcceptable(dataset, name=datastore.name,
                                                     constraints=constraints)]
            else:
                okForChild = okForParent
            try:
                prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
            except NotImplementedError:
                log.debug("Skipping ingest for datastore %s because transfer "
                          "mode %s is not supported.", datastore.name, transfer)
                continue
            allFailuresAreNotImplementedError = False
            children.append((datastore, prepDataForChild))
        if allFailuresAreNotImplementedError:
            raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
        return _IngestPrepData(children=children)

    def _finishIngest(self, prepData: _IngestPrepData, *, transfer: Optional[str] = None):
        # Docstring inherited from Datastore._finishIngest.
        for datastore, prepDataForChild in prepData.children:
            datastore._finishIngest(prepDataForChild, transfer=transfer)

    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        The returned URI is from the first datastore in the list that has
        the dataset, with preference given to the first dataset coming from
        a permanent datastore. If no datastores have the dataset and prediction
        is allowed, the predicted URI for the first datastore in the list will
        be returned.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".

        Notes
        -----
        If the datastore does not have entities that relate well
        to the concept of a URI the returned URI string will be
        descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """

        log.debug("Requesting URI for %s", ref)
        predictedUri = None
        predictedEphemeralUri = None
        firstEphemeralUri = None
        for datastore in self.datastores:
            if datastore.exists(ref):
                if not datastore.isEphemeral:
                    uri = datastore.getUri(ref)
                    log.debug("Retrieved URI: %s", uri)
                    return uri
                elif firstEphemeralUri is None:
                    firstEphemeralUri = datastore.getUri(ref)
            elif predict:
                if predictedUri is None and not datastore.isEphemeral:
                    predictedUri = datastore.getUri(ref, predict)
                elif predictedEphemeralUri is None and datastore.isEphemeral:
                    predictedEphemeralUri = datastore.getUri(ref, predict)

        if firstEphemeralUri is not None:
            log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
            return firstEphemeralUri

        if predictedUri is not None:
            log.debug("Retrieved predicted URI: %s", predictedUri)
            return predictedUri

        if predictedEphemeralUri is not None:
            log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
            return predictedEphemeralUri

        raise FileNotFoundError("Dataset {} not in any datastore".format(ref))

    def remove(self, ref):
        """Indicate to the datastore that a dataset can be removed.

        The dataset will be removed from each datastore. The dataset is
        not required to exist in every child datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist. Raised if none
            of the child datastores removed the dataset.
        """
        log.debug("Removing %s", ref)
        self.trash(ref, ignore_errors=False)
        self.emptyTrash(ignore_errors=False)

    def trash(self, ref, ignore_errors=True):
        log.debug("Trashing %s", ref)

        counter = 0
        for datastore in self.datastores:
            try:
                datastore.trash(ref, ignore_errors=ignore_errors)
                counter += 1
            except FileNotFoundError:
                pass

        if counter == 0:
            err_msg = f"Could not mark for removal from any child datastore: {ref}"
            if ignore_errors:
                log.warning(err_msg)
            else:
                raise FileNotFoundError(err_msg)

    def emptyTrash(self, ignore_errors=True):
        for datastore in self.datastores:
            datastore.emptyTrash(ignore_errors=ignore_errors)

    def transfer(self, inputDatastore, ref):
        """Retrieve a dataset from an input `Datastore`,
        and store the result in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        ref : `DatasetRef`
            Reference to the required dataset in the input data store.

        Returns
        -------
        results : `list`
            List containing the return value from the ``put()`` to each
            child datastore.
        """
        assert inputDatastore is not self  # unless we want it for renames?
        inMemoryDataset = inputDatastore.get(ref)
        return [datastore.put(inMemoryDataset, ref) for datastore in self.datastores]

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks each datastore in turn.
        """

        # Need to catch each of the datastore outputs and ensure that
        # all are tested.
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateConfiguration(entities, logFailures=logFailures)
            except DatastoreValidationError as e:
                if logFailures:
                    log.fatal("Datastore %s failed validation", datastore.name)
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateKey(lookupKey, entity)
            except DatastoreValidationError as e:
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self):
        # Docstring is inherited from base class
        keys = set()
        for datastore in self.datastores:
            keys.update(datastore.getLookupKeys())

        keys.update(self.constraints.getLookupKeys())
        for p in self.datastoreConstraints:
            if p is not None:
                keys.update(p.getLookupKeys())

        return keys
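

# ----------------------------------------------------------------------
# Illustrative usage sketch (editor's addition, not part of the module's
# API). It builds a two-element chain directly from a dict, mirroring how
# __init__ consumes the "datastores" container above. The child class
# paths are assumptions based on the daf_butler package layout; real
# repositories normally assemble this configuration from the YAML defaults
# named in ``defaultConfigFile``.
def _exampleChainedDatastore(bridgeManager, butlerRoot):
    """Hypothetical helper: chain an in-memory cache in front of a POSIX
    datastore. get() tries the in-memory store first; put() goes to both."""
    config = DatastoreConfig({
        "cls": "lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore",
        "datastores": [
            # Order matters: get() consults these in sequence.
            {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"},
            {"cls": "lsst.daf.butler.datastores.posixDatastore.PosixDatastore",
             "root": butlerRoot},
        ],
    })
    return ChainedDatastore(config, bridgeManager, butlerRoot=butlerRoot)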