# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Chained datastore."""

__all__ = ("ChainedDatastore",)

import time
import logging
import warnings
import itertools
from typing import List, Sequence, Optional, Tuple

from lsst.utils import doImport
from lsst.daf.butler import Datastore, DatastoreConfig, DatasetTypeNotSupportedError, \
    DatastoreValidationError, Constraints, FileDataset

log = logging.getLogger(__name__)


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for ChainedDatastore ingest implementation.

    Parameters
    ----------
    children : `list` of `tuple`
        Pairs of `Datastore`, `IngestPrepData` for all child datastores.
    """
    def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]):
        super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children))
        self.children = children


class ChainedDatastore(Datastore):
    """Chained datastore to allow reads and writes from multiple datastores.

    A ChainedDatastore is configured with multiple datastore configurations.
    A ``put()`` is always sent to each datastore. A ``get()``
    operation is sent to each datastore in turn and the first datastore
    to return a valid dataset is used.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. This configuration must include a ``datastores`` field
        as a sequence of datastore configurations. The order in this sequence
        indicates the order to use for read operations.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value. This
        root is sent to each child datastore.

    Notes
    -----
    ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
    mode. It supports `"copy"`, `"symlink"`, and `"hardlink"` if and only if
    its child datastores do.
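
    Examples
    --------
    A minimal usage sketch. The configuration file name, ``registry``,
    ``ref``, and ``inMemoryDataset`` are illustrative stand-ins, not
    names defined by this module::

        from lsst.daf.butler import DatastoreConfig

        config = DatastoreConfig("myChainedDatastore.yaml")  # hypothetical file
        datastore = ChainedDatastore(config, registry)

        # A put is attempted on every child datastore; it succeeds if at
        # least one child accepts the dataset.
        datastore.put(inMemoryDataset, ref)

        # A get is served by the first child, in configured order, that
        # holds the dataset.
        obj = datastore.get(ref)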

    """

    defaultConfigFile = "datastores/chainedDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    containerKey = "datastores"
    """Key to specify where child datastores are configured."""

    datastores: List[Datastore]
    """All the child datastores known to this datastore."""

    datastoreConstraints: Sequence[Optional[Constraints]]
    """Constraints to be applied to each of the child datastores."""

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for child Datastores to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
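
        Examples
        --------
        A sketch of how child roots are derived; ``config``,
        ``fullConfig``, and the paths are illustrative::

            ChainedDatastore.setConfigRoot("/repo", config, fullConfig)
            # Child datastore ``idx`` of class ``Cls`` is assigned a root
            # of the form "/repo/Cls_<idx>", e.g. "/repo/PosixDatastore_0".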

        """
        # Extract the part of the config we care about updating.
        datastoreConfig = DatastoreConfig(config, mergeDefaults=False)

        # And the subset of the full config that we can use for reference.
        # Do not bother with defaults because we are told this already has
        # them.
        fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)

        # Loop over each datastore config and pass the subsets to the
        # child datastores to process.
        containerKey = cls.containerKey
        for idx, (child, fullChild) in enumerate(zip(datastoreConfig[containerKey],
                                                     fullDatastoreConfig[containerKey])):
            childConfig = DatastoreConfig(child, mergeDefaults=False)
            fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
            datastoreClass = doImport(fullChildConfig["cls"])
            newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
            datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)

            # Reattach to parent.
            datastoreConfig[containerKey, idx] = childConfig

        # Reattach modified datastore config to parent.
        # If this has a datastore key we attach there, otherwise we assume
        # this information goes at the top of the config hierarchy.
        if DatastoreConfig.component in config:
            config[DatastoreConfig.component] = datastoreConfig
        else:
            config.update(datastoreConfig)

    def __init__(self, config, registry=None, butlerRoot=None):
        super().__init__(config, registry)

        # Scan for child datastores and instantiate them with the same
        # registry.
        self.datastores = []
        for c in self.config["datastores"]:
            c = DatastoreConfig(c)
            datastoreType = doImport(c["cls"])
            datastore = datastoreType(c, registry, butlerRoot=butlerRoot)
            log.debug("Creating child datastore %s", datastore.name)
            self.datastores.append(datastore)

        # Name ourselves based on our children.
        if self.datastores:
            # We must set the names explicitly.
            self._names = [d.name for d in self.datastores]
            childNames = ",".join(self.names)
        else:
            childNames = "(empty@{})".format(time.time())
            self._names = [childNames]
        self.name = "{}[{}]".format(type(self).__qualname__, childNames)

        # We declare we are ephemeral only if all our child datastores
        # declare they are ephemeral.
        isEphemeral = True
        for d in self.datastores:
            if not d.isEphemeral:
                isEphemeral = False
                break
        self.isEphemeral = isEphemeral

        # Per-datastore override constraints.
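        # An illustrative sketch (not a schema) of the expected override
        # configuration: a sequence parallel to ``datastores``, with each
        # entry optionally carrying a ``constraints`` section, e.g.
        #
        #   datastore_constraints:
        #     - constraints:
        #         accept: ["all"]
        #     - constraints:
        #         reject: ["all"]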

        if "datastore_constraints" in self.config:
            overrides = self.config["datastore_constraints"]

            if len(overrides) != len(self.datastores):
                raise DatastoreValidationError(f"Number of registered datastores ({len(self.datastores)})"
                                               " differs from number of constraints overrides"
                                               f" {len(overrides)}")

            self.datastoreConstraints = [Constraints(c.get("constraints"), universe=self.registry.dimensions)
                                         for c in overrides]
        else:
            self.datastoreConstraints = (None,) * len(self.datastores)

        log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))

    @property
    def names(self):
        return self._names

    def __str__(self):
        return ", ".join(str(ds) for ds in self.datastores)

    def exists(self, ref):
        """Check if the dataset exists in one of the datastores.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in one of the child datastores.
        """
        for datastore in self.datastores:
            if datastore.exists(ref):
                log.debug("Found %s in datastore %s", ref, datastore.name)
                return True
        return False

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        The dataset is returned from the first datastore that has
        the dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        for datastore in self.datastores:
            try:
                inMemoryObject = datastore.get(ref, parameters)
                log.debug("Found Dataset %s in datastore %s", ref, datastore.name)
                return inMemoryObject
            except FileNotFoundError:
                pass

        raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))

    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to each
        datastore.

        The put() to child datastores can fail with
        `DatasetTypeNotSupportedError`. The put() for this datastore will be
        deemed to have succeeded so long as at least one child datastore
        accepted the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            All datastores reported `DatasetTypeNotSupportedError`.
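
        Examples
        --------
        A sketch of the fan-out behavior; ``inMemoryDataset`` and ``ref``
        are illustrative::

            # Attempted on every child; children whose constraints reject
            # the ref are skipped, and DatasetTypeNotSupportedError is
            # raised only if no child accepts it.
            datastore.put(inMemoryDataset, ref)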

        """
        log.debug("Put %s", ref)

        # Confirm that we can accept this dataset.
        if not self.constraints.isAcceptable(ref):
            # Raise rather than use boolean return value.
            raise DatasetTypeNotSupportedError(f"Dataset {ref} has been rejected by this datastore via"
                                               " configuration.")

        isPermanent = False
        nsuccess = 0
        npermanent = 0
        nephemeral = 0
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            if constraints is not None and not constraints.isAcceptable(ref):
                log.debug("Datastore %s skipping put via configuration for ref %s",
                          datastore.name, ref)
                continue

            if datastore.isEphemeral:
                nephemeral += 1
            else:
                npermanent += 1
            try:
                datastore.put(inMemoryDataset, ref)
                nsuccess += 1
                if not datastore.isEphemeral:
                    isPermanent = True
            except DatasetTypeNotSupportedError:
                pass

        if nsuccess == 0:
            raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")

        if not isPermanent and npermanent > 0:
            warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

        if self._transaction is not None:
            self._transaction.registerUndo('put', self.remove, ref)

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        if transfer is None or transfer == "move":
            raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.")

        def isDatasetAcceptable(dataset, *, name, constraints):
            acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
            if not acceptable:
                log.debug("Datastore %s skipping ingest via configuration for refs %s",
                          name, ", ".join(str(ref) for ref in dataset.refs))
                return False
            else:
                return True

        # Filter down to just datasets the chained datastore's own
        # configuration accepts.
        okForParent: List[FileDataset] = [dataset for dataset in datasets
                                          if isDatasetAcceptable(dataset, name=self.name,
                                                                 constraints=self.constraints)]

        # Iterate over nested datastores and call _prepIngest on each.
        # Save the results to a list:
        children: List[Tuple[Datastore, Datastore.IngestPrepData]] = []
        # ...and remember whether all of the failures are due to
        # NotImplementedError being raised.
        allFailuresAreNotImplementedError = True
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            # Annotate once here to avoid redefining the annotation in
            # the two branches below.
            okForChild: List[FileDataset]
            if constraints is not None:
                okForChild = [dataset for dataset in okForParent
                              if isDatasetAcceptable(dataset, name=datastore.name,
                                                     constraints=constraints)]
            else:
                okForChild = okForParent
            try:
                prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
            except NotImplementedError:
                log.debug("Skipping ingest for datastore %s because transfer "
                          "mode %s is not supported.", datastore.name, transfer)
                continue
            allFailuresAreNotImplementedError = False
            children.append((datastore, prepDataForChild))
        if allFailuresAreNotImplementedError:
            raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
        return _IngestPrepData(children=children)

    def _finishIngest(self, prepData: _IngestPrepData, *, transfer: Optional[str] = None):
        # Docstring inherited from Datastore._finishIngest.
        for datastore, prepDataForChild in prepData.children:
            datastore._finishIngest(prepDataForChild, transfer=transfer)

    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        The returned URI comes from the first datastore in the list that has
        the dataset, with preference given to permanent datastores over
        ephemeral ones. If no datastore has the dataset and prediction
        is allowed, the predicted URI for the first datastore in the list will
        be returned.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".

        Notes
        -----
        If the datastore does not have entities that relate well
        to the concept of a URI the returned URI string will be
        descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
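
        Examples
        --------
        A sketch of the lookup order; ``ref`` is illustrative::

            uri = datastore.getUri(ref)
            # Prefers an existing copy in a permanent child datastore,
            # then an existing ephemeral copy; with predict=True it falls
            # back to a predicted permanent, then ephemeral, URI.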

        """
        log.debug("Requesting URI for %s", ref)
        predictedUri = None
        predictedEphemeralUri = None
        firstEphemeralUri = None
        # Preference order: existing permanent URI, existing ephemeral
        # URI, predicted permanent URI, predicted ephemeral URI.
        for datastore in self.datastores:
            if datastore.exists(ref):
                if not datastore.isEphemeral:
                    uri = datastore.getUri(ref)
                    log.debug("Retrieved non-ephemeral URI: %s", uri)
                    return uri
                elif firstEphemeralUri is None:
                    firstEphemeralUri = datastore.getUri(ref)
            elif predict:
                if predictedUri is None and not datastore.isEphemeral:
                    predictedUri = datastore.getUri(ref, predict)
                elif predictedEphemeralUri is None and datastore.isEphemeral:
                    predictedEphemeralUri = datastore.getUri(ref, predict)

        if firstEphemeralUri is not None:
            log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
            return firstEphemeralUri

        if predictedUri is not None:
            log.debug("Retrieved predicted URI: %s", predictedUri)
            return predictedUri

        if predictedEphemeralUri is not None:
            log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
            return predictedEphemeralUri

        raise FileNotFoundError("Dataset {} not in any datastore".format(ref))

    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        The dataset will be removed from each datastore. The dataset is
        not required to exist in every child datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist. Raised if none
            of the child datastores removed the dataset.
        """
        log.debug("Removing %s", ref)

        counter = 0
        for datastore in self.datastores:
            try:
                datastore.remove(ref)
                counter += 1
            except FileNotFoundError:
                pass

        if counter == 0:
            raise FileNotFoundError(f"Could not remove from any child datastore: {ref}")

    def transfer(self, inputDatastore, ref):
        """Retrieve a Dataset from an input `Datastore`,
        and store the result in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        ref : `DatasetRef`
            Reference to the required Dataset in the input data store.

        Returns
        -------
        results : `list`
            List containing the return value from the ``put()`` to each
            child datastore.
        """
        assert inputDatastore is not self  # unless we want it for renames?
        inMemoryDataset = inputDatastore.get(ref)
        return [datastore.put(inMemoryDataset, ref) for datastore in self.datastores]

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks each datastore in turn.
        """
        # Need to catch each of the datastore outputs and ensure that
        # all are tested.
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateConfiguration(entities, logFailures=logFailures)
            except DatastoreValidationError as e:
                if logFailures:
                    log.fatal("Datastore %s failed validation", datastore.name)
                failures.append(f"Datastore {datastore.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class.
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateKey(lookupKey, entity)
            except DatastoreValidationError as e:
                failures.append(f"Datastore {datastore.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self):
        # Docstring is inherited from base class.
        keys = set()
        for datastore in self.datastores:
            keys.update(datastore.getLookupKeys())

        keys.update(self.constraints.getLookupKeys())
        for p in self.datastoreConstraints:
            if p is not None:
                keys.update(p.getLookupKeys())

        return keys