Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Support for generic data stores. 

24""" 

25 

26from __future__ import annotations 

27 

28__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError") 

29 

30import contextlib 

31import logging 

32from collections import defaultdict 

33from typing import TYPE_CHECKING, Optional, Type, Callable, ClassVar, Any, Generator, Iterable 

34from dataclasses import dataclass 

35from abc import ABCMeta, abstractmethod 

36 

37from lsst.utils import doImport 

38from .config import ConfigSubset, Config 

39from .exceptions import ValidationError, DatasetTypeNotSupportedError 

40from .constraints import Constraints 

41from .storageClass import StorageClassFactory 

42 

43if TYPE_CHECKING: 43 ↛ 44line 43 didn't jump to line 44, because the condition on line 43 was never true

44 from ..registry import Registry 

45 from .datasets import DatasetRef 

46 from .repoTransfer import FileDataset 

47 

48 

class DatastoreConfig(ConfigSubset):
    """Configuration subset describing a `Datastore`.

    The class only sets attributes on top of `ConfigSubset`; how they are
    interpreted is defined by that base class.
    """

    # Name of this configuration component.
    component = "datastore"
    # Keys declared as required.  "cls" is the entry that
    # `Datastore.fromConfig` reads to find the datastore class to import.
    requiredKeys = ("cls",)
    # File name of the default configuration for this component.
    defaultConfigFile = "datastore.yaml"

53 

54 

class DatastoreValidationError(ValidationError):
    """Signal a problem with the `Datastore` configuration.

    Adds nothing to `ValidationError`; the separate type lets callers
    distinguish datastore configuration problems from other validation
    failures.
    """

59 

60 

@dataclass(frozen=True)
class Event:
    """Immutable record of one undoable action.

    Parameters
    ----------
    name : `str`
        Label identifying the event.
    undoFunc : callable
        Function that reverses the event.
    args : `tuple`
        Positional arguments to pass to ``undoFunc``.
    kwargs : `dict`
        Keyword arguments to pass to ``undoFunc``.
    """

    # Explicit slots: instances carry no per-instance ``__dict__``.
    __slots__ = {
        "name",
        "undoFunc",
        "args",
        "kwargs",
    }

    name: str
    undoFunc: Callable
    args: tuple
    kwargs: dict

68 

69 

class IngestPrepData:
    """Base container for state shared between the two ingest phases.

    `Datastore` implementations are generally expected to provide their own
    subclass carrying whatever extra information they need.

    Access this class as ``Datastore.IngestPrepData`` rather than importing
    it directly.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        References for the datasets that can be ingested by this datastore.
    """

    def __init__(self, refs: Iterable[DatasetRef]):
        # Index the references by their ``id``; when two references share
        # an id, the one seen last is the one that is kept.
        indexed = {}
        for datasetRef in refs:
            indexed[datasetRef.id] = datasetRef
        self.refs = indexed

86 

87 

class DatastoreTransaction:
    """Record undoable `Datastore` events so that they can be rolled back.

    Parameters
    ----------
    parent : `DatastoreTransaction`, optional
        The enclosing transaction, if there is one.
    """

    Event: ClassVar[Type] = Event

    parent: Optional['DatastoreTransaction']
    """The parent transaction. (`DatastoreTransaction`, optional)"""

    def __init__(self, parent=None):
        # Events are appended in the order they happen and undone from the
        # end of the list, i.e. most recent first.
        self._log = []
        self.parent = parent

    def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
        """Append an event, together with the call that undoes it, to the log.

        Parameters
        ----------
        name : `str`
            Name of the event.
        undoFunc : func
            Function to undo this event.
        args : `tuple`
            Positional arguments to `undoFunc`.
        kwargs : `dict`
            Keyword arguments to `undoFunc`.
        """
        event = self.Event(name, undoFunc, args, kwargs)
        self._log.append(event)

    @contextlib.contextmanager
    def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Generator:
        """Context manager that calls `registerUndo` only if the wrapped
        operation completes without raising.

        Intended for wrapping a single undo-able statement inside a
        DatastoreTransaction block. Statements that can fail independently
        should each get their own `undoWith` block.

        All arguments are forwarded directly to `registerUndo`.
        """
        # An exception raised by the wrapped block surfaces at the ``yield``
        # and propagates unchanged, so the registration below is skipped.
        yield None
        self.registerUndo(name, undoFunc, *args, **kwargs)

    def rollback(self) -> None:
        """Undo every logged event, most recent first.

        Failures raised by an undo function are logged as warnings and
        otherwise ignored, so the remaining events are still undone.
        """
        while self._log:
            event = self._log.pop()
            try:
                event.undoFunc(*event.args, **event.kwargs)
            except BaseException as exc:
                # Deliberately swallowed: keep unrolling the rest of the log.
                logging.getLogger(__name__).warning(
                    "Exception: %s caught while unrolling: %s", exc, event.name
                )

    def commit(self) -> None:
        """Commit this transaction.

        With no parent there is nothing to do: the events have already
        happened and are simply forgotten. With a parent, the events are
        appended to the parent's log so it can still undo them.
        """
        if self.parent is not None:
            self.parent._log.extend(self._log)

162 

163 

class Datastore(metaclass=ABCMeta):
    """Datastore interface.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Load configuration either from an existing config instance or by
        referring to a configuration file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.
    """

    defaultConfigFile: ClassVar[Optional[str]] = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    containerKey: ClassVar[Optional[str]] = None
    """Name of the key containing a list of subconfigurations that also
    need to be merged with defaults and will likely use different Python
    datastore classes (but all using DatastoreConfig).  Assumed to be a
    list of configurations that can be represented in a DatastoreConfig
    and containing a "cls" definition. None indicates that no containers
    are expected in this Datastore."""

    isEphemeral: ClassVar[bool] = False
    """Indicate whether this Datastore is ephemeral or not.  An ephemeral
    datastore is one where the contents of the datastore will not exist
    across process restarts."""

    config: DatastoreConfig
    """Configuration used to create Datastore."""

    registry: Registry
    """`Registry` to use when recording the writing of Datasets."""

    name: str
    """Label associated with this Datastore."""

    names: list
    """List of names associated with this Datastore.  Can be different to
    ``name`` for a chaining datastore."""

    storageClassFactory: StorageClassFactory
    """Factory for creating storage class instances from name."""

    constraints: Constraints
    """Constraints to apply when putting datasets into the datastore."""

    IngestPrepData: ClassVar[Type] = IngestPrepData
    """Helper base class for ingest implementations.
    """

    @classmethod
    @abstractmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists.  Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        raise NotImplementedError()

    @staticmethod
    def fromConfig(config: Config, registry: Registry, butlerRoot: Optional[str] = None) -> 'Datastore':
        """Create datastore from type specified in config file.

        Parameters
        ----------
        config : `Config`
            Configuration instance.  The class to instantiate is read from
            its ``("datastore", "cls")`` entry.
        registry : `Registry`
            Registry to be used by the Datastore for internal data.
        butlerRoot : `str`, optional
            Butler root directory.

        Returns
        -------
        datastore : `Datastore`
            Instance of the configured class, constructed with the given
            ``config``, ``registry`` and ``butlerRoot``.
        """
        cls = doImport(config["datastore", "cls"])
        return cls(config=config, registry=registry, butlerRoot=butlerRoot)

    def __init__(self, config, registry: Registry, butlerRoot: Optional[str] = None):
        self.config = DatastoreConfig(config)
        self.registry = registry
        self.name = "ABCDataStore"
        # No transaction is active until `transaction` is entered.
        self._transaction = None

        # All Datastores need storage classes and constraints
        self.storageClassFactory = StorageClassFactory()

        # And read the constraints list
        constraintsConfig = self.config.get("constraints")
        self.constraints = Constraints(constraintsConfig, universe=self.registry.dimensions)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return self.name

    @property
    def names(self) -> list:
        """Names associated with this datastore returned as a list.

        Some datastores can have child datastores.
        """
        # Default implementation returns solely the name itself
        return [self.name]

    @contextlib.contextmanager
    def transaction(self) -> Generator:
        """Context manager supporting `Datastore` transactions.

        Transactions can be nested, and are to be used in combination with
        `Registry.transaction`.

        Yields
        ------
        transaction : `DatastoreTransaction`
            The newly opened transaction.  Its parent is whatever
            transaction was current when the block was entered (`None` if
            there was none).

        Notes
        -----
        If the block raises, the new transaction is rolled back and the
        exception is re-raised; otherwise the transaction is committed.  In
        both cases the previously current transaction is made current again
        on exit.
        """
        txn = DatastoreTransaction(self._transaction)
        self._transaction = txn
        try:
            yield txn
        except BaseException:
            txn.rollback()
            raise
        else:
            txn.commit()
        finally:
            # Restore the enclosing transaction on every exit path.  If this
            # only happened after a successful commit, a failed block would
            # leave its rolled-back transaction installed as current, and an
            # enclosing ``transaction()`` block would then roll back that
            # (already empty) log instead of its own.
            self._transaction = txn.parent

    @abstractmethod
    def exists(self, datasetRef: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def get(self, datasetRef: DatasetRef, parameters=None):
        """Load an `InMemoryDataset` from the store.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify a slice of the
            Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def put(self, inMemoryDataset, datasetRef: DatasetRef):
        """Write a `InMemoryDataset` with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `InMemoryDataset`
            The Dataset to store.
        datasetRef : `DatasetRef`
            Reference to the associated Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
        """Process datasets to identify which ones can be ingested into this
        Datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`.  Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified.  Other choices include "move", "copy",
            "symlink", and "hardlink".  Most datastores do not support all
            transfer modes.

        Returns
        -------
        data : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`, used to pass
            arbitrary data from `_prepIngest` to `_finishIngest`.  This should
            include only the datasets this datastore can actually ingest;
            others should be silently ignored (`Datastore.ingest` will inspect
            `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
            necessary).

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_finishIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.

        `_prepIngest` should not modify the data repository or given files in
        any way; all changes should be deferred to `_finishIngest`.

        When possible, exceptions should be raised in `_prepIngest` instead of
        `_finishIngest`.  `NotImplementedError` exceptions that indicate that
        the transfer mode is not supported must be raised by `_prepIngest`
        instead of `_finishIngest`.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def _finishIngest(self, prepData: IngestPrepData, *, transfer: Optional[str] = None):
        """Complete an ingest operation.

        Parameters
        ----------
        prepData : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`.  Guaranteed to be
            the direct result of a call to `_prepIngest` on this datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified.  Other choices include "move", "copy",
            "symlink", and "hardlink".  Most datastores do not support all
            transfer modes.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_prepIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None):
        """Ingest one or more files into the datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a formatter
            is not provided, the one the datastore would use for ``put`` on
            that dataset is assumed.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified.  Other choices include "move", "copy",
            "symlink", and "hardlink".  Most datastores do not support all
            transfer modes.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type that
            is not supported by the datastore.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        Subclasses should implement `_prepIngest` and `_finishIngest` instead
        of implementing `ingest` directly.  Datastores that hold and
        delegate to child datastores may want to call those methods as well.

        Subclasses are encouraged to document their supported transfer modes
        in their class documentation.
        """
        prepData = self._prepIngest(*datasets, transfer=transfer)
        # Every ref that was handed in, keyed by id, for comparison with the
        # refs the preparation step accepted.
        refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
        if refs.keys() != prepData.refs.keys():
            unsupported = refs.keys() - prepData.refs.keys()
            # Group unsupported refs by DatasetType for an informative
            # but still concise error message.
            byDatasetType = defaultdict(list)
            for datasetId in unsupported:
                ref = refs[datasetId]
                byDatasetType[ref.datasetType].append(ref)
            raise DatasetTypeNotSupportedError(
                "DatasetType(s) not supported in ingest: "
                + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
            )
        self._finishIngest(prepData, transfer=transfer)

    @abstractmethod
    def getUri(self, datasetRef: DatasetRef):
        """URI to the Dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, the URI may be a guess.
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def remove(self, datasetRef: DatasetRef):
        """Indicate to the Datastore that a Dataset can be removed.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef):
        """Retrieve a Dataset from an input `Datastore`, and store the result
        in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        """Export datasets for transfer to another data repository.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            Dataset references to be exported.
        directory : `str`, optional
            Path to a directory that should contain files corresponding to
            output datasets.  Ignored if ``transfer`` is `None`.
        transfer : `str`, optional
            Mode that should be used to move datasets out of the repository.
            Valid options are the same as those of the ``transfer`` argument
            to ``ingest``, and datastores may similarly signal that a transfer
            mode is not supported by raising `NotImplementedError`.

        Returns
        -------
        dataset : iterable of `FileDataset`
            Structs containing information about the exported datasets, in the
            same order as ``refs``.

        Raises
        ------
        NotImplementedError
            Raised if the given transfer mode is not supported.  This base
            implementation supports no transfer mode and always raises.
        """
        raise NotImplementedError(f"Transfer mode {transfer} not supported.")

    @abstractmethod
    def validateConfiguration(self, entities, logFailures: bool = False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration.  Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.

        Notes
        -----
        Which parts of the configuration are validated is at the discretion
        of each Datastore implementation.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def validateKey(self, lookupKey, entity, logFailures: bool = False):
        """Validate a specific look up key with supplied entity.

        Parameters
        ----------
        lookupKey : `LookupKey`
            Key to use to retrieve information from the datastore
            configuration.
        entity : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entity to compare with configuration retrieved using the
            specified lookup key.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a problem with the combination of entity
            and lookup key.

        Notes
        -----
        Bypasses the normal selection priorities by allowing a key that
        would normally not be selected to be validated.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def getLookupKeys(self):
        """Return all the lookup keys relevant to this datastore.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys stored internally for looking up information based
            on `DatasetType` name or `StorageClass`.
        """
        raise NotImplementedError("Must be implemented by subclass")