Coverage for python/lsst/daf/butler/core/datastore.py : 47%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
Support for generic data stores.
"""

from __future__ import annotations

__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError")

import contextlib
import logging
from collections import defaultdict
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
    Union,
)

from dataclasses import dataclass
from abc import ABCMeta, abstractmethod

from lsst.utils import doImport
from .config import ConfigSubset, Config
from .exceptions import ValidationError, DatasetTypeNotSupportedError
from .constraints import Constraints
from .storageClass import StorageClassFactory

if TYPE_CHECKING:
    from ..registry.interfaces import DatastoreRegistryBridgeManager
    from .datasets import DatasetRef, DatasetType
    from .configSupport import LookupKey
    from .repoTransfers import FileDataset
    from .storageClass import StorageClass


class DatastoreConfig(ConfigSubset):
    component = "datastore"
    requiredKeys = ("cls",)
    defaultConfigFile = "datastore.yaml"


class DatastoreValidationError(ValidationError):
    """There is a problem with the Datastore configuration.
    """
    pass


@dataclass(frozen=True)
class Event:
    __slots__ = {"name", "undoFunc", "args", "kwargs"}
    name: str
    undoFunc: Callable
    args: tuple
    kwargs: dict


class IngestPrepData:
    """A helper base class for `Datastore` ingest implementations.

    Datastore implementations will generally need a custom implementation of
    this class.

    Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
    import.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        References for the datasets that can be ingested by this datastore.
    """
    def __init__(self, refs: Iterable[DatasetRef]):
        self.refs = {ref.id: ref for ref in refs}


class DatastoreTransaction:
    """Keeps a log of `Datastore` activity and allows rollback.

    Parameters
    ----------
    parent : `DatastoreTransaction`, optional
        The parent transaction (if any).
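
    Examples
    --------
    A minimal sketch of the intended flow (illustrative only; ``undoPut``,
    ``ref``, and ``succeeded`` are hypothetical stand-ins for real
    datastore code)::

        txn = DatastoreTransaction()
        txn.registerUndo("put", undoPut, ref)
        if succeeded:
            txn.commit()
        else:
            txn.rollback()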
112 """
113 Event: ClassVar[Type] = Event
115 parent: Optional['DatastoreTransaction']
116 """The parent transaction. (`DatastoreTransaction`, optional)"""
118 def __init__(self, parent: Optional[DatastoreTransaction] = None):
119 self.parent = parent
120 self._log: List[Event] = []
122 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
123 """Register event with undo function.
125 Parameters
126 ----------
127 name : `str`
128 Name of the event.
129 undoFunc : func
130 Function to undo this event.
131 args : `tuple`
132 Positional arguments to `undoFunc`.
133 kwargs : `dict`
134 Keyword arguments to `undoFunc`.
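
        Examples
        --------
        Illustrative only; here `os.remove` reverses a hypothetical file
        write, so a later rollback deletes the file again::

            txn.registerUndo("write", os.remove, "/tmp/example.dat")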
135 """
136 self._log.append(self.Event(name, undoFunc, args, kwargs))
138 @contextlib.contextmanager
139 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
140 """A context manager that calls `registerUndo` if the nested operation
141 does not raise an exception.
143 This can be used to wrap individual undo-able statements within a
144 DatastoreTransaction block. Multiple statements that can fail
145 separately should not be part of the same `undoWith` block.
147 All arguments are forwarded directly to `registerUndo`.
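
        Examples
        --------
        A sketch of wrapping a single undoable statement (``writeFile`` and
        ``removeFile`` are hypothetical helpers)::

            with txn.undoWith("write", removeFile, path):
                writeFile(path)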
148 """
149 try:
150 yield None
151 except BaseException:
152 raise
153 else:
154 self.registerUndo(name, undoFunc, *args, **kwargs)
156 def rollback(self) -> None:
157 """Roll back all events in this transaction.
158 """
159 while self._log:
160 ev = self._log.pop()
161 try:
162 ev.undoFunc(*ev.args, **ev.kwargs)
163 except BaseException as e:
164 # Deliberately swallow error that may occur in unrolling
165 log = logging.getLogger(__name__)
166 log.warning("Exception: %s caught while unrolling: %s", e, ev.name)
167 pass
169 def commit(self) -> None:
170 """Commit this transaction.
171 """
172 if self.parent is None:
173 # Just forget about the events, they have already happened.
174 return
175 else:
176 # We may still want to events from this transaction as part of
177 # the parent.
178 self.parent._log.extend(self._log)


class Datastore(metaclass=ABCMeta):
    """Datastore interface.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Load configuration either from an existing config instance or by
        referring to a configuration file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.
    """

    defaultConfigFile: ClassVar[Optional[str]] = None
    """Path to configuration defaults.  Relative to $DAF_BUTLER_DIR/config or
    absolute path.  Can be `None` if no defaults are specified.
    """

    containerKey: ClassVar[Optional[str]] = None
    """Name of the key containing a list of subconfigurations that also
    need to be merged with defaults and will likely use different Python
    datastore classes (but all using `DatastoreConfig`).  Assumed to be a
    list of configurations that can be represented in a `DatastoreConfig`
    and containing a "cls" definition.  `None` indicates that no containers
    are expected in this Datastore."""

    isEphemeral: bool = False
    """Indicate whether this Datastore is ephemeral or not.  An ephemeral
    datastore is one where the contents of the datastore will not exist
    across process restarts.  This value can change per-instance."""

    config: DatastoreConfig
    """Configuration used to create Datastore."""

    name: str
    """Label associated with this Datastore."""

    storageClassFactory: StorageClassFactory
    """Factory for creating storage class instances from name."""

    constraints: Constraints
    """Constraints to apply when putting datasets into the datastore."""

    IngestPrepData: ClassVar = IngestPrepData
    """Helper base class for ingest implementations.
    """

    @classmethod
    @abstractmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update.  Only the subset understood by
            this component will be updated.  Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`.  Read-only and will not be
            modified by this method.  Repository-specific options that
            should not be obtained from defaults when Butler instances
            are constructed should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists.  Default is always to overwrite with the
            provided ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        raise NotImplementedError()

    @staticmethod
    def fromConfig(config: Config, bridgeManager: DatastoreRegistryBridgeManager,
                   butlerRoot: Optional[str] = None) -> 'Datastore':
        """Create a datastore of the type specified in the config file.

        Parameters
        ----------
        config : `Config`
            Configuration instance.
        bridgeManager : `DatastoreRegistryBridgeManager`
            Object that manages the interface between `Registry` and
            datastores.
        butlerRoot : `str`, optional
            Butler root directory.
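
        Examples
        --------
        A minimal sketch (illustrative; the ``cls`` path is a placeholder
        for a concrete `Datastore` subclass, and ``bridgeManager`` is
        assumed to have been constructed elsewhere)::

            config = Config({"datastore": {"cls": "mypkg.MyDatastore"}})
            datastore = Datastore.fromConfig(config, bridgeManager)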
277 """
278 cls = doImport(config["datastore", "cls"])
279 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)
281 def __init__(self, config: Union[Config, str],
282 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
283 self.config = DatastoreConfig(config)
284 self.name = "ABCDataStore"
285 self._transaction: Optional[DatastoreTransaction] = None
287 # All Datastores need storage classes and constraints
288 self.storageClassFactory = StorageClassFactory()
290 # And read the constraints list
291 constraintsConfig = self.config.get("constraints")
292 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)
294 def __str__(self) -> str:
295 return self.name
297 def __repr__(self) -> str:
298 return self.name
300 @property
301 def names(self) -> Tuple[str, ...]:
302 """Names associated with this datastore returned as a list.
304 Can be different to ``name`` for a chaining datastore.
305 """
306 # Default implementation returns solely the name itself
307 return (self.name, )
309 @contextlib.contextmanager
310 def transaction(self) -> Iterator[DatastoreTransaction]:
311 """Context manager supporting `Datastore` transactions.
313 Transactions can be nested, and are to be used in combination with
314 `Registry.transaction`.
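
        Examples
        --------
        A sketch of typical use (illustrative; ``inMemoryDataset`` and
        ``ref`` are assumed to exist)::

            with datastore.transaction():
                datastore.put(inMemoryDataset, ref)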
315 """
316 self._transaction = DatastoreTransaction(self._transaction)
317 try:
318 yield self._transaction
319 except BaseException:
320 self._transaction.rollback()
321 raise
322 else:
323 self._transaction.commit()
324 self._transaction = self._transaction.parent
326 @abstractmethod
327 def exists(self, datasetRef: DatasetRef) -> bool:
328 """Check if the dataset exists in the datastore.
330 Parameters
331 ----------
332 datasetRef : `DatasetRef`
333 Reference to the required dataset.
335 Returns
336 -------
337 exists : `bool`
338 `True` if the entity exists in the `Datastore`.
339 """
340 raise NotImplementedError("Must be implemented by subclass")
342 @abstractmethod
343 def get(self, datasetRef: DatasetRef, parameters: Mapping[str, Any] = None) -> Any:
344 """Load an `InMemoryDataset` from the store.
346 Parameters
347 ----------
348 datasetRef : `DatasetRef`
349 Reference to the required Dataset.
350 parameters : `dict`
351 `StorageClass`-specific parameters that specify a slice of the
352 Dataset to be loaded.
354 Returns
355 -------
356 inMemoryDataset : `object`
357 Requested Dataset or slice thereof as an InMemoryDataset.
358 """
359 raise NotImplementedError("Must be implemented by subclass")
361 @abstractmethod
362 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
363 """Write a `InMemoryDataset` with a given `DatasetRef` to the store.
365 Parameters
366 ----------
367 inMemoryDataset : `object`
368 The Dataset to store.
369 datasetRef : `DatasetRef`
370 Reference to the associated Dataset.
371 """
372 raise NotImplementedError("Must be implemented by subclass")

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        """Allow the ingest transfer mode to be defaulted based on datasets.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a
            formatter is not provided, this method should populate that
            attribute with the formatter the datastore would use for `put`.
            Subclasses are also permitted to modify the path attribute
            (typically to put it in what the datastore considers its
            standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        newTransfer : `str`
            Transfer mode to use.  Will be identical to the supplied
            transfer mode unless "auto" is used.
        """
        if transfer != "auto":
            return transfer
        raise RuntimeError(f"{transfer} is not allowed without specialization.")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
        """Process datasets to identify which ones can be ingested into this
        Datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a
            formatter is not provided, this method should populate that
            attribute with the formatter the datastore would use for `put`.
            Subclasses are also permitted to modify the path attribute
            (typically to put it in what the datastore considers its
            standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        data : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`, used to pass
            arbitrary data from `_prepIngest` to `_finishIngest`.  This
            should include only the datasets this datastore can actually
            ingest; others should be silently ignored (`Datastore.ingest`
            will inspect `IngestPrepData.refs` and raise
            `DatasetTypeNotSupportedError` if necessary).

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_finishIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.

        `_prepIngest` should not modify the data repository or given files in
        any way; all changes should be deferred to `_finishIngest`.

        When possible, exceptions should be raised in `_prepIngest` instead
        of `_finishIngest`.  `NotImplementedError` exceptions that indicate
        that the transfer mode is not supported must be raised by
        `_prepIngest` instead of `_finishIngest`.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def _finishIngest(self, prepData: IngestPrepData, *, transfer: Optional[str] = None) -> None:
        """Complete an ingest operation.

        Parameters
        ----------
        prepData : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`.  Guaranteed to be
            the direct result of a call to `_prepIngest` on this datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_prepIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> None:
        """Ingest one or more files into the datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a
            formatter is not provided, the one the datastore would use for
            ``put`` on that dataset is assumed.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified.  Other choices include "move", "copy",
            "link", "symlink", "relsymlink", and "hardlink".  "link" is a
            special transfer mode that will first try to make a hardlink
            and, if that fails, a symlink will be used instead.
            "relsymlink" creates a relative symlink rather than using an
            absolute path.  Most datastores do not support all transfer
            modes.  "auto" is a special option that will let the datastore
            choose the most natural option for itself.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type
            that is not supported by the datastore.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        Subclasses should implement `_prepIngest` and `_finishIngest` instead
        of implementing `ingest` directly.  Datastores that hold and
        delegate to child datastores may want to call those methods as well.

        Subclasses are encouraged to document their supported transfer modes
        in their class documentation.
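
        Examples
        --------
        A sketch of a single-file ingest (illustrative; ``ref`` is assumed
        to be a resolved `DatasetRef` for the file)::

            datastore.ingest(FileDataset(path="data/file.fits", refs=[ref]),
                             transfer="copy")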
540 """
541 # Allow a datastore to select a default transfer mode
542 transfer = self._overrideTransferMode(*datasets, transfer=transfer)
543 prepData = self._prepIngest(*datasets, transfer=transfer)
544 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
545 if refs.keys() != prepData.refs.keys():
546 unsupported = refs.keys() - prepData.refs.keys()
547 # Group unsupported refs by DatasetType for an informative
548 # but still concise error message.
549 byDatasetType = defaultdict(list)
550 for datasetId in unsupported:
551 ref = refs[datasetId]
552 byDatasetType[ref.datasetType].append(ref)
553 raise DatasetTypeNotSupportedError(
554 "DatasetType(s) not supported in ingest: "
555 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
556 )
557 self._finishIngest(prepData, transfer=transfer)

    @abstractmethod
    def getUri(self, datasetRef: DatasetRef, predict: bool = False) -> str:
        """URI to the Dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, attempt to predict the URI for a dataset even if it
            does not exist in the datastore.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore.  If the
            Dataset does not exist in the datastore, the URI may be a guess.
            If the datastore does not have entities that relate well
            to the concept of a URI, the returned URI string will be
            descriptive.  The returned URI is not guaranteed to be
            obtainable.

        Raises
        ------
        FileNotFoundError
            Raised if a URI has been requested for a dataset that does not
            exist and guessing is not allowed.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def remove(self, datasetRef: DatasetRef) -> None:
        """Indicate to the Datastore that a Dataset can be removed.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Raised when the Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def trash(self, datasetRef: DatasetRef, ignore_errors: bool = True) -> None:
        """Indicate to the Datastore that a Dataset can be moved to the
        trash.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Raised when the Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def emptyTrash(self, ignore_errors: bool = True) -> None:
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
        """Retrieve a Dataset from an input `Datastore`, and store the
        result in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        """Export datasets for transfer to another data repository.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            Dataset references to be exported.
        directory : `str`, optional
            Path to a directory that should contain files corresponding to
            output datasets.  Ignored if ``transfer`` is `None`.
        transfer : `str`, optional
            Mode that should be used to move datasets out of the repository.
            Valid options are the same as those of the ``transfer`` argument
            to ``ingest``, and datastores may similarly signal that a
            transfer mode is not supported by raising `NotImplementedError`.

        Returns
        -------
        datasets : iterable of `FileDataset`
            Structs containing information about the exported datasets, in
            the same order as ``refs``.

        Raises
        ------
        NotImplementedError
            Raised if the given transfer mode is not supported.
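
        Examples
        --------
        A sketch (illustrative; ``refs`` is assumed to be an iterable of
        resolved `DatasetRef` objects)::

            exported = list(datastore.export(refs, directory="/tmp/export",
                                             transfer="copy"))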
689 """
690 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
692 @abstractmethod
693 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
694 logFailures: bool = False) -> None:
695 """Validate some of the configuration for this datastore.
697 Parameters
698 ----------
699 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
700 Entities to test against this configuration. Can be differing
701 types.
702 logFailures : `bool`, optional
703 If `True`, output a log message for every validation error
704 detected.
706 Raises
707 ------
708 DatastoreValidationError
709 Raised if there is a validation problem with a configuration.
711 Notes
712 -----
713 Which parts of the configuration are validated is at the discretion
714 of each Datastore implementation.
715 """
716 raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def validateKey(self,
                    lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
        """Validate a specific lookup key with a supplied entity.

        Parameters
        ----------
        lookupKey : `LookupKey`
            Key to use to retrieve information from the datastore
            configuration.
        entity : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entity to compare with the configuration retrieved using the
            specified lookup key.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a problem with the combination of entity
            and lookup key.

        Notes
        -----
        Bypasses the normal selection priorities by allowing a key that
        would normally not be selected to be validated.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def getLookupKeys(self) -> Set[LookupKey]:
        """Return all the lookup keys relevant to this datastore.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys stored internally for looking up information based
            on `DatasetType` name or `StorageClass`.
        """
        raise NotImplementedError("Must be implemented by subclass")