# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
22"""
23Support for generic data stores.
24"""
26from __future__ import annotations
28__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError")
30import contextlib
31import logging
32from collections import defaultdict
33from typing import TYPE_CHECKING, Optional, Type, Callable, ClassVar, Any, Generator, Iterable
34from dataclasses import dataclass
35from abc import ABCMeta, abstractmethod
37from lsst.utils import doImport
38from .config import ConfigSubset, Config
39from .exceptions import ValidationError, DatasetTypeNotSupportedError
40from .constraints import Constraints
41from .storageClass import StorageClassFactory
if TYPE_CHECKING:
    from ..registry import Registry
    from .datasets import DatasetRef
    from .repoTransfer import FileDataset


class DatastoreConfig(ConfigSubset):
    """Configuration subset for a `Datastore`."""

    component = "datastore"
    requiredKeys = ("cls",)
    defaultConfigFile = "datastore.yaml"


class DatastoreValidationError(ValidationError):
    """There is a problem with the Datastore configuration.
    """


@dataclass(frozen=True)
class Event:
    """A single undoable event recorded by a `DatastoreTransaction`."""

    __slots__ = ("name", "undoFunc", "args", "kwargs")
    name: str
    undoFunc: Callable
    args: tuple
    kwargs: dict


class IngestPrepData:
    """A helper base class for `Datastore` ingest implementations.

    Datastore implementations will generally need a custom implementation of
    this class.

    Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
    import.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        References for the datasets that can be ingested by this datastore.
    """
    def __init__(self, refs: Iterable[DatasetRef]):
        self.refs = {ref.id: ref for ref in refs}


class DatastoreTransaction:
    """Keeps a log of `Datastore` activity and allows rollback.

    Parameters
    ----------
    parent : `DatastoreTransaction`, optional
        The parent transaction (if any).
    """
    Event: ClassVar[Type] = Event

    parent: Optional['DatastoreTransaction']
    """The parent transaction. (`DatastoreTransaction`, optional)"""

    def __init__(self, parent: Optional['DatastoreTransaction'] = None):
        self.parent = parent
        self._log = []

    def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
        """Register event with undo function.

        Parameters
        ----------
        name : `str`
            Name of the event.
        undoFunc : `Callable`
            Function to undo this event.
        args : `tuple`
            Positional arguments to ``undoFunc``.
        kwargs : `dict`
            Keyword arguments to ``undoFunc``.
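
        Examples
        --------
        A minimal sketch of registering an undo action and rolling it back
        (the file name is an assumption for illustration):

        >>> import os
        >>> txn = DatastoreTransaction()
        >>> with open("example.tmp", "w"):
        ...     pass
        >>> txn.registerUndo("write", os.remove, "example.tmp")
        >>> txn.rollback()  # invokes os.remove("example.tmp")
        >>> os.path.exists("example.tmp")
        False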
118 """
119 self._log.append(self.Event(name, undoFunc, args, kwargs))

    @contextlib.contextmanager
    def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Generator:
        """A context manager that calls `registerUndo` if the nested operation
        does not raise an exception.

        This can be used to wrap individual undo-able statements within a
        `DatastoreTransaction` block. Multiple statements that can fail
        separately should not be part of the same `undoWith` block.

        All arguments are forwarded directly to `registerUndo`.
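
        Examples
        --------
        A sketch of the intended pattern (``writeFile`` and ``deleteFile``
        are hypothetical helpers):

        >>> with txn.undoWith("write", deleteFile, path):  # doctest: +SKIP
        ...     writeFile(path)

        The undo action is only registered if ``writeFile`` succeeds, so a
        failed write is never "undone" by deleting a file it never created.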
131 """
132 try:
133 yield None
134 except BaseException:
135 raise
136 else:
137 self.registerUndo(name, undoFunc, *args, **kwargs)

    def rollback(self) -> None:
        """Roll back all events in this transaction.
        """
        while self._log:
            ev = self._log.pop()
            try:
                ev.undoFunc(*ev.args, **ev.kwargs)
            except BaseException as e:
                # Deliberately swallow any error that occurs while unrolling
                log = logging.getLogger(__name__)
                log.warning("Exception: %s caught while unrolling: %s", e, ev.name)

    def commit(self) -> None:
        """Commit this transaction.
        """
        if self.parent is None:
            # Just forget about the events, they have already happened.
            return
        else:
            # We may still want to roll back events from this transaction
            # as part of the parent.
            self.parent._log.extend(self._log)


class Datastore(metaclass=ABCMeta):
    """Datastore interface.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Load configuration either from an existing config instance or by
        referring to a configuration file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.
    """

    defaultConfigFile: ClassVar[Optional[str]] = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    containerKey: ClassVar[Optional[str]] = None
    """Name of the key containing a list of subconfigurations that also
    need to be merged with defaults and will likely use different Python
    datastore classes (but all using DatastoreConfig). Assumed to be a
    list of configurations that can be represented in a DatastoreConfig
    and containing a "cls" definition. None indicates that no containers
    are expected in this Datastore."""

    isEphemeral: ClassVar[bool] = False
    """Indicate whether this Datastore is ephemeral or not. An ephemeral
    datastore is one whose contents will not exist across process
    restarts."""

    config: DatastoreConfig
    """Configuration used to create Datastore."""

    registry: Registry
    """`Registry` to use when recording the writing of Datasets."""

    name: str
    """Label associated with this Datastore."""

    names: list
    """List of names associated with this Datastore. Can be different from
    ``name`` for a chaining datastore."""

    storageClassFactory: StorageClassFactory
    """Factory for creating storage class instances from name."""

    constraints: Constraints
    """Constraints to apply when putting datasets into the datastore."""

    IngestPrepData: ClassVar[Type] = IngestPrepData
    """Helper base class for ingest implementations.
    """

    @classmethod
    @abstractmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
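
        Examples
        --------
        A minimal sketch of an implementation; the ``datastore.root`` key
        and the tuple-style containment test are assumptions for
        illustration:

        >>> class MyDatastore(Datastore):  # doctest: +SKIP
        ...     @classmethod
        ...     def setConfigRoot(cls, root, config, full, overwrite=True):
        ...         # Record the new repository root, honoring ``overwrite``.
        ...         if overwrite or ("datastore", "root") not in config:
        ...             config["datastore", "root"] = root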
250 """
251 raise NotImplementedError()

    @staticmethod
    def fromConfig(config: Config, registry: Registry, butlerRoot: Optional[str] = None) -> 'Datastore':
        """Create datastore from type specified in config file.

        Parameters
        ----------
        config : `Config`
            Configuration instance.
        registry : `Registry`
            Registry to be used by the Datastore for internal data.
        butlerRoot : `str`, optional
            Butler root directory.

        Returns
        -------
        datastore : `Datastore`
            Instance of the datastore class named by the configuration.
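
        Examples
        --------
        A sketch; the fully-qualified class name is an assumption and
        ``registry`` must already exist:

        >>> config = Config({"datastore": {
        ...     "cls": "lsst.daf.butler.datastores.inMemoryDatastore."
        ...            "InMemoryDatastore"}})  # doctest: +SKIP
        >>> datastore = Datastore.fromConfig(config, registry)  # doctest: +SKIP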
265 """
266 cls = doImport(config["datastore", "cls"])
267 return cls(config=config, registry=registry, butlerRoot=butlerRoot)

    def __init__(self, config, registry, butlerRoot=None):
        self.config = DatastoreConfig(config)
        self.registry = registry
        self.name = "ABCDataStore"
        self._transaction = None

        # All Datastores need storage classes and constraints
        self.storageClassFactory = StorageClassFactory()

        # And read the constraints list
        constraintsConfig = self.config.get("constraints")
        self.constraints = Constraints(constraintsConfig, universe=self.registry.dimensions)

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    @property
    def names(self):
        """Names associated with this datastore returned as a list.

        Some datastores can have child datastores.
        """
        # Default implementation returns solely the name itself
        return [self.name]

    @contextlib.contextmanager
    def transaction(self):
        """Context manager supporting `Datastore` transactions.

        Transactions can be nested, and are to be used in combination with
        `Registry.transaction`.
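
        Examples
        --------
        A sketch of typical use (``obj`` and ``ref`` are assumptions):

        >>> with datastore.transaction():  # doctest: +SKIP
        ...     datastore.put(obj, ref)

        If the block raises, every undo action registered during the block
        is replayed in reverse order.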
303 """
304 self._transaction = DatastoreTransaction(self._transaction)
305 try:
306 yield self._transaction
307 except BaseException:
308 self._transaction.rollback()
309 raise
310 else:
311 self._transaction.commit()
312 self._transaction = self._transaction.parent

    @abstractmethod
    def exists(self, datasetRef):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def get(self, datasetRef, parameters=None):
        """Load an `InMemoryDataset` from the store.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify a slice of the
            Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.
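
        Examples
        --------
        Parameters are interpreted by the `StorageClass`; a spatial cutout
        is a typical example (``ref`` and ``bbox`` are assumptions):

        >>> subset = datastore.get(ref, parameters={"bbox": bbox})  # doctest: +SKIP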
346 """
347 raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def put(self, inMemoryDataset, datasetRef):
        """Write an `InMemoryDataset` with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `InMemoryDataset`
            The Dataset to store.
        datasetRef : `DatasetRef`
            Reference to the associated Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> str:
        """Allow ingest transfer mode to be defaulted based on datasets.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`. Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        newTransfer : `str`
            Transfer mode to use. Will be identical to the supplied transfer
            mode unless "auto" is used.
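
        Examples
        --------
        A hypothetical subclass could map "auto" to its preferred mode:

        >>> class MyDatastore(Datastore):  # doctest: +SKIP
        ...     def _overrideTransferMode(self, *datasets, transfer=None):
        ...         # Prefer linking when the caller has no preference.
        ...         return "link" if transfer == "auto" else transfer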
386 """
387 if transfer != "auto":
388 return transfer
389 raise RuntimeError(f"{transfer} is not allowed without specialization.")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
        """Process datasets to identify which ones can be ingested into this
        Datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`. Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        data : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`, used to pass
            arbitrary data from `_prepIngest` to `_finishIngest`. This should
            include only the datasets this datastore can actually ingest;
            others should be silently ignored (`Datastore.ingest` will inspect
            `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
            necessary).

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_finishIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.

        `_prepIngest` should not modify the data repository or given files in
        any way; all changes should be deferred to `_finishIngest`.

        When possible, exceptions should be raised in `_prepIngest` instead of
        `_finishIngest`. `NotImplementedError` exceptions that indicate that
        the transfer mode is not supported must be raised by `_prepIngest`
        instead of `_finishIngest`.
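
        Examples
        --------
        A minimal sketch of the expected subclass pattern (``MyDatastore``
        is an assumption; only `None` and "copy" transfers are accepted):

        >>> class MyDatastore(Datastore):  # doctest: +SKIP
        ...     def _prepIngest(self, *datasets, transfer=None):
        ...         if transfer not in (None, "copy"):
        ...             raise NotImplementedError(
        ...                 f"Transfer mode {transfer} not supported.")
        ...         # Keep only the refs this datastore is willing to
        ...         # accept; ingest() reports the rest as unsupported.
        ...         acceptable = [ref for dataset in datasets
        ...                       for ref in dataset.refs
        ...                       if self.constraints.isAcceptable(ref)]
        ...         return self.IngestPrepData(acceptable)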
445 """
446 raise NotImplementedError(
447 "Datastore does not support direct file-based ingest."
448 )

    def _finishIngest(self, prepData: IngestPrepData, *, transfer: Optional[str] = None):
        """Complete an ingest operation.

        Parameters
        ----------
        prepData : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`. Guaranteed to be
            the direct result of a call to `_prepIngest` on this datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_prepIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None):
        """Ingest one or more files into the datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, the one the datastore would use for ``put`` on
            that dataset is assumed.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified. Other choices include "move", "copy",
            "link", "symlink", "relsymlink", and "hardlink". "link" is a
            special transfer mode that will first try to make a hardlink and
            if that fails a symlink will be used instead. "relsymlink" creates
            a relative symlink rather than using an absolute path.
            Most datastores do not support all transfer modes.
            "auto" is a special option that will let the
            datastore choose the most natural option for itself.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type
            that is not supported by the datastore.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        Subclasses should implement `_prepIngest` and `_finishIngest` instead
        of implementing `ingest` directly. Datastores that hold and
        delegate to child datastores may want to call those methods as well.

        Subclasses are encouraged to document their supported transfer modes
        in their class documentation.
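
        Examples
        --------
        A sketch of a typical call (the path and ``ref`` are assumptions):

        >>> datastore.ingest(FileDataset(path="image.fits", refs=[ref]),
        ...                  transfer="copy")  # doctest: +SKIP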
528 """
529 # Allow a datastore to select a default transfer mode
530 transfer = self._overrideTransferMode(*datasets, transfer=transfer)
531 prepData = self._prepIngest(*datasets, transfer=transfer)
532 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
533 if refs.keys() != prepData.refs.keys():
534 unsupported = refs.keys() - prepData.refs.keys()
535 # Group unsupported refs by DatasetType for an informative
536 # but still concise error message.
537 byDatasetType = defaultdict(list)
538 for datasetId in unsupported:
539 ref = refs[datasetId]
540 byDatasetType[ref.datasetType].append(ref)
541 raise DatasetTypeNotSupportedError(
542 "DatasetType(s) not supported in ingest: "
543 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
544 )
545 self._finishIngest(prepData, transfer=transfer)

    @abstractmethod
    def getUri(self, datasetRef):
        """URI to the Dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, the URI may be a guess.
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def remove(self, datasetRef):
        """Indicate to the Datastore that a Dataset can be removed.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def trash(self, datasetRef, ignore_errors=True):
        """Indicate to the Datastore that a Dataset can be moved to the trash.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def emptyTrash(self, ignore_errors=True):
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
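
        Examples
        --------
        Deletion is typically a two-step operation, so that datastore and
        registry changes can be coordinated (``ref`` is an assumption):

        >>> datastore.trash(ref)  # doctest: +SKIP
        >>> datastore.emptyTrash()  # doctest: +SKIP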
624 """
625 raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def transfer(self, inputDatastore, datasetRef):
        """Retrieve a Dataset from an input `Datastore`, and store the result
        in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        """Export datasets for transfer to another data repository.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            Dataset references to be exported.
        directory : `str`, optional
            Path to a directory that should contain files corresponding to
            output datasets. Ignored if ``transfer`` is `None`.
        transfer : `str`, optional
            Mode that should be used to move datasets out of the repository.
            Valid options are the same as those of the ``transfer`` argument
            to ``ingest``, and datastores may similarly signal that a transfer
            mode is not supported by raising `NotImplementedError`.

        Returns
        -------
        datasets : iterable of `FileDataset`
            Structs containing information about the exported datasets, in
            the same order as ``refs``.

        Raises
        ------
        NotImplementedError
            Raised if the given transfer mode is not supported.
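
        Examples
        --------
        A sketch of a typical call (``refs`` is an assumption):

        >>> for dataset in datastore.export(refs, directory="export",
        ...                                 transfer="copy"):  # doctest: +SKIP
        ...     print(dataset.path)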
668 """
669 raise NotImplementedError(f"Transfer mode {transfer} not supported.")

    @abstractmethod
    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.

        Notes
        -----
        Which parts of the configuration are validated is at the discretion
        of each Datastore implementation.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def validateKey(self, lookupKey, entity, logFailures=False):
        """Validate a specific look up key with supplied entity.

        Parameters
        ----------
        lookupKey : `LookupKey`
            Key to use to retrieve information from the datastore
            configuration.
        entity : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entity to compare with configuration retrieved using the
            specified lookup key.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a problem with the combination of entity
            and lookup key.

        Notes
        -----
        Bypasses the normal selection priorities by allowing a key that
        would normally not be selected to be validated.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def getLookupKeys(self):
        """Return all the lookup keys relevant to this datastore.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys stored internally for looking up information based
            on `DatasetType` name or `StorageClass`.
        """
        raise NotImplementedError("Must be implemented by subclass")