# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Support for generic data stores.
24"""

from __future__ import annotations

__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError")

import contextlib
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Optional, Type, Callable, ClassVar, Any, Generator, Iterable
from dataclasses import dataclass
from abc import ABCMeta, abstractmethod

from lsst.utils import doImport
from .config import ConfigSubset, Config
from .exceptions import ValidationError, DatasetTypeNotSupportedError
from .constraints import Constraints
from .storageClass import StorageClassFactory

if TYPE_CHECKING:
    from ..registry.interfaces import DatastoreRegistryBridgeManager
    from .datasets import DatasetRef
    from .repoTransfer import FileDataset


class DatastoreConfig(ConfigSubset):
    component = "datastore"
    requiredKeys = ("cls",)
    defaultConfigFile = "datastore.yaml"


class DatastoreValidationError(ValidationError):
    """There is a problem with the Datastore configuration."""
    pass


@dataclass(frozen=True)
class Event:
    __slots__ = {"name", "undoFunc", "args", "kwargs"}
    name: str
    undoFunc: Callable
    args: tuple
    kwargs: dict


class IngestPrepData:
    """A helper base class for `Datastore` ingest implementations.

    Datastore implementations will generally need a custom implementation of
    this class.

    Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
    import.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        References for the datasets that can be ingested by this datastore.
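
    Examples
    --------
    A sketch of a subclass carrying extra per-dataset state; the ``paths``
    attribute is hypothetical, not part of this API::

        class MyPrepData(Datastore.IngestPrepData):
            def __init__(self, refs, paths):
                super().__init__(refs)
                self.paths = paths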
83 """
84 def __init__(self, refs: Iterable[DatasetRef]):
85 self.refs = {ref.id: ref for ref in refs}


class DatastoreTransaction:
    """Keeps a log of `Datastore` activity and allows rollback.

    Parameters
    ----------
    parent : `DatastoreTransaction`, optional
        The parent transaction (if any).
    """
    Event: ClassVar[Type] = Event

    parent: Optional['DatastoreTransaction']
    """The parent transaction. (`DatastoreTransaction`, optional)"""

    def __init__(self, parent=None):
        self.parent = parent
        self._log = []

    def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
        """Register event with undo function.

        Parameters
        ----------
        name : `str`
            Name of the event.
        undoFunc : func
            Function to undo this event.
        args : `tuple`
            Positional arguments to `undoFunc`.
        kwargs : `dict`
            Keyword arguments to `undoFunc`.
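
        Examples
        --------
        A minimal sketch; ``removeFile`` is a hypothetical cleanup callable,
        not part of this module::

            txn = DatastoreTransaction()
            txn.registerUndo("write", removeFile, "/tmp/example.fits")
            txn.rollback()  # Calls removeFile("/tmp/example.fits").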
118 """
119 self._log.append(self.Event(name, undoFunc, args, kwargs))

    @contextlib.contextmanager
    def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Generator:
        """A context manager that calls `registerUndo` if the nested operation
        does not raise an exception.

        This can be used to wrap individual undo-able statements within a
        `DatastoreTransaction` block. Multiple statements that can fail
        separately should not be part of the same `undoWith` block.

        All arguments are forwarded directly to `registerUndo`.
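
        Examples
        --------
        A sketch of guarding a single undoable statement; ``writeFile`` and
        ``removeFile`` are hypothetical::

            with txn.undoWith("write", removeFile, path):
                writeFile(path)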
131 """
132 try:
133 yield None
134 except BaseException:
135 raise
136 else:
137 self.registerUndo(name, undoFunc, *args, **kwargs)

    def rollback(self) -> None:
        """Roll back all events in this transaction."""
        while self._log:
            ev = self._log.pop()
            try:
                ev.undoFunc(*ev.args, **ev.kwargs)
            except BaseException as e:
                # Deliberately swallow any error that occurs while unrolling.
                log = logging.getLogger(__name__)
                log.warning("Exception: %s caught while unrolling: %s", e, ev.name)

    def commit(self) -> None:
        """Commit this transaction."""
        if self.parent is None:
            # Just forget about the events; they have already happened.
            return
        else:
            # We may still need to roll back events from this transaction as
            # part of the parent transaction, so hand our log to the parent.
            self.parent._log.extend(self._log)


class Datastore(metaclass=ABCMeta):
    """Datastore interface.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Load configuration either from an existing config instance or by
        referring to a configuration file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.
    """

    defaultConfigFile: ClassVar[Optional[str]] = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be `None` if no defaults are specified.
    """

    containerKey: ClassVar[Optional[str]] = None
    """Name of the key containing a list of subconfigurations that also
    need to be merged with defaults and will likely use different Python
    datastore classes (but all using `DatastoreConfig`). Assumed to be a
    list of configurations that can be represented in a `DatastoreConfig`
    and containing a "cls" definition. `None` indicates that no containers
    are expected in this Datastore."""

    isEphemeral: ClassVar[bool] = False
    """Indicate whether this Datastore is ephemeral or not. An ephemeral
    datastore is one whose contents will not persist across process
    restarts."""

    config: DatastoreConfig
    """Configuration used to create Datastore."""

    name: str
    """Label associated with this Datastore."""

    names: list
    """List of names associated with this Datastore. Can be different from
    ``name`` for a chaining datastore."""

    storageClassFactory: StorageClassFactory
    """Factory for creating storage class instances from name."""

    constraints: Constraints
    """Constraints to apply when putting datasets into the datastore."""

    IngestPrepData: ClassVar[Type] = IngestPrepData
    """Helper base class for ingest implementations."""

    @classmethod
    @abstractmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        raise NotImplementedError()

    @staticmethod
    def fromConfig(config: Config, bridgeManager: DatastoreRegistryBridgeManager,
                   butlerRoot: Optional[str] = None) -> 'Datastore':
        """Create datastore from type specified in config file.

        Parameters
        ----------
        config : `Config`
            Configuration instance.
        bridgeManager : `DatastoreRegistryBridgeManager`
            Object that manages the interface between `Registry` and
            datastores.
        butlerRoot : `str`, optional
            Butler root directory.
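
        Examples
        --------
        A sketch, assuming ``config`` has already been loaded from a
        repository configuration and ``bridgeManager`` has been obtained
        from the `Registry`::

            datastore = Datastore.fromConfig(config, bridgeManager=bridgeManager)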
264 """
265 cls = doImport(config["datastore", "cls"])
266 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)

    def __init__(self, config, bridgeManager, butlerRoot=None):
        self.config = DatastoreConfig(config)
        self.name = "ABCDataStore"
        self._transaction = None

        # All Datastores need storage classes and constraints
        self.storageClassFactory = StorageClassFactory()

        # And read the constraints list
        constraintsConfig = self.config.get("constraints")
        self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    @property
    def names(self):
        """Names associated with this datastore returned as a list.

        Some datastores can have child datastores.
        """
        # Default implementation returns solely the name itself
        return [self.name]

    @contextlib.contextmanager
    def transaction(self):
        """Context manager supporting `Datastore` transactions.

        Transactions can be nested, and are to be used in combination with
        `Registry.transaction`.
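
        Examples
        --------
        A sketch of wrapping a `put` so that it is rolled back on error;
        ``datastore``, ``inMemoryDataset``, and ``ref`` are placeholders::

            with datastore.transaction():
                datastore.put(inMemoryDataset, ref)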
301 """
302 self._transaction = DatastoreTransaction(self._transaction)
303 try:
304 yield self._transaction
305 except BaseException:
306 self._transaction.rollback()
307 raise
308 else:
309 self._transaction.commit()
310 self._transaction = self._transaction.parent

    @abstractmethod
    def exists(self, datasetRef):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def get(self, datasetRef, parameters=None):
        """Load an `InMemoryDataset` from the store.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify a slice of the
            Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def put(self, inMemoryDataset, datasetRef):
        """Write an `InMemoryDataset` with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `InMemoryDataset`
            The Dataset to store.
        datasetRef : `DatasetRef`
            Reference to the associated Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> str:
        """Allow ingest transfer mode to be defaulted based on datasets.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`. Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        newTransfer : `str`
            Transfer mode to use. Will be identical to the supplied transfer
            mode unless "auto" is used.
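
        Examples
        --------
        A sketch of how a subclass might resolve "auto" to a concrete mode;
        the choice of "link" here is purely illustrative::

            def _overrideTransferMode(self, *datasets, transfer=None):
                if transfer == "auto":
                    return "link"
                return transfer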
384 """
385 if transfer != "auto":
386 return transfer
387 raise RuntimeError(f"{transfer} is not allowed without specialization.")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
        """Process datasets to identify which ones can be ingested into this
        Datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`. Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        data : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`, used to pass
            arbitrary data from `_prepIngest` to `_finishIngest`. This should
            include only the datasets this datastore can actually ingest;
            others should be silently ignored (`Datastore.ingest` will inspect
            `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
            necessary).

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_finishIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.

        `_prepIngest` should not modify the data repository or given files in
        any way; all changes should be deferred to `_finishIngest`.

        When possible, exceptions should be raised in `_prepIngest` instead of
        `_finishIngest`. `NotImplementedError` exceptions that indicate that
        the transfer mode is not supported must be raised by `_prepIngest`
        instead of `_finishIngest`.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def _finishIngest(self, prepData: IngestPrepData, *, transfer: Optional[str] = None):
        """Complete an ingest operation.

        Parameters
        ----------
        prepData : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`. Guaranteed to be
            the direct result of a call to `_prepIngest` on this datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_prepIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None):
        """Ingest one or more files into the datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, the one the datastore would use for ``put`` on
            that dataset is assumed.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified. Other choices include "move", "copy",
            "link", "symlink", "relsymlink", and "hardlink". "link" is a
            special transfer mode that will first try to make a hardlink and,
            if that fails, a symlink will be used instead. "relsymlink"
            creates a relative symlink rather than an absolute one. Most
            datastores do not support all transfer modes. "auto" is a special
            option that will let the datastore choose the most natural option
            for itself.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type
            that is not supported by the datastore.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        Subclasses should implement `_prepIngest` and `_finishIngest` instead
        of implementing `ingest` directly. Datastores that hold and
        delegate to child datastores may want to call those methods as well.

        Subclasses are encouraged to document their supported transfer modes
        in their class documentation.
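
        Examples
        --------
        A sketch of ingesting a single existing file by copying it into the
        datastore; ``ref`` is a resolved `DatasetRef` and the path is
        illustrative::

            from lsst.daf.butler.core.repoTransfer import FileDataset

            dataset = FileDataset(path="data/raw_0001.fits", refs=ref)
            datastore.ingest(dataset, transfer="copy")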
526 """
527 # Allow a datastore to select a default transfer mode
528 transfer = self._overrideTransferMode(*datasets, transfer=transfer)
529 prepData = self._prepIngest(*datasets, transfer=transfer)
530 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
531 if refs.keys() != prepData.refs.keys():
532 unsupported = refs.keys() - prepData.refs.keys()
533 # Group unsupported refs by DatasetType for an informative
534 # but still concise error message.
535 byDatasetType = defaultdict(list)
536 for datasetId in unsupported:
537 ref = refs[datasetId]
538 byDatasetType[ref.datasetType].append(ref)
539 raise DatasetTypeNotSupportedError(
540 "DatasetType(s) not supported in ingest: "
541 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
542 )
543 self._finishIngest(prepData, transfer=transfer)

    @abstractmethod
    def getUri(self, datasetRef):
        """URI to the Dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, the URI may be a guess.
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def remove(self, datasetRef):
        """Indicate to the Datastore that a Dataset can be removed.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def trash(self, datasetRef, ignore_errors=True):
        """Indicate to the Datastore that a Dataset can be moved to the trash.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def emptyTrash(self, ignore_errors=True):
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def transfer(self, inputDatastore, datasetRef):
        """Retrieve a Dataset from an input `Datastore`, and store the result
        in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        """Export datasets for transfer to another data repository.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            Dataset references to be exported.
        directory : `str`, optional
            Path to a directory that should contain files corresponding to
            output datasets. Ignored if ``transfer`` is `None`.
        transfer : `str`, optional
            Mode that should be used to move datasets out of the repository.
            Valid options are the same as those of the ``transfer`` argument
            to ``ingest``, and datastores may similarly signal that a transfer
            mode is not supported by raising `NotImplementedError`.

        Returns
        -------
        datasets : iterable of `FileDataset`
            Structs containing information about the exported datasets, in the
            same order as ``refs``.

        Raises
        ------
        NotImplementedError
            Raised if the given transfer mode is not supported.
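
        Examples
        --------
        A sketch of exporting datasets to a staging directory; ``refs`` is
        any iterable of resolved `DatasetRef` and the directory name is
        illustrative::

            for dataset in datastore.export(refs, directory="staging",
                                            transfer="copy"):
                print(dataset.path)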
666 """
667 raise NotImplementedError(f"Transfer mode {transfer} not supported.")

    @abstractmethod
    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be of differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.

        Notes
        -----
        Which parts of the configuration are validated is at the discretion
        of each Datastore implementation.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def validateKey(self, lookupKey, entity, logFailures=False):
        """Validate a specific lookup key with the supplied entity.

        Parameters
        ----------
        lookupKey : `LookupKey`
            Key to use to retrieve information from the datastore
            configuration.
        entity : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entity to compare with the configuration retrieved using the
            specified lookup key.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a problem with the combination of entity
            and lookup key.

        Notes
        -----
        Bypasses the normal selection priorities by allowing a key that
        would normally not be selected to be validated.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def getLookupKeys(self):
        """Return all the lookup keys relevant to this datastore.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys stored internally for looking up information based
            on `DatasetType` name or `StorageClass`.
        """
        raise NotImplementedError("Must be implemented by subclass")
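
# A minimal sketch of a concrete subclass, kept as a commented-out
# illustration of the abstract interface above (hypothetical; real
# implementations also handle configuration merging, the bridge manager,
# and ingest, and live in separate modules):
#
#     class NullDatastore(Datastore):
#         """Datastore that accepts and holds nothing."""
#
#         @classmethod
#         def setConfigRoot(cls, root, config, full, overwrite=True):
#             pass
#
#         def exists(self, datasetRef):
#             return False
#
#         def get(self, datasetRef, parameters=None):
#             raise FileNotFoundError("No datasets are ever stored.")
#
#         def put(self, inMemoryDataset, datasetRef):
#             raise DatasetTypeNotSupportedError("No dataset types supported.")
#
#         # ... plus getUri, remove, trash, emptyTrash, transfer,
#         # validateConfiguration, validateKey, and getLookupKeys.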