Coverage for python/lsst/daf/butler/datastore/_datastore.py: 63%
261 statements
coverage.py v7.4.0, created at 2024-01-16 10:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Support for generic data stores."""
30from __future__ import annotations
32__all__ = (
33 "DatasetRefURIs",
34 "Datastore",
35 "DatastoreConfig",
36 "DatastoreOpaqueTable",
37 "DatastoreValidationError",
38 "NullDatastore",
39 "DatastoreTransaction",
40)
42import contextlib
43import dataclasses
44import logging
45import time
46from abc import ABCMeta, abstractmethod
47from collections import abc, defaultdict
48from collections.abc import Callable, Iterable, Iterator, Mapping
49from typing import TYPE_CHECKING, Any, ClassVar
51from lsst.utils import doImportType
53from .._config import Config, ConfigSubset
54from .._exceptions import DatasetTypeNotSupportedError, ValidationError
55from .._file_dataset import FileDataset
56from .._storage_class import StorageClassFactory
57from .constraints import Constraints
59if TYPE_CHECKING:
60 from lsst.resources import ResourcePath, ResourcePathExpression
62 from .. import ddl
63 from .._config_support import LookupKey
64 from .._dataset_ref import DatasetRef
65 from .._dataset_type import DatasetType
66 from .._storage_class import StorageClass
67 from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
68 from .record_data import DatastoreRecordData
69 from .stored_file_info import StoredDatastoreItemInfo
71_LOG = logging.getLogger(__name__)
74class DatastoreConfig(ConfigSubset):
75 """Configuration for Datastores."""
77 component = "datastore"
78 requiredKeys = ("cls",)
79 defaultConfigFile = "datastore.yaml"
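# A minimal sketch of the configuration shape ``DatastoreConfig`` expects: the
# "datastore" component must supply at least the required "cls" key. The import
# path and "root" value below are illustrative assumptions, not taken from this
# module.
#
#     config = DatastoreConfig(
#         {
#             "datastore": {
#                 "cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore",
#                 "root": "<butlerRoot>",
#             }
#         }
#     )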
82class DatastoreValidationError(ValidationError):
83 """There is a problem with the Datastore configuration."""
85 pass
88@dataclasses.dataclass(frozen=True)
89class Event:
90 """Representation of an event that can be rolled back."""
92 __slots__ = {"name", "undoFunc", "args", "kwargs"}
93 name: str
94 undoFunc: Callable
95 args: tuple
96 kwargs: dict
99@dataclasses.dataclass(frozen=True)
100class DatastoreOpaqueTable:
101 """Definition of the opaque table which stores datastore records.
103 The table definition contains a `.ddl.TableSpec` for the table and the
104 record class, which must be a subclass of `StoredDatastoreItemInfo`.
105 """
107 __slots__ = {"table_spec", "record_class"}
108 table_spec: ddl.TableSpec
109 record_class: type[StoredDatastoreItemInfo]
112class IngestPrepData:
113 """A helper base class for `Datastore` ingest implementations.
115 Datastore implementations will generally need a custom implementation of
116 this class.
118 Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
119 import.
121 Parameters
122 ----------
123 refs : iterable of `DatasetRef`
124 References for the datasets that can be ingested by this datastore.
125 """
127 def __init__(self, refs: Iterable[DatasetRef]):
128 self.refs = {ref.id: ref for ref in refs}
131class DatastoreTransaction:
132 """Keeps a log of `Datastore` activity and allow rollback.
134 Parameters
135 ----------
136 parent : `DatastoreTransaction`, optional
137 The parent transaction (if any).
138 """
140 Event: ClassVar[type] = Event
142 parent: DatastoreTransaction | None
143 """The parent transaction. (`DatastoreTransaction`, optional)"""
145 def __init__(self, parent: DatastoreTransaction | None = None):
146 self.parent = parent
147 self._log: list[Event] = []
149 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
150 """Register event with undo function.
152 Parameters
153 ----------
154 name : `str`
155 Name of the event.
156 undoFunc : `~collections.abc.Callable`
157 Function to undo this event.
158 *args : `tuple`
159 Positional arguments to ``undoFunc``.
160 **kwargs
161 Keyword arguments to ``undoFunc``.
162 """
163 self._log.append(self.Event(name, undoFunc, args, kwargs))
165 @contextlib.contextmanager
166 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
167 """Register undo function if nested operation succeeds.
169 Calls `registerUndo`.
171 This can be used to wrap individual undo-able statements within a
172 DatastoreTransaction block. Multiple statements that can fail
173 separately should not be part of the same `undoWith` block.
175 All arguments are forwarded directly to `registerUndo`.
177 Parameters
178 ----------
179 name : `str`
180 The name to associate with this event.
181 undoFunc : `~collections.abc.Callable`
182 Function to undo this event.
183 *args : `tuple`
184 Positional arguments for ``undoFunc``.
185 **kwargs : `typing.Any`
186 Keyword arguments for ``undoFunc``.
187 """
188 try:
189 yield None
190 except BaseException:
191 raise
192 else:
193 self.registerUndo(name, undoFunc, *args, **kwargs)
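# An illustrative sketch of registering undo actions, both directly and via
# undoWith(); ``_unlink``, ``do_copy`` and the paths are hypothetical stand-ins.
#
#     txn = DatastoreTransaction()
#     txn.registerUndo("write", _unlink, "/tmp/primary.fits")
#     with txn.undoWith("copy", _unlink, "/tmp/copy.fits"):
#         do_copy()  # the undo is only registered if this block succeeds
#     txn.rollback()  # invokes the registered undo functions in reverse order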
195 def rollback(self) -> None:
196 """Roll back all events in this transaction."""
197 log = logging.getLogger(__name__)
198 while self._log:
199 ev = self._log.pop()
200 try:
201 log.debug(
202 "Rolling back transaction: %s: %s(%s,%s)",
203 ev.name,
204 ev.undoFunc,
205 ",".join(str(a) for a in ev.args),
206 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()),
207 )
208 except Exception:
209 # In case there was a problem stringifying the arguments
210 log.warning("Rolling back transaction: %s", ev.name)
211 try:
212 ev.undoFunc(*ev.args, **ev.kwargs)
213 except BaseException as e:
214 # Deliberately swallow error that may occur in unrolling
215 log.warning("Exception: %s caught while unrolling: %s", e, ev.name)
216 pass
218 def commit(self) -> None:
219 """Commit this transaction."""
220 if self.parent is None:
221 # Just forget about the events, they have already happened.
222 return
223 else:
224 # We may still want to apply the events from this transaction as
225 # part of the parent.
226 self.parent._log.extend(self._log)
229@dataclasses.dataclass
230class DatasetRefURIs(abc.Sequence):
231 """Represents the primary and component ResourcePath(s) associated with a
232 DatasetRef.
234 This is used in places where its members used to be represented as a tuple
235 `(primaryURI, componentURIs)`. To maintain backward compatibility this
236 inherits from Sequence and so instances can be treated as a two-item
237 tuple.
239 Parameters
240 ----------
241 primaryURI : `lsst.resources.ResourcePath` or `None`, optional
242 The URI to the primary artifact associated with this dataset. If the
243 dataset was disassembled within the datastore this may be `None`.
244 componentURIs : `dict` [`str`, `~lsst.resources.ResourcePath`] or `None`
245 The URIs to any components associated with the dataset artifact
246 indexed by component name. This can be empty if there are no
247 components.
248 """
250 def __init__(
251 self,
252 primaryURI: ResourcePath | None = None,
253 componentURIs: dict[str, ResourcePath] | None = None,
254 ):
255 self.primaryURI = primaryURI
256 self.componentURIs = componentURIs or {}
258 def __getitem__(self, index: Any) -> Any:
259 """Get primaryURI and componentURIs by index.
261 Provides support for tuple-like access.
262 """
263 if index == 0:
264 return self.primaryURI
265 elif index == 1:
266 return self.componentURIs
267 raise IndexError("list index out of range")
269 def __len__(self) -> int:
270 """Get the number of data members.
272 Provides support for tuple-like access.
273 """
274 return 2
276 def __repr__(self) -> str:
277 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
280class Datastore(metaclass=ABCMeta):
281 """Datastore interface.
283 Parameters
284 ----------
285 config : `DatastoreConfig` or `str`
286 Load configuration either from an existing config instance or by
287 referring to a configuration file.
288 bridgeManager : `DatastoreRegistryBridgeManager`
289 Object that manages the interface between `Registry` and datastores.
290 butlerRoot : `str`, optional
291 New datastore root to use to override the configuration value.
292 """
294 defaultConfigFile: ClassVar[str | None] = None
295 """Path to configuration defaults. Accessed within the ``config`` resource
296 or relative to a search path. Can be `None` if no defaults are specified.
297 """
299 containerKey: ClassVar[str | None] = None
300 """Name of the key containing a list of subconfigurations that also
301 need to be merged with defaults and will likely use different Python
302 datastore classes (but all using DatastoreConfig). Assumed to be a
303 list of configurations that can be represented in a DatastoreConfig
304 and that each contain a "cls" definition. `None` indicates that no
305 containers are expected in this Datastore."""
307 isEphemeral: bool = False
308 """Indicate whether this Datastore is ephemeral or not. An ephemeral
309 datastore is one where the contents of the datastore will not exist
310 across process restarts. This value can change per-instance."""
312 config: DatastoreConfig
313 """Configuration used to create Datastore."""
315 name: str
316 """Label associated with this Datastore."""
318 storageClassFactory: StorageClassFactory
319 """Factory for creating storage class instances from name."""
321 constraints: Constraints
322 """Constraints to apply when putting datasets into the datastore."""
324 # MyPy does not like for this to be annotated as any kind of type, because
325 # it can't do static checking on type variables that can change at runtime.
326 IngestPrepData: ClassVar[Any] = IngestPrepData
327 """Helper base class for ingest implementations.
328 """
330 @classmethod
331 @abstractmethod
332 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
333 """Set filesystem-dependent config options for this datastore.
335 The options will be appropriate for a new empty repository with the
336 given root.
338 Parameters
339 ----------
340 root : `str`
341 Filesystem path to the root of the data repository.
342 config : `Config`
343 A `Config` to update. Only the subset understood by
344 this component will be updated. Will not expand
345 defaults.
346 full : `Config`
347 A complete config with all defaults expanded that can be
348 converted to a `DatastoreConfig`. Read-only and will not be
349 modified by this method.
350 Repository-specific options that should not be obtained
351 from defaults when Butler instances are constructed
352 should be copied from ``full`` to ``config``.
353 overwrite : `bool`, optional
354 If `False`, do not modify a value in ``config`` if the value
355 already exists. Default is always to overwrite with the provided
356 ``root``.
358 Notes
359 -----
360 If a keyword is explicitly defined in the supplied ``config`` it
361 will not be overridden by this method if ``overwrite`` is `False`.
362 This allows explicit values set in external configs to be retained.
363 """
364 raise NotImplementedError()
366 @staticmethod
367 def fromConfig(
368 config: Config,
369 bridgeManager: DatastoreRegistryBridgeManager,
370 butlerRoot: ResourcePathExpression | None = None,
371 ) -> Datastore:
372 """Create datastore from type specified in config file.
374 Parameters
375 ----------
376 config : `Config` or `~lsst.resources.ResourcePathExpression`
377 Configuration instance.
378 bridgeManager : `DatastoreRegistryBridgeManager`
379 Object that manages the interface between `Registry` and
380 datastores.
381 butlerRoot : `str`, optional
382 Butler root directory.
383 """
384 cls = doImportType(config["datastore", "cls"])
385 if not issubclass(cls, Datastore):
386 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore")
387 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)
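# A hedged usage sketch: ``config`` must contain a ("datastore", "cls") entry
# naming a Datastore subclass, ``bridge_manager`` comes from the registry
# layer, and the repository path is illustrative.
#
#     datastore = Datastore.fromConfig(
#         config, bridgeManager=bridge_manager, butlerRoot="/path/to/repo"
#     )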
389 def __init__(
390 self,
391 config: Config | ResourcePathExpression,
392 bridgeManager: DatastoreRegistryBridgeManager,
393 butlerRoot: ResourcePathExpression | None = None,
394 ):
395 self.config = DatastoreConfig(config)
396 self.name = "ABCDataStore"
397 self._transaction: DatastoreTransaction | None = None
399 # All Datastores need storage classes and constraints
400 self.storageClassFactory = StorageClassFactory()
402 # And read the constraints list
403 constraintsConfig = self.config.get("constraints")
404 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)
406 def __str__(self) -> str:
407 return self.name
409 def __repr__(self) -> str:
410 return self.name
412 @property
413 def names(self) -> tuple[str, ...]:
414 """Names associated with this datastore returned as a list.
416 Can be different to ``name`` for a chaining datastore.
417 """
418 # Default implementation returns solely the name itself
419 return (self.name,)
421 @property
422 def roots(self) -> dict[str, ResourcePath | None]:
423 """Return the root URIs for each named datastore.
425 Mapping from datastore name to root URI. The URI can be `None`
426 if a datastore has no concept of a root URI.
427 (`dict` [`str`, `ResourcePath` | `None`])
428 """
429 return {self.name: None}
431 @contextlib.contextmanager
432 def transaction(self) -> Iterator[DatastoreTransaction]:
433 """Context manager supporting `Datastore` transactions.
435 Transactions can be nested, and are to be used in combination with
436 `Registry.transaction`.
437 """
438 self._transaction = DatastoreTransaction(self._transaction)
439 try:
440 yield self._transaction
441 except BaseException:
442 self._transaction.rollback()
443 raise
444 else:
445 self._transaction.commit()
446 self._transaction = self._transaction.parent
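# A sketch of the intended usage: operations performed inside the block
# register undo actions on the active transaction; an exception triggers
# rollback, otherwise the events are committed (or handed to the parent
# transaction when nested). ``obj`` and ``ref`` are assumed to exist.
#
#     with datastore.transaction():
#         datastore.put(obj, ref)
#     # raising inside the block would have rolled back the put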
448 @abstractmethod
449 def knows(self, ref: DatasetRef) -> bool:
450 """Check if the dataset is known to the datastore.
452 Does not check for existence of any artifact.
454 Parameters
455 ----------
456 ref : `DatasetRef`
457 Reference to the required dataset.
459 Returns
460 -------
461 exists : `bool`
462 `True` if the dataset is known to the datastore.
463 """
464 raise NotImplementedError()
466 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
467 """Check which of the given datasets are known to this datastore.
469 This is like ``mexists()`` but does not check that the artifact exists.
471 Parameters
472 ----------
473 refs : iterable of `DatasetRef`
474 The datasets to check.
476 Returns
477 -------
478 exists : `dict`[`DatasetRef`, `bool`]
479 Mapping of dataset to boolean indicating whether the dataset
480 is known to the datastore.
481 """
482 # Non-optimized default calls knows() repeatedly.
483 return {ref: self.knows(ref) for ref in refs}
485 def mexists(
486 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
487 ) -> dict[DatasetRef, bool]:
488 """Check the existence of multiple datasets at once.
490 Parameters
491 ----------
492 refs : iterable of `DatasetRef`
493 The datasets to be checked.
494 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
495 Optional mapping of datastore artifact to existence. Updated by
496 this method with details of all artifacts tested. Can be `None`
497 if the caller is not interested.
499 Returns
500 -------
501 existence : `dict` [`DatasetRef`, `bool`]
502 Mapping from dataset to boolean indicating existence.
503 """
504 existence: dict[DatasetRef, bool] = {}
505 # Non-optimized default.
506 for ref in refs:
507 existence[ref] = self.exists(ref)
508 return existence
510 @abstractmethod
511 def exists(self, datasetRef: DatasetRef) -> bool:
512 """Check if the dataset exists in the datastore.
514 Parameters
515 ----------
516 datasetRef : `DatasetRef`
517 Reference to the required dataset.
519 Returns
520 -------
521 exists : `bool`
522 `True` if the entity exists in the `Datastore`.
523 """
524 raise NotImplementedError("Must be implemented by subclass")
526 @abstractmethod
527 def get(
528 self,
529 datasetRef: DatasetRef,
530 parameters: Mapping[str, Any] | None = None,
531 storageClass: StorageClass | str | None = None,
532 ) -> Any:
533 """Load an `InMemoryDataset` from the store.
535 Parameters
536 ----------
537 datasetRef : `DatasetRef`
538 Reference to the required Dataset.
539 parameters : `dict`
540 `StorageClass`-specific parameters that specify a slice of the
541 Dataset to be loaded.
542 storageClass : `StorageClass` or `str`, optional
543 The storage class to be used to override the Python type
544 returned by this method. By default the returned type matches
545 the dataset type definition for this dataset. Specifying a
546 read `StorageClass` can force a different type to be returned.
547 This type must be compatible with the original type.
549 Returns
550 -------
551 inMemoryDataset : `object`
552 Requested Dataset or slice thereof as an InMemoryDataset.
553 """
554 raise NotImplementedError("Must be implemented by subclass")
556 def prepare_get_for_external_client(self, ref: DatasetRef) -> object:
557 """Retrieve serializable data that can be used to execute a ``get()``.
559 Parameters
560 ----------
561 ref : `DatasetRef`
562 Reference to the required dataset.
564 Returns
565 -------
566 payload : `object`
567 Serializable payload containing the information needed to perform a
568 get() operation. This payload may be sent over the wire to another
569 system to perform the get().
570 """
571 raise NotImplementedError()
573 @abstractmethod
574 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
575 """Write a `InMemoryDataset` with a given `DatasetRef` to the store.
577 Parameters
578 ----------
579 inMemoryDataset : `object`
580 The Dataset to store.
581 datasetRef : `DatasetRef`
582 Reference to the associated Dataset.
583 """
584 raise NotImplementedError("Must be implemented by subclass")
586 @abstractmethod
587 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
588 """Write a `InMemoryDataset` with a given `DatasetRef` to the store.
590 Parameters
591 ----------
592 in_memory_dataset : `object`
593 The Dataset to store.
594 ref : `DatasetRef`
595 Reference to the associated Dataset.
597 Returns
598 -------
599 datastore_refs : `~collections.abc.Mapping` [`str`, `DatasetRef`]
600 Mapping of a datastore name to the dataset reference stored in that
601 datastore; the reference will include datastore records. Only
602 non-ephemeral datastores will appear in this mapping.
603 """
604 raise NotImplementedError("Must be implemented by subclass")
606 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
607 """Allow ingest transfer mode to be defaulted based on datasets.
609 Parameters
610 ----------
611 *datasets : `FileDataset`
612 Each positional argument is a struct containing information about
613 a file to be ingested, including its path (either absolute or
614 relative to the datastore root, if applicable), a complete
615 `DatasetRef` (with ``dataset_id not None``), and optionally a
616 formatter class or its fully-qualified string name. If a formatter
617 is not provided, this method should populate that attribute with
618 the formatter the datastore would use for `put`. Subclasses are
619 also permitted to modify the path attribute (typically to put it
620 in what the datastore considers its standard form).
621 transfer : `str`, optional
622 How (and whether) the dataset should be added to the datastore.
623 See `ingest` for details of transfer modes.
625 Returns
626 -------
627 newTransfer : `str`
628 Transfer mode to use. Will be identical to the supplied transfer
629 mode unless "auto" is used.
630 """
631 if transfer != "auto":
632 return transfer
633 raise RuntimeError(f"{transfer} is not allowed without specialization.")
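# A sketch of how a subclass might specialize this hook so that "auto"
# resolves to a concrete mode; the choice of "link" here is an assumption,
# not what any particular datastore does.
#
#     def _overrideTransferMode(self, *datasets, transfer=None):
#         return "link" if transfer == "auto" else transfer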
635 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
636 """Process datasets to identify which ones can be ingested.
638 Parameters
639 ----------
640 *datasets : `FileDataset`
641 Each positional argument is a struct containing information about
642 a file to be ingested, including its path (either absolute or
643 relative to the datastore root, if applicable), a complete
644 `DatasetRef` (with ``dataset_id not None``), and optionally a
645 formatter class or its fully-qualified string name. If a formatter
646 is not provided, this method should populate that attribute with
647 the formatter the datastore would use for `put`. Subclasses are
648 also permitted to modify the path attribute (typically to put it
649 in what the datastore considers its standard form).
650 transfer : `str`, optional
651 How (and whether) the dataset should be added to the datastore.
652 See `ingest` for details of transfer modes.
654 Returns
655 -------
656 data : `IngestPrepData`
657 An instance of a subclass of `IngestPrepData`, used to pass
658 arbitrary data from `_prepIngest` to `_finishIngest`. This should
659 include only the datasets this datastore can actually ingest;
660 others should be silently ignored (`Datastore.ingest` will inspect
661 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
662 necessary).
664 Raises
665 ------
666 NotImplementedError
667 Raised if the datastore does not support the given transfer mode
668 (including the case where ingest is not supported at all).
669 FileNotFoundError
670 Raised if one of the given files does not exist.
671 FileExistsError
672 Raised if transfer is not `None` but the (internal) location the
673 file would be moved to is already occupied.
675 Notes
676 -----
677 This method (along with `_finishIngest`) should be implemented by
678 subclasses to provide ingest support instead of implementing `ingest`
679 directly.
681 `_prepIngest` should not modify the data repository or given files in
682 any way; all changes should be deferred to `_finishIngest`.
684 When possible, exceptions should be raised in `_prepIngest` instead of
685 `_finishIngest`. `NotImplementedError` exceptions that indicate that
686 the transfer mode is not supported must be raised by `_prepIngest`
687 instead of `_finishIngest`.
688 """
689 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
691 def _finishIngest(
692 self, prepData: IngestPrepData, *, transfer: str | None = None, record_validation_info: bool = True
693 ) -> None:
694 """Complete an ingest operation.
696 Parameters
697 ----------
698 prepData : `IngestPrepData`
699 An instance of a subclass of `IngestPrepData`. Guaranteed to be
700 the direct result of a call to `_prepIngest` on this datastore.
701 transfer : `str`, optional
702 How (and whether) the dataset should be added to the datastore.
703 See `ingest` for details of transfer modes.
704 record_validation_info : `bool`, optional
705 If `True`, the default, the datastore can record validation
706 information associated with the file. If `False` the datastore
707 will not attempt to track any information such as checksums
708 or file sizes. This can be useful if such information is tracked
709 in an external system or if the file is to be compressed in place.
710 It is up to the datastore whether this parameter is relevant.
712 Raises
713 ------
714 FileNotFoundError
715 Raised if one of the given files does not exist.
716 FileExistsError
717 Raised if transfer is not `None` but the (internal) location the
718 file would be moved to is already occupied.
720 Notes
721 -----
722 This method (along with `_prepIngest`) should be implemented by
723 subclasses to provide ingest support instead of implementing `ingest`
724 directly.
725 """
726 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
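# A minimal sketch of the subclass contract described above: _prepIngest
# filters and validates without modifying anything, while _finishIngest does
# the actual work. The class and the ``_store_artifact`` helper are
# hypothetical.
#
#     class SketchDatastore(Datastore):
#         def _prepIngest(self, *datasets, transfer=None):
#             if transfer not in (None, "copy", "move"):
#                 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
#             refs = [ref for dataset in datasets for ref in dataset.refs]
#             return self.IngestPrepData(refs)
#
#         def _finishIngest(self, prepData, *, transfer=None, record_validation_info=True):
#             for ref in prepData.refs.values():
#                 self._store_artifact(ref, transfer)  # hypothetical helper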
728 def ingest(
729 self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True
730 ) -> None:
731 """Ingest one or more files into the datastore.
733 Parameters
734 ----------
735 *datasets : `FileDataset`
736 Each positional argument is a struct containing information about
737 a file to be ingested, including its path (either absolute or
738 relative to the datastore root, if applicable), a complete
739 `DatasetRef` (with ``dataset_id not None``), and optionally a
740 formatter class or its fully-qualified string name. If a formatter
741 is not provided, the one the datastore would use for ``put`` on
742 that dataset is assumed.
743 transfer : `str`, optional
744 How (and whether) the dataset should be added to the datastore.
745 If `None` (default), the file must already be in a location
746 appropriate for the datastore (e.g. within its root directory),
747 and will not be modified. Other choices include "move", "copy",
748 "link", "symlink", "relsymlink", and "hardlink". "link" is a
749 special transfer mode that will first try to make a hardlink and
750 if that fails a symlink will be used instead. "relsymlink" creates
751 a relative symlink rather than using an absolute path.
752 Most datastores do not support all transfer modes.
753 "auto" is a special option that will let the
754 data store choose the most natural option for itself.
755 record_validation_info : `bool`, optional
756 If `True`, the default, the datastore can record validation
757 information associated with the file. If `False` the datastore
758 will not attempt to track any information such as checksums
759 or file sizes. This can be useful if such information is tracked
760 in an external system or if the file is to be compressed in place.
761 It is up to the datastore whether this parameter is relevant.
763 Raises
764 ------
765 NotImplementedError
766 Raised if the datastore does not support the given transfer mode
767 (including the case where ingest is not supported at all).
768 DatasetTypeNotSupportedError
769 Raised if one or more files to be ingested have a dataset type that
770 is not supported by the datastore.
771 FileNotFoundError
772 Raised if one of the given files does not exist.
773 FileExistsError
774 Raised if transfer is not `None` but the (internal) location the
775 file would be moved to is already occupied.
777 Notes
778 -----
779 Subclasses should implement `_prepIngest` and `_finishIngest` instead
780 of implementing `ingest` directly. Datastores that hold and
781 delegate to child datastores may want to call those methods as well.
783 Subclasses are encouraged to document their supported transfer modes
784 in their class documentation.
785 """
786 # Allow a datastore to select a default transfer mode
787 transfer = self._overrideTransferMode(*datasets, transfer=transfer)
788 prepData = self._prepIngest(*datasets, transfer=transfer)
789 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
790 if refs.keys() != prepData.refs.keys():
791 unsupported = refs.keys() - prepData.refs.keys()
792 # Group unsupported refs by DatasetType for an informative
793 # but still concise error message.
794 byDatasetType = defaultdict(list)
795 for datasetId in unsupported:
796 ref = refs[datasetId]
797 byDatasetType[ref.datasetType].append(ref)
798 raise DatasetTypeNotSupportedError(
799 "DatasetType(s) not supported in ingest: "
800 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
801 )
802 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info)
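# A hedged call sketch: the path is illustrative and ``ref`` must be a
# resolved DatasetRef appropriate for this repository.
#
#     datastore.ingest(
#         FileDataset(path="raw/exposure_001.fits", refs=[ref]),
#         transfer="copy",
#     )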
804 def transfer_from(
805 self,
806 source_datastore: Datastore,
807 refs: Iterable[DatasetRef],
808 transfer: str = "auto",
809 artifact_existence: dict[ResourcePath, bool] | None = None,
810 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
811 """Transfer dataset artifacts from another datastore to this one.
813 Parameters
814 ----------
815 source_datastore : `Datastore`
816 The datastore from which to transfer artifacts. That datastore
817 must be compatible with this datastore receiving the artifacts.
818 refs : iterable of `DatasetRef`
819 The datasets to transfer from the source datastore.
820 transfer : `str`, optional
821 How (and whether) the dataset should be added to the datastore.
822 Choices include "move", "copy",
823 "link", "symlink", "relsymlink", and "hardlink". "link" is a
824 special transfer mode that will first try to make a hardlink and
825 if that fails a symlink will be used instead. "relsymlink" creates
826 a relative symlink rather than using an absolute path.
827 Most datastores do not support all transfer modes.
828 "auto" (the default) is a special option that will let the
829 data store choose the most natural option for itself.
830 If the source location and transfer location are identical the
831 transfer mode will be ignored.
832 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
833 Optional mapping of datastore artifact to existence. Updated by
834 this method with details of all artifacts tested. Can be `None`
835 if the caller is not interested.
837 Returns
838 -------
839 accepted : `set` [`DatasetRef`]
840 The datasets that were transferred.
841 rejected : `set` [`DatasetRef`]
842 The datasets that were rejected due to a constraints violation.
844 Raises
845 ------
846 TypeError
847 Raised if the two datastores are not compatible.
848 """
849 if type(self) is not type(source_datastore):
850 raise TypeError(
851 f"Datastore mismatch between this datastore ({type(self)}) and the "
852 f"source datastore ({type(source_datastore)})."
853 )
855 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.")
857 def getManyURIs(
858 self,
859 refs: Iterable[DatasetRef],
860 predict: bool = False,
861 allow_missing: bool = False,
862 ) -> dict[DatasetRef, DatasetRefURIs]:
863 """Return URIs associated with many datasets.
865 Parameters
866 ----------
867 refs : iterable of `DatasetIdRef`
868 References to the required datasets.
869 predict : `bool`, optional
870 If `True`, allow URIs to be returned of datasets that have not
871 been written.
872 allow_missing : `bool`
873 If `False`, and ``predict`` is `False`, will raise if a
874 `DatasetRef` does not exist.
876 Returns
877 -------
878 URIs : `dict` [`DatasetRef`, `DatasetRefURIs`]
879 A dict of primary and component URIs, indexed by the passed-in
880 refs.
882 Raises
883 ------
884 FileNotFoundError
885 A URI has been requested for a dataset that does not exist and
886 guessing is not allowed.
888 Notes
889 -----
890 In file-based datastores, getManyURIs does not check that the file is
891 really there; it assumes that if the datastore is aware of the file
892 then it actually exists.
893 """
894 uris: dict[DatasetRef, DatasetRefURIs] = {}
895 missing_refs = []
896 for ref in refs:
897 try:
898 uris[ref] = self.getURIs(ref, predict=predict)
899 except FileNotFoundError:
900 missing_refs.append(ref)
901 if missing_refs and not allow_missing:
902 raise FileNotFoundError(
903 "Missing {} refs from datastore out of {} and predict=False.".format(
904 num_missing := len(missing_refs), num_missing + len(uris)
905 )
906 )
907 return uris
909 @abstractmethod
910 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
911 """Return URIs associated with dataset.
913 Parameters
914 ----------
915 datasetRef : `DatasetRef`
916 Reference to the required dataset.
917 predict : `bool`, optional
918 If the datastore does not know about the dataset, controls whether
919 it should return a predicted URI or not.
921 Returns
922 -------
923 uris : `DatasetRefURIs`
924 The URI to the primary artifact associated with this dataset (if
925 the dataset was disassembled within the datastore this may be
926 `None`), and the URIs to any components associated with the dataset
927 artifact (which can be empty if there are no components).
928 """
929 raise NotImplementedError()
931 @abstractmethod
932 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
933 """URI to the Dataset.
935 Parameters
936 ----------
937 datasetRef : `DatasetRef`
938 Reference to the required Dataset.
939 predict : `bool`
940 If `True` attempt to predict the URI for a dataset if it does
941 not exist in datastore.
943 Returns
944 -------
945 uri : `lsst.resources.ResourcePath`
946 URI pointing to the Dataset within the datastore. If the
947 Dataset does not exist in the datastore, the URI may be a guess.
948 If the datastore does not have entities that relate well
949 to the concept of a URI the returned URI string will be
950 descriptive. The returned URI is not guaranteed to be obtainable.
952 Raises
953 ------
954 FileNotFoundError
955 A URI has been requested for a dataset that does not exist and
956 guessing is not allowed.
957 """
958 raise NotImplementedError("Must be implemented by subclass")
960 @abstractmethod
961 def retrieveArtifacts(
962 self,
963 refs: Iterable[DatasetRef],
964 destination: ResourcePath,
965 transfer: str = "auto",
966 preserve_path: bool = True,
967 overwrite: bool = False,
968 ) -> list[ResourcePath]:
969 """Retrieve the artifacts associated with the supplied refs.
971 Parameters
972 ----------
973 refs : iterable of `DatasetRef`
974 The datasets for which artifacts are to be retrieved.
975 A single ref can result in multiple artifacts. The refs must
976 be resolved.
977 destination : `lsst.resources.ResourcePath`
978 Location to write the artifacts.
979 transfer : `str`, optional
980 Method to use to transfer the artifacts. Must be one of the options
981 supported by `lsst.resources.ResourcePath.transfer_from()`.
982 "move" is not allowed.
983 preserve_path : `bool`, optional
984 If `True` the full path of the artifact within the datastore
985 is preserved. If `False` the final file component of the path
986 is used.
987 overwrite : `bool`, optional
988 If `True` allow transfers to overwrite existing files at the
989 destination.
991 Returns
992 -------
993 targets : `list` of `lsst.resources.ResourcePath`
994 URIs of file artifacts in destination location. Order is not
995 preserved.
997 Notes
998 -----
999 For non-file datastores the artifacts written to the destination
1000 may not match the representation inside the datastore. For example
1001 a hierarchical data structure in a NoSQL database may well be stored
1002 as a JSON file.
1003 """
1004 raise NotImplementedError()
1006 @abstractmethod
1007 def remove(self, datasetRef: DatasetRef) -> None:
1008 """Indicate to the Datastore that a Dataset can be removed.
1010 Parameters
1011 ----------
1012 datasetRef : `DatasetRef`
1013 Reference to the required Dataset.
1015 Raises
1016 ------
1017 FileNotFoundError
1018 When Dataset does not exist.
1020 Notes
1021 -----
1022 Some Datastores may implement this method as a silent no-op to
1023 disable Dataset deletion through standard interfaces.
1024 """
1025 raise NotImplementedError("Must be implemented by subclass")
1027 @abstractmethod
1028 def forget(self, refs: Iterable[DatasetRef]) -> None:
1029 """Indicate to the Datastore that it should remove all records of the
1030 given datasets, without actually deleting them.
1032 Parameters
1033 ----------
1034 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1035 References to the datasets being forgotten.
1037 Notes
1038 -----
1039 Asking a datastore to forget a `DatasetRef` it does not hold should be
1040 a silent no-op, not an error.
1041 """
1042 raise NotImplementedError("Must be implemented by subclass")
1044 @abstractmethod
1045 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
1046 """Indicate to the Datastore that a Dataset can be moved to the trash.
1048 Parameters
1049 ----------
1050 ref : `DatasetRef` or iterable thereof
1051 Reference(s) to the required Dataset.
1052 ignore_errors : `bool`, optional
1053 Determine whether errors should be ignored. When multiple
1054 refs are being trashed there will be no per-ref check.
1056 Raises
1057 ------
1058 FileNotFoundError
1059 When Dataset does not exist and errors are not ignored. Only
1060 checked if a single ref is supplied (and not in a list).
1062 Notes
1063 -----
1064 Some Datastores may implement this method as a silent no-op to
1065 disable Dataset deletion through standard interfaces.
1066 """
1067 raise NotImplementedError("Must be implemented by subclass")
1069 @abstractmethod
1070 def emptyTrash(self, ignore_errors: bool = True) -> None:
1071 """Remove all datasets from the trash.
1073 Parameters
1074 ----------
1075 ignore_errors : `bool`, optional
1076 Determine whether errors should be ignored.
1078 Notes
1079 -----
1080 Some Datastores may implement this method as a silent no-op to
1081 disable Dataset deletion through standard interfaces.
1082 """
1083 raise NotImplementedError("Must be implemented by subclass")
1085 @abstractmethod
1086 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
1087 """Transfer a dataset from another datastore to this datastore.
1089 Parameters
1090 ----------
1091 inputDatastore : `Datastore`
1092 The external `Datastore` from which to retrieve the Dataset.
1093 datasetRef : `DatasetRef`
1094 Reference to the required Dataset.
1095 """
1096 raise NotImplementedError("Must be implemented by subclass")
1098 def export(
1099 self,
1100 refs: Iterable[DatasetRef],
1101 *,
1102 directory: ResourcePathExpression | None = None,
1103 transfer: str | None = "auto",
1104 ) -> Iterable[FileDataset]:
1105 """Export datasets for transfer to another data repository.
1107 Parameters
1108 ----------
1109 refs : iterable of `DatasetRef`
1110 Dataset references to be exported.
1111 directory : `str`, optional
1112 Path to a directory that should contain files corresponding to
1113 output datasets. Ignored if ``transfer`` is explicitly `None`.
1114 transfer : `str`, optional
1115 Mode that should be used to move datasets out of the repository.
1116 Valid options are the same as those of the ``transfer`` argument
1117 to ``ingest``, and datastores may similarly signal that a transfer
1118 mode is not supported by raising `NotImplementedError`. If "auto"
1119 is given and no ``directory`` is specified, `None` will be
1120 implied.
1122 Returns
1123 -------
1124 datasets : iterable of `FileDataset`
1125 Structs containing information about the exported datasets, in the
1126 same order as ``refs``.
1128 Raises
1129 ------
1130 NotImplementedError
1131 Raised if the given transfer mode is not supported.
1132 """
1133 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
1135 @abstractmethod
1136 def validateConfiguration(
1137 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
1138 ) -> None:
1139 """Validate some of the configuration for this datastore.
1141 Parameters
1142 ----------
1143 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1144 Entities to test against this configuration. Can be of differing
1145 types.
1146 logFailures : `bool`, optional
1147 If `True`, output a log message for every validation error
1148 detected.
1150 Raises
1151 ------
1152 DatastoreValidationError
1153 Raised if there is a validation problem with a configuration.
1155 Notes
1156 -----
1157 Which parts of the configuration are validated is at the discretion
1158 of each Datastore implementation.
1159 """
1160 raise NotImplementedError("Must be implemented by subclass")
1162 @abstractmethod
1163 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
1164 """Validate a specific look up key with supplied entity.
1166 Parameters
1167 ----------
1168 lookupKey : `LookupKey`
1169 Key to use to retrieve information from the datastore
1170 configuration.
1171 entity : `DatasetRef`, `DatasetType`, or `StorageClass`
1172 Entity to compare with configuration retrieved using the
1173 specified lookup key.
1175 Raises
1176 ------
1177 DatastoreValidationError
1178 Raised if there is a problem with the combination of entity
1179 and lookup key.
1181 Notes
1182 -----
1183 Bypasses the normal selection priorities by allowing a key that
1184 would normally not be selected to be validated.
1185 """
1186 raise NotImplementedError("Must be implemented by subclass")
1188 @abstractmethod
1189 def getLookupKeys(self) -> set[LookupKey]:
1190 """Return all the lookup keys relevant to this datastore.
1192 Returns
1193 -------
1194 keys : `set` of `LookupKey`
1195 The keys stored internally for looking up information based
1196 on `DatasetType` name or `StorageClass`.
1197 """
1198 raise NotImplementedError("Must be implemented by subclass")
1200 def needs_expanded_data_ids(
1201 self,
1202 transfer: str | None,
1203 entity: DatasetRef | DatasetType | StorageClass | None = None,
1204 ) -> bool:
1205 """Test whether this datastore needs expanded data IDs to ingest.
1207 Parameters
1208 ----------
1209 transfer : `str` or `None`
1210 Transfer mode for ingest.
1211 entity : `DatasetRef` or `DatasetType` or `StorageClass` or `None`, \
1212 optional
1213 Object representing what will be ingested. If not provided (or not
1214 specific enough), `True` may be returned even if expanded data
1215 IDs aren't necessary.
1217 Returns
1218 -------
1219 needed : `bool`
1220 If `True`, expanded data IDs may be needed. `False` only if
1221 expansion definitely isn't necessary.
1222 """
1223 return True
1225 @abstractmethod
1226 def import_records(
1227 self,
1228 data: Mapping[str, DatastoreRecordData],
1229 ) -> None:
1230 """Import datastore location and record data from an in-memory data
1231 structure.
1233 Parameters
1234 ----------
1235 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
1236 Datastore records indexed by datastore name. May contain data for
1237 other `Datastore` instances (generally because they are chained to
1238 this one), which should be ignored.
1240 Notes
1241 -----
1242 Implementations should generally not check that any external resources
1243 (e.g. files) referred to by these records actually exist, for
1244 performance reasons; we expect higher-level code to guarantee that they
1245 do.
1247 Implementations are responsible for calling
1248 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations``
1249 where the key is in `names`, as well as loading any opaque table data.
1251 Implementations may assume that datasets are either fully present or
1252 not at all (single-component exports are not permitted).
1253 """
1254 raise NotImplementedError()
1256 @abstractmethod
1257 def export_records(
1258 self,
1259 refs: Iterable[DatasetIdRef],
1260 ) -> Mapping[str, DatastoreRecordData]:
1261 """Export datastore records and locations to an in-memory data
1262 structure.
1264 Parameters
1265 ----------
1266 refs : `~collections.abc.Iterable` [ `DatasetIdRef` ]
1267 Datasets to save. This may include datasets not known to this
1268 datastore, which should be ignored. May not include component
1269 datasets.
1271 Returns
1272 -------
1273 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
1274 Exported datastore records indexed by datastore name.
1275 """
1276 raise NotImplementedError()
1278 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
1279 """Specify a method that can be used by datastore to retrieve
1280 registry-defined dataset type.
1282 Parameters
1283 ----------
1284 method : `~collections.abc.Callable` | `None`
1285 Method that takes the name of a dataset type and returns the
1286 corresponding `DatasetType` instance as defined in Registry. If
1287 the dataset type name is not known to the registry, `None` is returned.
1289 Notes
1290 -----
1291 This method is only needed for a Datastore supporting a "trusted" mode,
1292 when it does not have access to datastore records and needs to
1293 guess the dataset location based on its stored dataset type.
1294 """
1295 pass
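# A sketch of wiring this up with a registry lookup. It assumes the lookup
# raises KeyError for unknown names and adapts that to the `None` contract
# described above; ``registry`` is assumed to exist.
#
#     def _lookup_dataset_type(name: str) -> DatasetType | None:
#         try:
#             return registry.getDatasetType(name)
#         except KeyError:
#             return None
#
#     datastore.set_retrieve_dataset_type_method(_lookup_dataset_type)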
1297 @abstractmethod
1298 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
1299 """Make definitions of the opaque tables used by this Datastore.
1301 Returns
1302 -------
1303 tables : `~collections.abc.Mapping` [ `str`, `DatastoreOpaqueTable` ]
1304 Mapping of opaque table names to their definitions. This can be an
1305 empty mapping if the Datastore does not use opaque tables to keep
1306 datastore records.
1307 """
1308 raise NotImplementedError()
1311class NullDatastore(Datastore):
1312 """A datastore that implements the `Datastore` API but always fails when
1313 it accepts any request.
1315 Parameters
1316 ----------
1317 config : `Config` or `~lsst.resources.ResourcePathExpression` or `None`
1318 Ignored.
1319 bridgeManager : `DatastoreRegistryBridgeManager` or `None`
1320 Ignored.
1321 butlerRoot : `~lsst.resources.ResourcePathExpression` or `None`
1322 Ignored.
1323 """
1325 @classmethod
1326 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
1327 # Nothing to do. This is not a real Datastore.
1328 pass
1330 def __init__(
1331 self,
1332 config: Config | ResourcePathExpression | None,
1333 bridgeManager: DatastoreRegistryBridgeManager | None,
1334 butlerRoot: ResourcePathExpression | None = None,
1335 ):
1336 # Name ourselves with the timestamp the datastore
1337 # was created.
1338 self.name = f"{type(self).__name__}@{time.time()}"
1339 _LOG.debug("Creating datastore %s", self.name)
1341 return
1343 def knows(self, ref: DatasetRef) -> bool:
1344 return False
1346 def exists(self, datasetRef: DatasetRef) -> bool:
1347 return False
1349 def get(
1350 self,
1351 datasetRef: DatasetRef,
1352 parameters: Mapping[str, Any] | None = None,
1353 storageClass: StorageClass | str | None = None,
1354 ) -> Any:
1355 raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")
1357 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
1358 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1360 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
1361 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1363 def ingest(
1364 self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True
1365 ) -> None:
1366 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1368 def transfer_from(
1369 self,
1370 source_datastore: Datastore,
1371 refs: Iterable[DatasetRef],
1372 transfer: str = "auto",
1373 artifact_existence: dict[ResourcePath, bool] | None = None,
1374 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
1375 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1377 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1378 raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")
1380 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
1381 raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")
1383 def retrieveArtifacts(
1384 self,
1385 refs: Iterable[DatasetRef],
1386 destination: ResourcePath,
1387 transfer: str = "auto",
1388 preserve_path: bool = True,
1389 overwrite: bool = False,
1390 ) -> list[ResourcePath]:
1391 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1393 def remove(self, datasetRef: DatasetRef) -> None:
1394 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1396 def forget(self, refs: Iterable[DatasetRef]) -> None:
1397 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1399 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
1400 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1402 def emptyTrash(self, ignore_errors: bool = True) -> None:
1403 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1405 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
1406 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1408 def export(
1409 self,
1410 refs: Iterable[DatasetRef],
1411 *,
1412 directory: ResourcePathExpression | None = None,
1413 transfer: str | None = "auto",
1414 ) -> Iterable[FileDataset]:
1415 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1417 def validateConfiguration(
1418 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
1419 ) -> None:
1420 # No configuration so always validates.
1421 pass
1423 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
1424 pass
1426 def getLookupKeys(self) -> set[LookupKey]:
1427 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1429 def import_records(
1430 self,
1431 data: Mapping[str, DatastoreRecordData],
1432 ) -> None:
1433 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1435 def export_records(
1436 self,
1437 refs: Iterable[DatasetIdRef],
1438 ) -> Mapping[str, DatastoreRecordData]:
1439 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1441 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
1442 return {}