Coverage for python/lsst/daf/butler/datastore/_datastore.py: 62%
262 statements
coverage.py v7.3.2, created at 2023-12-01 11:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Support for generic data stores."""
30from __future__ import annotations
32__all__ = (
33 "DatasetRefURIs",
34 "Datastore",
35 "DatastoreConfig",
36 "DatastoreOpaqueTable",
37 "DatastoreValidationError",
38 "NullDatastore",
39 "DatastoreTransaction",
40)
42import contextlib
43import dataclasses
44import logging
45import time
46from abc import ABCMeta, abstractmethod
47from collections import abc, defaultdict
48from collections.abc import Callable, Iterable, Iterator, Mapping
49from typing import TYPE_CHECKING, Any, ClassVar
51from lsst.utils import doImportType
53from .._config import Config, ConfigSubset
54from .._exceptions import DatasetTypeNotSupportedError, ValidationError
55from .._file_dataset import FileDataset
56from .._storage_class import StorageClassFactory
57from .constraints import Constraints
59if TYPE_CHECKING:
60 from lsst.resources import ResourcePath, ResourcePathExpression
62 from .. import ddl
63 from .._config_support import LookupKey
64 from .._dataset_ref import DatasetRef
65 from .._dataset_type import DatasetType
66 from .._storage_class import StorageClass
67 from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
68 from .record_data import DatastoreRecordData
69 from .stored_file_info import StoredDatastoreItemInfo
71_LOG = logging.getLogger(__name__)
74class DatastoreConfig(ConfigSubset):
75 """Configuration for Datastores."""
77 component = "datastore"
78 requiredKeys = ("cls",)
79 defaultConfigFile = "datastore.yaml"
82class DatastoreValidationError(ValidationError):
83 """There is a problem with the Datastore configuration."""
85 pass
88@dataclasses.dataclass(frozen=True)
89class Event:
90 """Representation of an event that can be rolled back."""
92 __slots__ = {"name", "undoFunc", "args", "kwargs"}
93 name: str
94 undoFunc: Callable
95 args: tuple
96 kwargs: dict
99@dataclasses.dataclass(frozen=True)
100class DatastoreOpaqueTable:
101 """Definition of the opaque table which stores datastore records.
103 The table definition contains a `.ddl.TableSpec` for the table and the
104 record class, which must be a subclass of `StoredDatastoreItemInfo`.
105 """
107 __slots__ = {"table_spec", "record_class"}
108 table_spec: ddl.TableSpec
109 record_class: type[StoredDatastoreItemInfo]
112class IngestPrepData:
113 """A helper base class for `Datastore` ingest implementations.
115 Datastore implementations will generally need a custom implementation of
116 this class.
118 Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
119 import.
121 Parameters
122 ----------
123 refs : iterable of `DatasetRef`
124 References for the datasets that can be ingested by this datastore.
125 """
127 def __init__(self, refs: Iterable[DatasetRef]):
128 self.refs = {ref.id: ref for ref in refs}
131class DatastoreTransaction:
132 """Keeps a log of `Datastore` activity and allow rollback.
134 Parameters
135 ----------
136 parent : `DatastoreTransaction`, optional
137 The parent transaction (if any)
138 """
140 Event: ClassVar[type] = Event
142 parent: DatastoreTransaction | None
143 """The parent transaction. (`DatastoreTransaction`, optional)"""
145 def __init__(self, parent: DatastoreTransaction | None = None):
146 self.parent = parent
147 self._log: list[Event] = []
149 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
150 """Register event with undo function.
152 Parameters
153 ----------
154 name : `str`
155 Name of the event.
156 undoFunc : func
157 Function to undo this event.
158 args : `tuple`
159 Positional arguments to `undoFunc`.
160 **kwargs
161 Keyword arguments to `undoFunc`.
162 """
163 self._log.append(self.Event(name, undoFunc, args, kwargs))
165 @contextlib.contextmanager
166 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
167 """Register undo function if nested operation succeeds.
169 Calls `registerUndo`.
171 This can be used to wrap individual undo-able statements within a
172 DatastoreTransaction block. Multiple statements that can fail
173 separately should not be part of the same `undoWith` block.
175 All arguments are forwarded directly to `registerUndo`.
176 """
177 try:
178 yield None
179 except BaseException:
180 raise
181 else:
182 self.registerUndo(name, undoFunc, *args, **kwargs)
184 def rollback(self) -> None:
185 """Roll back all events in this transaction."""
186 log = logging.getLogger(__name__)
187 while self._log:
188 ev = self._log.pop()
189 try:
190 log.debug(
191 "Rolling back transaction: %s: %s(%s,%s)",
192 ev.name,
193 ev.undoFunc,
194 ",".join(str(a) for a in ev.args),
195 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()),
196 )
197 except Exception:
198 # In case we had a problem in stringification of arguments
199 log.warning("Rolling back transaction: %s", ev.name)
200 try:
201 ev.undoFunc(*ev.args, **ev.kwargs)
202 except BaseException as e:
203 # Deliberately swallow error that may occur in unrolling
204 log.warning("Exception: %s caught while unrolling: %s", e, ev.name)
205 pass
207 def commit(self) -> None:
208 """Commit this transaction."""
209 if self.parent is None:
210 # Just forget about the events, they have already happened.
211 return
212 else:
213 # We may still want to roll back events from this transaction as
214 # part of the parent.
215 self.parent._log.extend(self._log)
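# Illustrative sketch (not part of the original module): how the
# DatastoreTransaction defined above records undo actions and replays them in
# reverse order on rollback. The temporary file names are hypothetical.
def _demo_transaction_rollback() -> None:
    import os
    import tempfile

    txn = DatastoreTransaction()
    created = []
    for i in range(3):
        path = os.path.join(tempfile.gettempdir(), f"demo_artifact_{i}.txt")
        with open(path, "w") as fh:
            fh.write("example")
        created.append(path)
        # Register the inverse of the write so it can be undone on failure.
        txn.registerUndo(f"write {path}", os.remove, path)
    # Pretend something failed downstream: undo everything, newest first.
    txn.rollback()
    assert not any(os.path.exists(p) for p in created)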
218@dataclasses.dataclass
219class DatasetRefURIs(abc.Sequence):
220 """Represents the primary and component ResourcePath(s) associated with a
221 DatasetRef.
223 This is used in places where its members used to be represented as a tuple
224 `(primaryURI, componentURIs)`. To maintain backward compatibility this
225 inherits from Sequence and so instances can be treated as a two-item
226 tuple.
227 """
229 def __init__(
230 self,
231 primaryURI: ResourcePath | None = None,
232 componentURIs: dict[str, ResourcePath] | None = None,
233 ):
234 self.primaryURI = primaryURI
235 """The URI to the primary artifact associated with this dataset. If the
236 dataset was disassembled within the datastore this may be `None`.
237 """
239 self.componentURIs = componentURIs or {}
240 """The URIs to any components associated with the dataset artifact
241 indexed by component name. This can be empty if there are no
242 components.
243 """
245 def __getitem__(self, index: Any) -> Any:
246 """Get primaryURI and componentURIs by index.
248 Provides support for tuple-like access.
249 """
250 if index == 0:
251 return self.primaryURI
252 elif index == 1:
253 return self.componentURIs
254 raise IndexError("list index out of range")
256 def __len__(self) -> int:
257 """Get the number of data members.
259 Provides support for tuple-like access.
260 """
261 return 2
263 def __repr__(self) -> str:
264 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
267class Datastore(metaclass=ABCMeta):
268 """Datastore interface.
270 Parameters
271 ----------
272 config : `DatastoreConfig` or `str`
273 Load configuration either from an existing config instance or by
274 referring to a configuration file.
275 bridgeManager : `DatastoreRegistryBridgeManager`
276 Object that manages the interface between `Registry` and datastores.
277 butlerRoot : `str`, optional
278 New datastore root to use to override the configuration value.
279 """
281 defaultConfigFile: ClassVar[str | None] = None
282 """Path to configuration defaults. Accessed within the ``config`` resource
283 or relative to a search path. Can be None if no defaults specified.
284 """
286 containerKey: ClassVar[str | None] = None
287 """Name of the key containing a list of subconfigurations that also
288 need to be merged with defaults and will likely use different Python
289 datastore classes (but all using DatastoreConfig). Assumed to be a
290 list of configurations that can be represented in a DatastoreConfig
291 and containing a "cls" definition. None indicates that no containers
292 are expected in this Datastore."""
294 isEphemeral: bool = False
295 """Indicate whether this Datastore is ephemeral or not. An ephemeral
296 datastore is one where the contents of the datastore will not exist
297 across process restarts. This value can change per-instance."""
299 config: DatastoreConfig
300 """Configuration used to create Datastore."""
302 name: str
303 """Label associated with this Datastore."""
305 storageClassFactory: StorageClassFactory
306 """Factory for creating storage class instances from name."""
308 constraints: Constraints
309 """Constraints to apply when putting datasets into the datastore."""
311 # MyPy does not like for this to be annotated as any kind of type, because
312 # it can't do static checking on type variables that can change at runtime.
313 IngestPrepData: ClassVar[Any] = IngestPrepData
314 """Helper base class for ingest implementations.
315 """
317 @classmethod
318 @abstractmethod
319 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
320 """Set filesystem-dependent config options for this datastore.
322 The options will be appropriate for a new empty repository with the
323 given root.
325 Parameters
326 ----------
327 root : `str`
328 Filesystem path to the root of the data repository.
329 config : `Config`
330 A `Config` to update. Only the subset understood by
331 this component will be updated. Will not expand
332 defaults.
333 full : `Config`
334 A complete config with all defaults expanded that can be
335 converted to a `DatastoreConfig`. Read-only and will not be
336 modified by this method.
337 Repository-specific options that should not be obtained
338 from defaults when Butler instances are constructed
339 should be copied from ``full`` to ``config``.
340 overwrite : `bool`, optional
341 If `False`, do not modify a value in ``config`` if the value
342 already exists. Default is always to overwrite with the provided
343 ``root``.
345 Notes
346 -----
347 If a keyword is explicitly defined in the supplied ``config`` it
348 will not be overridden by this method if ``overwrite`` is `False`.
349 This allows explicit values set in external configs to be retained.
350 """
351 raise NotImplementedError()
353 @staticmethod
354 def fromConfig(
355 config: Config,
356 bridgeManager: DatastoreRegistryBridgeManager,
357 butlerRoot: ResourcePathExpression | None = None,
358 ) -> Datastore:
359 """Create datastore from type specified in config file.
361 Parameters
362 ----------
363 config : `Config` or `~lsst.resources.ResourcePathExpression`
364 Configuration instance.
365 bridgeManager : `DatastoreRegistryBridgeManager`
366 Object that manages the interface between `Registry` and
367 datastores.
368 butlerRoot : `str`, optional
369 Butler root directory.
370 """
371 cls = doImportType(config["datastore", "cls"])
372 if not issubclass(cls, Datastore):
373 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore")
374 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)
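# Illustrative sketch (not part of the original module): the minimal shape of
# the configuration that fromConfig() expects -- a "datastore" section with a
# "cls" entry naming the concrete class to import. The class path and the
# bridge_manager variable are assumptions for illustration.
#
#     from lsst.daf.butler import Config
#
#     config = Config(
#         {"datastore": {"cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore"}}
#     )
#     datastore = Datastore.fromConfig(config, bridgeManager=bridge_manager)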
376 def __init__(
377 self,
378 config: Config | ResourcePathExpression,
379 bridgeManager: DatastoreRegistryBridgeManager,
380 butlerRoot: ResourcePathExpression | None = None,
381 ):
382 self.config = DatastoreConfig(config)
383 self.name = "ABCDataStore"
384 self._transaction: DatastoreTransaction | None = None
386 # All Datastores need storage classes and constraints
387 self.storageClassFactory = StorageClassFactory()
389 # And read the constraints list
390 constraintsConfig = self.config.get("constraints")
391 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)
393 def __str__(self) -> str:
394 return self.name
396 def __repr__(self) -> str:
397 return self.name
399 @property
400 def names(self) -> tuple[str, ...]:
401 """Names associated with this datastore returned as a list.
403 Can be different to ``name`` for a chaining datastore.
404 """
405 # Default implementation returns solely the name itself
406 return (self.name,)
408 @property
409 def roots(self) -> dict[str, ResourcePath | None]:
410 """Return the root URIs for each named datastore.
412 Mapping from datastore name to root URI. The URI can be `None`
413 if a datastore has no concept of a root URI.
414 (`dict` [`str`, `ResourcePath` | `None`])
415 """
416 return {self.name: None}
418 @contextlib.contextmanager
419 def transaction(self) -> Iterator[DatastoreTransaction]:
420 """Context manager supporting `Datastore` transactions.
422 Transactions can be nested, and are to be used in combination with
423 `Registry.transaction`.
424 """
425 self._transaction = DatastoreTransaction(self._transaction)
426 try:
427 yield self._transaction
428 except BaseException:
429 self._transaction.rollback()
430 raise
431 else:
432 self._transaction.commit()
433 self._transaction = self._transaction.parent
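# Illustrative sketch (not part of the original module): the pattern a concrete
# datastore's write path typically follows -- perform an action inside
# self.transaction() and register its inverse with undoWith() so a later failure
# in the same (or an enclosing) transaction unwinds it. The helper names
# _compute_location, _write_artifact and _delete_artifact are hypothetical.
#
#     def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
#         location = self._compute_location(datasetRef)
#         with self.transaction() as txn:
#             with txn.undoWith("write artifact", self._delete_artifact, location):
#                 self._write_artifact(inMemoryDataset, location)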
435 @abstractmethod
436 def knows(self, ref: DatasetRef) -> bool:
437 """Check if the dataset is known to the datastore.
439 Does not check for existence of any artifact.
441 Parameters
442 ----------
443 ref : `DatasetRef`
444 Reference to the required dataset.
446 Returns
447 -------
448 exists : `bool`
449 `True` if the dataset is known to the datastore.
450 """
451 raise NotImplementedError()
453 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
454 """Check which of the given datasets are known to this datastore.
456 This is like ``mexists()`` but does not check that the artifacts exist.
458 Parameters
459 ----------
460 refs : iterable of `DatasetRef`
461 The datasets to check.
463 Returns
464 -------
465 exists : `dict`[`DatasetRef`, `bool`]
466 Mapping of dataset to boolean indicating whether the dataset
467 is known to the datastore.
468 """
469 # Non-optimized default calls knows() repeatedly.
470 return {ref: self.knows(ref) for ref in refs}
472 def mexists(
473 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
474 ) -> dict[DatasetRef, bool]:
475 """Check the existence of multiple datasets at once.
477 Parameters
478 ----------
479 refs : iterable of `DatasetRef`
480 The datasets to be checked.
481 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
482 Optional mapping of datastore artifact to existence. Updated by
483 this method with details of all artifacts tested. Can be `None`
484 if the caller is not interested.
486 Returns
487 -------
488 existence : `dict` of [`DatasetRef`, `bool`]
489 Mapping from dataset to boolean indicating existence.
490 """
491 existence: dict[DatasetRef, bool] = {}
492 # Non-optimized default.
493 for ref in refs:
494 existence[ref] = self.exists(ref)
495 return existence
497 @abstractmethod
498 def exists(self, datasetRef: DatasetRef) -> bool:
499 """Check if the dataset exists in the datastore.
501 Parameters
502 ----------
503 datasetRef : `DatasetRef`
504 Reference to the required dataset.
506 Returns
507 -------
508 exists : `bool`
509 `True` if the entity exists in the `Datastore`.
510 """
511 raise NotImplementedError("Must be implemented by subclass")
513 @abstractmethod
514 def get(
515 self,
516 datasetRef: DatasetRef,
517 parameters: Mapping[str, Any] | None = None,
518 storageClass: StorageClass | str | None = None,
519 ) -> Any:
520 """Load an `InMemoryDataset` from the store.
522 Parameters
523 ----------
524 datasetRef : `DatasetRef`
525 Reference to the required Dataset.
526 parameters : `dict`
527 `StorageClass`-specific parameters that specify a slice of the
528 Dataset to be loaded.
529 storageClass : `StorageClass` or `str`, optional
530 The storage class to be used to override the Python type
531 returned by this method. By default the returned type matches
532 the dataset type definition for this dataset. Specifying a
533 read `StorageClass` can force a different type to be returned.
534 This type must be compatible with the original type.
536 Returns
537 -------
538 inMemoryDataset : `object`
539 Requested Dataset or slice thereof as an InMemoryDataset.
540 """
541 raise NotImplementedError("Must be implemented by subclass")
543 @abstractmethod
544 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
545 """Write a `InMemoryDataset` with a given `DatasetRef` to the store.
547 Parameters
548 ----------
549 inMemoryDataset : `object`
550 The Dataset to store.
551 datasetRef : `DatasetRef`
552 Reference to the associated Dataset.
553 """
554 raise NotImplementedError("Must be implemented by subclass")
556 @abstractmethod
557 def put_new(self, in_memory_dataset: Any, dataset_ref: DatasetRef) -> Mapping[str, DatasetRef]:
558 """Write a `InMemoryDataset` with a given `DatasetRef` to the store.
560 Parameters
561 ----------
562 inMemoryDataset : `object`
563 The Dataset to store.
564 datasetRef : `DatasetRef`
565 Reference to the associated Dataset.
567 Returns
568 -------
569 datastore_refs : `~collections.abc.Mapping` [`str`, `DatasetRef`]
570 Mapping of datastore name to the dataset reference stored in that
571 datastore; the reference will include datastore records. Only
572 non-ephemeral datastores will appear in this mapping.
573 """
574 raise NotImplementedError("Must be implemented by subclass")
576 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
577 """Allow ingest transfer mode to be defaulted based on datasets.
579 Parameters
580 ----------
581 datasets : `FileDataset`
582 Each positional argument is a struct containing information about
583 a file to be ingested, including its path (either absolute or
584 relative to the datastore root, if applicable), a complete
585 `DatasetRef` (with ``dataset_id not None``), and optionally a
586 formatter class or its fully-qualified string name. If a formatter
587 is not provided, this method should populate that attribute with
588 the formatter the datastore would use for `put`. Subclasses are
589 also permitted to modify the path attribute (typically to put it
590 in what the datastore considers its standard form).
591 transfer : `str`, optional
592 How (and whether) the dataset should be added to the datastore.
593 See `ingest` for details of transfer modes.
595 Returns
596 -------
597 newTransfer : `str`
598 Transfer mode to use. Will be identical to the supplied transfer
599 mode unless "auto" is used.
600 """
601 if transfer != "auto":
602 return transfer
603 raise RuntimeError(f"{transfer} is not allowed without specialization.")
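# Illustrative sketch (not part of the original module): a concrete datastore
# would typically override _overrideTransferMode() to map "auto" onto whatever
# mode suits it; "copy" below is an arbitrary, hypothetical default.
#
#     def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
#         if transfer != "auto":
#             return transfer
#         return "copy"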
605 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
606 """Process datasets to identify which ones can be ingested.
608 Parameters
609 ----------
610 datasets : `FileDataset`
611 Each positional argument is a struct containing information about
612 a file to be ingested, including its path (either absolute or
613 relative to the datastore root, if applicable), a complete
614 `DatasetRef` (with ``dataset_id not None``), and optionally a
615 formatter class or its fully-qualified string name. If a formatter
616 is not provided, this method should populate that attribute with
617 the formatter the datastore would use for `put`. Subclasses are
618 also permitted to modify the path attribute (typically to put it
619 in what the datastore considers its standard form).
620 transfer : `str`, optional
621 How (and whether) the dataset should be added to the datastore.
622 See `ingest` for details of transfer modes.
624 Returns
625 -------
626 data : `IngestPrepData`
627 An instance of a subclass of `IngestPrepData`, used to pass
628 arbitrary data from `_prepIngest` to `_finishIngest`. This should
629 include only the datasets this datastore can actually ingest;
630 others should be silently ignored (`Datastore.ingest` will inspect
631 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
632 necessary).
634 Raises
635 ------
636 NotImplementedError
637 Raised if the datastore does not support the given transfer mode
638 (including the case where ingest is not supported at all).
639 FileNotFoundError
640 Raised if one of the given files does not exist.
641 FileExistsError
642 Raised if transfer is not `None` but the (internal) location the
643 file would be moved to is already occupied.
645 Notes
646 -----
647 This method (along with `_finishIngest`) should be implemented by
648 subclasses to provide ingest support instead of implementing `ingest`
649 directly.
651 `_prepIngest` should not modify the data repository or given files in
652 any way; all changes should be deferred to `_finishIngest`.
654 When possible, exceptions should be raised in `_prepIngest` instead of
655 `_finishIngest`. `NotImplementedError` exceptions that indicate that
656 the transfer mode is not supported must be raised by `_prepIngest`
657 instead of `_finishIngest`.
658 """
659 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
661 def _finishIngest(
662 self, prepData: IngestPrepData, *, transfer: str | None = None, record_validation_info: bool = True
663 ) -> None:
664 """Complete an ingest operation.
666 Parameters
667 ----------
668 prepData : `IngestPrepData`
669 An instance of a subclass of `IngestPrepData`. Guaranteed to be
670 the direct result of a call to `_prepIngest` on this datastore.
671 transfer : `str`, optional
672 How (and whether) the dataset should be added to the datastore.
673 See `ingest` for details of transfer modes.
674 record_validation_info : `bool`, optional
675 If `True`, the default, the datastore can record validation
676 information associated with the file. If `False` the datastore
677 will not attempt to track any information such as checksums
678 or file sizes. This can be useful if such information is tracked
679 in an external system or if the file is to be compressed in place.
680 It is up to the datastore whether this parameter is relevant.
682 Raises
683 ------
684 FileNotFoundError
685 Raised if one of the given files does not exist.
686 FileExistsError
687 Raised if transfer is not `None` but the (internal) location the
688 file would be moved to is already occupied.
690 Notes
691 -----
692 This method (along with `_prepIngest`) should be implemented by
693 subclasses to provide ingest support instead of implementing `ingest`
694 directly.
695 """
696 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
698 def ingest(
699 self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True
700 ) -> None:
701 """Ingest one or more files into the datastore.
703 Parameters
704 ----------
705 datasets : `FileDataset`
706 Each positional argument is a struct containing information about
707 a file to be ingested, including its path (either absolute or
708 relative to the datastore root, if applicable), a complete
709 `DatasetRef` (with ``dataset_id not None``), and optionally a
710 formatter class or its fully-qualified string name. If a formatter
711 is not provided, the one the datastore would use for ``put`` on
712 that dataset is assumed.
713 transfer : `str`, optional
714 How (and whether) the dataset should be added to the datastore.
715 If `None` (default), the file must already be in a location
716 appropriate for the datastore (e.g. within its root directory),
717 and will not be modified. Other choices include "move", "copy",
718 "link", "symlink", "relsymlink", and "hardlink". "link" is a
719 special transfer mode that will first try to make a hardlink and
720 if that fails a symlink will be used instead. "relsymlink" creates
721 a relative symlink rather than use an absolute path.
722 Most datastores do not support all transfer modes.
723 "auto" is a special option that will let the
724 data store choose the most natural option for itself.
725 record_validation_info : `bool`, optional
726 If `True`, the default, the datastore can record validation
727 information associated with the file. If `False` the datastore
728 will not attempt to track any information such as checksums
729 or file sizes. This can be useful if such information is tracked
730 in an external system or if the file is to be compressed in place.
731 It is up to the datastore whether this parameter is relevant.
733 Raises
734 ------
735 NotImplementedError
736 Raised if the datastore does not support the given transfer mode
737 (including the case where ingest is not supported at all).
738 DatasetTypeNotSupportedError
739 Raised if one or more files to be ingested have a dataset type that
740 is not supported by the datastore.
741 FileNotFoundError
742 Raised if one of the given files does not exist.
743 FileExistsError
744 Raised if transfer is not `None` but the (internal) location the
745 file would be moved to is already occupied.
747 Notes
748 -----
749 Subclasses should implement `_prepIngest` and `_finishIngest` instead
750 of implementing `ingest` directly. Datastores that hold and
751 delegate to child datastores may want to call those methods as well.
753 Subclasses are encouraged to document their supported transfer modes
754 in their class documentation.
755 """
756 # Allow a datastore to select a default transfer mode
757 transfer = self._overrideTransferMode(*datasets, transfer=transfer)
758 prepData = self._prepIngest(*datasets, transfer=transfer)
759 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
760 if refs.keys() != prepData.refs.keys():
761 unsupported = refs.keys() - prepData.refs.keys()
762 # Group unsupported refs by DatasetType for an informative
763 # but still concise error message.
764 byDatasetType = defaultdict(list)
765 for datasetId in unsupported:
766 ref = refs[datasetId]
767 byDatasetType[ref.datasetType].append(ref)
768 raise DatasetTypeNotSupportedError(
769 "DatasetType(s) not supported in ingest: "
770 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
771 )
772 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info)
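# Illustrative sketch (not part of the original module): the division of labour
# that ingest() above drives in a subclass -- _prepIngest filters and validates
# without touching the repository, _finishIngest performs the transfers. The
# isAcceptable() constraint check and the _ingest_one helper are assumptions.
#
#     def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
#         acceptable = []
#         for dataset in datasets:
#             acceptable.extend(ref for ref in dataset.refs if self.constraints.isAcceptable(ref))
#         return self.IngestPrepData(acceptable)
#
#     def _finishIngest(self, prepData, *, transfer=None, record_validation_info=True) -> None:
#         for ref in prepData.refs.values():
#             self._ingest_one(ref, transfer=transfer)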
774 def transfer_from(
775 self,
776 source_datastore: Datastore,
777 refs: Iterable[DatasetRef],
778 transfer: str = "auto",
779 artifact_existence: dict[ResourcePath, bool] | None = None,
780 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
781 """Transfer dataset artifacts from another datastore to this one.
783 Parameters
784 ----------
785 source_datastore : `Datastore`
786 The datastore from which to transfer artifacts. That datastore
787 must be compatible with this datastore receiving the artifacts.
788 refs : iterable of `DatasetRef`
789 The datasets to transfer from the source datastore.
790 transfer : `str`, optional
791 How (and whether) the dataset should be added to the datastore.
792 Choices include "move", "copy",
793 "link", "symlink", "relsymlink", and "hardlink". "link" is a
794 special transfer mode that will first try to make a hardlink and
795 if that fails a symlink will be used instead. "relsymlink" creates
796 a relative symlink rather than use an absolute path.
797 Most datastores do not support all transfer modes.
798 "auto" (the default) is a special option that will let the
799 data store choose the most natural option for itself.
800 If the source location and transfer location are identical the
801 transfer mode will be ignored.
802 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
803 Optional mapping of datastore artifact to existence. Updated by
804 this method with details of all artifacts tested. Can be `None`
805 if the caller is not interested.
807 Returns
808 -------
809 accepted : `set` [`DatasetRef`]
810 The datasets that were transferred.
811 rejected : `set` [`DatasetRef`]
812 The datasets that were rejected due to a constraints violation.
814 Raises
815 ------
816 TypeError
817 Raised if the two datastores are not compatible.
818 """
819 if type(self) is not type(source_datastore):
820 raise TypeError(
821 f"Datastore mismatch between this datastore ({type(self)}) and the "
822 f"source datastore ({type(source_datastore)})."
823 )
825 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.")
827 def getManyURIs(
828 self,
829 refs: Iterable[DatasetRef],
830 predict: bool = False,
831 allow_missing: bool = False,
832 ) -> dict[DatasetRef, DatasetRefURIs]:
833 """Return URIs associated with many datasets.
835 Parameters
836 ----------
837 refs : iterable of `DatasetRef`
838 References to the required datasets.
839 predict : `bool`, optional
840 If `True`, allow URIs to be returned of datasets that have not
841 been written.
842 allow_missing : `bool`
843 If `False`, and ``predict`` is `False`, will raise if a
844 `DatasetRef` does not exist.
846 Returns
847 -------
848 URIs : `dict` of [`DatasetRef`, `DatasetRefURIs`]
849 A dict of primary and component URIs, indexed by the passed-in
850 refs.
852 Raises
853 ------
854 FileNotFoundError
855 A URI has been requested for a dataset that does not exist and
856 guessing is not allowed.
858 Notes
859 -----
860 In file-based datastores, `getManyURIs` does not check that the files
861 are really there; it assumes that if the datastore is aware of a file
862 then it actually exists.
863 """
864 uris: dict[DatasetRef, DatasetRefURIs] = {}
865 missing_refs = []
866 for ref in refs:
867 try:
868 uris[ref] = self.getURIs(ref, predict=predict)
869 except FileNotFoundError:
870 missing_refs.append(ref)
871 if missing_refs and not allow_missing:
872 raise FileNotFoundError(
873 "Missing {} refs from datastore out of {} and predict=False.".format(
874 num_missing := len(missing_refs), num_missing + len(uris)
875 )
876 )
877 return uris
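# Illustrative sketch (not part of the original module): typical use of
# getManyURIs(), tolerating unwritten datasets instead of raising.
#
#     uris = datastore.getManyURIs(refs, allow_missing=True)
#     for ref, ref_uris in uris.items():
#         print(ref, ref_uris.primaryURI, sorted(ref_uris.componentURIs))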
879 @abstractmethod
880 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
881 """Return URIs associated with dataset.
883 Parameters
884 ----------
885 datasetRef : `DatasetRef`
886 Reference to the required dataset.
887 predict : `bool`, optional
888 If the datastore does not know about the dataset, should it
889 return a predicted URI or not?
891 Returns
892 -------
893 uris : `DatasetRefURIs`
894 The URI to the primary artifact associated with this dataset (if
895 the dataset was disassembled within the datastore this may be
896 `None`), and the URIs to any components associated with the dataset
897 artifact (which can be empty if there are no components).
898 """
899 raise NotImplementedError()
901 @abstractmethod
902 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
903 """URI to the Dataset.
905 Parameters
906 ----------
907 datasetRef : `DatasetRef`
908 Reference to the required Dataset.
909 predict : `bool`
910 If `True` attempt to predict the URI for a dataset if it does
911 not exist in datastore.
913 Returns
914 -------
915 uri : `lsst.resources.ResourcePath`
916 URI pointing to the Dataset within the datastore. If the
917 Dataset does not exist in the datastore, the URI may be a guess.
918 If the datastore does not have entities that relate well
919 to the concept of a URI the returned URI will be
920 descriptive. The returned URI is not guaranteed to be obtainable.
922 Raises
923 ------
924 FileNotFoundError
925 A URI has been requested for a dataset that does not exist and
926 guessing is not allowed.
927 """
928 raise NotImplementedError("Must be implemented by subclass")
930 @abstractmethod
931 def retrieveArtifacts(
932 self,
933 refs: Iterable[DatasetRef],
934 destination: ResourcePath,
935 transfer: str = "auto",
936 preserve_path: bool = True,
937 overwrite: bool = False,
938 ) -> list[ResourcePath]:
939 """Retrieve the artifacts associated with the supplied refs.
941 Parameters
942 ----------
943 refs : iterable of `DatasetRef`
944 The datasets for which artifacts are to be retrieved.
945 A single ref can result in multiple artifacts. The refs must
946 be resolved.
947 destination : `lsst.resources.ResourcePath`
948 Location to write the artifacts.
949 transfer : `str`, optional
950 Method to use to transfer the artifacts. Must be one of the options
951 supported by `lsst.resources.ResourcePath.transfer_from()`.
952 "move" is not allowed.
953 preserve_path : `bool`, optional
954 If `True` the full path of the artifact within the datastore
955 is preserved. If `False` the final file component of the path
956 is used.
957 overwrite : `bool`, optional
958 If `True` allow transfers to overwrite existing files at the
959 destination.
961 Returns
962 -------
963 targets : `list` of `lsst.resources.ResourcePath`
964 URIs of file artifacts in destination location. Order is not
965 preserved.
967 Notes
968 -----
969 For non-file datastores the artifacts written to the destination
970 may not match the representation inside the datastore. For example
971 a hierarchical data structure in a NoSQL database may well be stored
972 as a JSON file.
973 """
974 raise NotImplementedError()
976 @abstractmethod
977 def remove(self, datasetRef: DatasetRef) -> None:
978 """Indicate to the Datastore that a Dataset can be removed.
980 Parameters
981 ----------
982 datasetRef : `DatasetRef`
983 Reference to the required Dataset.
985 Raises
986 ------
987 FileNotFoundError
988 When Dataset does not exist.
990 Notes
991 -----
992 Some Datastores may implement this method as a silent no-op to
993 disable Dataset deletion through standard interfaces.
994 """
995 raise NotImplementedError("Must be implemented by subclass")
997 @abstractmethod
998 def forget(self, refs: Iterable[DatasetRef]) -> None:
999 """Indicate to the Datastore that it should remove all records of the
1000 given datasets, without actually deleting them.
1002 Parameters
1003 ----------
1004 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1005 References to the datasets being forgotten.
1007 Notes
1008 -----
1009 Asking a datastore to forget a `DatasetRef` it does not hold should be
1010 a silent no-op, not an error.
1011 """
1012 raise NotImplementedError("Must be implemented by subclass")
1014 @abstractmethod
1015 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
1016 """Indicate to the Datastore that a Dataset can be moved to the trash.
1018 Parameters
1019 ----------
1020 ref : `DatasetRef` or iterable thereof
1021 Reference(s) to the required Dataset.
1022 ignore_errors : `bool`, optional
1023 Determine whether errors should be ignored. When multiple
1024 refs are being trashed there will be no per-ref check.
1026 Raises
1027 ------
1028 FileNotFoundError
1029 When Dataset does not exist and errors are not ignored. Only
1030 checked if a single ref is supplied (and not in a list).
1032 Notes
1033 -----
1034 Some Datastores may implement this method as a silent no-op to
1035 disable Dataset deletion through standard interfaces.
1036 """
1037 raise NotImplementedError("Must be implemented by subclass")
1039 @abstractmethod
1040 def emptyTrash(self, ignore_errors: bool = True) -> None:
1041 """Remove all datasets from the trash.
1043 Parameters
1044 ----------
1045 ignore_errors : `bool`, optional
1046 Determine whether errors should be ignored.
1048 Notes
1049 -----
1050 Some Datastores may implement this method as a silent no-op to
1051 disable Dataset deletion through standard interfaces.
1052 """
1053 raise NotImplementedError("Must be implemented by subclass")
1055 @abstractmethod
1056 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
1057 """Transfer a dataset from another datastore to this datastore.
1059 Parameters
1060 ----------
1061 inputDatastore : `Datastore`
1062 The external `Datastore` from which to retrieve the Dataset.
1063 datasetRef : `DatasetRef`
1064 Reference to the required Dataset.
1065 """
1066 raise NotImplementedError("Must be implemented by subclass")
1068 def export(
1069 self,
1070 refs: Iterable[DatasetRef],
1071 *,
1072 directory: ResourcePathExpression | None = None,
1073 transfer: str | None = "auto",
1074 ) -> Iterable[FileDataset]:
1075 """Export datasets for transfer to another data repository.
1077 Parameters
1078 ----------
1079 refs : iterable of `DatasetRef`
1080 Dataset references to be exported.
1081 directory : `str`, optional
1082 Path to a directory that should contain files corresponding to
1083 output datasets. Ignored if ``transfer`` is explicitly `None`.
1084 transfer : `str`, optional
1085 Mode that should be used to move datasets out of the repository.
1086 Valid options are the same as those of the ``transfer`` argument
1087 to ``ingest``, and datastores may similarly signal that a transfer
1088 mode is not supported by raising `NotImplementedError`. If "auto"
1089 is given and no ``directory`` is specified, `None` will be
1090 implied.
1092 Returns
1093 -------
1094 datasets : iterable of `FileDataset`
1095 Structs containing information about the exported datasets, in the
1096 same order as ``refs``.
1098 Raises
1099 ------
1100 NotImplementedError
1101 Raised if the given transfer mode is not supported.
1102 """
1103 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
1105 @abstractmethod
1106 def validateConfiguration(
1107 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
1108 ) -> None:
1109 """Validate some of the configuration for this datastore.
1111 Parameters
1112 ----------
1113 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1114 Entities to test against this configuration. Can be differing
1115 types.
1116 logFailures : `bool`, optional
1117 If `True`, output a log message for every validation error
1118 detected.
1120 Raises
1121 ------
1122 DatastoreValidationError
1123 Raised if there is a validation problem with a configuration.
1125 Notes
1126 -----
1127 Which parts of the configuration are validated is at the discretion
1128 of each Datastore implementation.
1129 """
1130 raise NotImplementedError("Must be implemented by subclass")
1132 @abstractmethod
1133 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
1134 """Validate a specific look up key with supplied entity.
1136 Parameters
1137 ----------
1138 lookupKey : `LookupKey`
1139 Key to use to retrieve information from the datastore
1140 configuration.
1141 entity : `DatasetRef`, `DatasetType`, or `StorageClass`
1142 Entity to compare with configuration retrieved using the
1143 specified lookup key.
1145 Raises
1146 ------
1147 DatastoreValidationError
1148 Raised if there is a problem with the combination of entity
1149 and lookup key.
1151 Notes
1152 -----
1153 Bypasses the normal selection priorities by allowing a key that
1154 would normally not be selected to be validated.
1155 """
1156 raise NotImplementedError("Must be implemented by subclass")
1158 @abstractmethod
1159 def getLookupKeys(self) -> set[LookupKey]:
1160 """Return all the lookup keys relevant to this datastore.
1162 Returns
1163 -------
1164 keys : `set` of `LookupKey`
1165 The keys stored internally for looking up information based
1166 on `DatasetType` name or `StorageClass`.
1167 """
1168 raise NotImplementedError("Must be implemented by subclass")
1170 def needs_expanded_data_ids(
1171 self,
1172 transfer: str | None,
1173 entity: DatasetRef | DatasetType | StorageClass | None = None,
1174 ) -> bool:
1175 """Test whether this datastore needs expanded data IDs to ingest.
1177 Parameters
1178 ----------
1179 transfer : `str` or `None`
1180 Transfer mode for ingest.
1181 entity : `DatasetRef`, `DatasetType`, or `StorageClass`, optional
1182 Object representing what will be ingested. If not provided (or not
1183 specific enough), `True` may be returned even if expanded data
1184 IDs aren't necessary.
1186 Returns
1187 -------
1188 needed : `bool`
1189 If `True`, expanded data IDs may be needed. `False` only if
1190 expansion definitely isn't necessary.
1191 """
1192 return True
1194 @abstractmethod
1195 def import_records(
1196 self,
1197 data: Mapping[str, DatastoreRecordData],
1198 ) -> None:
1199 """Import datastore location and record data from an in-memory data
1200 structure.
1202 Parameters
1203 ----------
1204 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
1205 Datastore records indexed by datastore name. May contain data for
1206 other `Datastore` instances (generally because they are chained to
1207 this one), which should be ignored.
1209 Notes
1210 -----
1211 Implementations should generally not check that any external resources
1212 (e.g. files) referred to by these records actually exist, for
1213 performance reasons; we expect higher-level code to guarantee that they
1214 do.
1216 Implementations are responsible for calling
1217 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations``
1218 where the key is in `names`, as well as loading any opaque table data.
1220 Implementations may assume that datasets are either fully present or
1221 not at all (single-component exports are not permitted).
1222 """
1223 raise NotImplementedError()
1225 @abstractmethod
1226 def export_records(
1227 self,
1228 refs: Iterable[DatasetIdRef],
1229 ) -> Mapping[str, DatastoreRecordData]:
1230 """Export datastore records and locations to an in-memory data
1231 structure.
1233 Parameters
1234 ----------
1235 refs : `~collections.abc.Iterable` [ `DatasetIdRef` ]
1236 Datasets to save. This may include datasets not known to this
1237 datastore, which should be ignored. May not include component
1238 datasets.
1240 Returns
1241 -------
1242 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
1243 Exported datastore records indexed by datastore name.
1244 """
1245 raise NotImplementedError()
1247 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
1248 """Specify a method that can be used by datastore to retrieve
1249 registry-defined dataset type.
1251 Parameters
1252 ----------
1253 method : `~collections.abc.Callable` | `None`
1254 Method that takes a name of the dataset type and returns a
1255 corresponding `DatasetType` instance as defined in Registry. If the
1256 dataset type name is not known to the registry, `None` is returned.
1258 Notes
1259 -----
1260 This method is only needed for a Datastore supporting a "trusted" mode
1261 when it does not have access to datastore records and needs to
1262 guess the dataset location based on its stored dataset type.
1263 """
1264 pass
1266 @abstractmethod
1267 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
1268 """Make definitions of the opaque tables used by this Datastore.
1270 Returns
1271 -------
1272 tables : `~collections.abc.Mapping` [ `str`, `DatastoreOpaqueTable` ]
1273 Mapping of opaque table names to their definitions. This can be an
1274 empty mapping if Datastore does not use opaque tables to keep
1275 datastore records.
1276 """
1277 raise NotImplementedError()
1280class NullDatastore(Datastore):
1281 """A datastore that implements the `Datastore` API but always fails when
1282 it accepts any request.
1283 """
1285 @classmethod
1286 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
1287 # Nothing to do. This is not a real Datastore.
1288 pass
1290 def __init__(
1291 self,
1292 config: Config | ResourcePathExpression | None,
1293 bridgeManager: DatastoreRegistryBridgeManager | None,
1294 butlerRoot: ResourcePathExpression | None = None,
1295 ):
1296 # Name ourselves with the timestamp the datastore
1297 # was created.
1298 self.name = f"{type(self).__name__}@{time.time()}"
1299 _LOG.debug("Creating datastore %s", self.name)
1301 return
1303 def knows(self, ref: DatasetRef) -> bool:
1304 return False
1306 def exists(self, datasetRef: DatasetRef) -> bool:
1307 return False
1309 def get(
1310 self,
1311 datasetRef: DatasetRef,
1312 parameters: Mapping[str, Any] | None = None,
1313 storageClass: StorageClass | str | None = None,
1314 ) -> Any:
1315 raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")
1317 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
1318 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1320 def put_new(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> Mapping[str, DatasetRef]:
1321 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1323 def ingest(
1324 self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True
1325 ) -> None:
1326 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1328 def transfer_from(
1329 self,
1330 source_datastore: Datastore,
1331 refs: Iterable[DatasetRef],
1332 transfer: str = "auto",
1333 artifact_existence: dict[ResourcePath, bool] | None = None,
1334 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
1335 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1337 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1338 raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")
1340 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
1341 raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")
1343 def retrieveArtifacts(
1344 self,
1345 refs: Iterable[DatasetRef],
1346 destination: ResourcePath,
1347 transfer: str = "auto",
1348 preserve_path: bool = True,
1349 overwrite: bool = False,
1350 ) -> list[ResourcePath]:
1351 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1353 def remove(self, datasetRef: DatasetRef) -> None:
1354 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1356 def forget(self, refs: Iterable[DatasetRef]) -> None:
1357 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1359 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
1360 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1362 def emptyTrash(self, ignore_errors: bool = True) -> None:
1363 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1365 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
1366 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1368 def export(
1369 self,
1370 refs: Iterable[DatasetRef],
1371 *,
1372 directory: ResourcePathExpression | None = None,
1373 transfer: str | None = "auto",
1374 ) -> Iterable[FileDataset]:
1375 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1377 def validateConfiguration(
1378 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
1379 ) -> None:
1380 # No configuration so always validates.
1381 pass
1383 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
1384 pass
1386 def getLookupKeys(self) -> set[LookupKey]:
1387 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1389 def import_records(
1390 self,
1391 data: Mapping[str, DatastoreRecordData],
1392 ) -> None:
1393 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1395 def export_records(
1396 self,
1397 refs: Iterable[DatasetIdRef],
1398 ) -> Mapping[str, DatastoreRecordData]:
1399 raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
1401 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
1402 return {}
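# Illustrative sketch (not part of the original module): NullDatastore can be
# constructed without any configuration, which makes it a convenient stand-in
# where a Datastore is required but should never hold data. The ``ref``
# argument stands for any resolved DatasetRef.
def _demo_null_datastore(ref: DatasetRef) -> None:
    datastore = NullDatastore(None, None)
    assert datastore.knows(ref) is False
    assert datastore.mexists([ref]) == {ref: False}
    # URI lookups behave as if every ref were missing.
    assert datastore.getManyURIs([ref], allow_missing=True) == {}
    try:
        datastore.get(ref)
    except FileNotFoundError:
        pass  # expected: the null datastore never holds anything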