Coverage for python/lsst/daf/butler/core/datastore.py: 51%
210 statements
coverage.py v7.2.7, created at 2023-06-28 10:10 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Support for generic data stores."""
24from __future__ import annotations
26__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs")
28import contextlib
29import dataclasses
30import logging
31from abc import ABCMeta, abstractmethod
32from collections import abc, defaultdict
33from collections.abc import Callable, Iterable, Iterator, Mapping
34from typing import TYPE_CHECKING, Any, ClassVar
36from lsst.utils import doImportType
38from .config import Config, ConfigSubset
39from .constraints import Constraints
40from .exceptions import DatasetTypeNotSupportedError, ValidationError
41from .fileDataset import FileDataset
42from .storageClass import StorageClassFactory
44if TYPE_CHECKING:
45 from lsst.resources import ResourcePath, ResourcePathExpression
47 from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
48 from .configSupport import LookupKey
49 from .datasets import DatasetRef, DatasetType
50 from .datastoreRecordData import DatastoreRecordData
51 from .storageClass import StorageClass
54class DatastoreConfig(ConfigSubset):
55 """Configuration for Datastores."""
57 component = "datastore"
58 requiredKeys = ("cls",)
59 defaultConfigFile = "datastore.yaml"
62class DatastoreValidationError(ValidationError):
63 """There is a problem with the Datastore configuration."""
65 pass
68@dataclasses.dataclass(frozen=True)
69class Event:
70 """Representation of an event that can be rolled back."""
72 __slots__ = {"name", "undoFunc", "args", "kwargs"}
73 name: str
74 undoFunc: Callable
75 args: tuple
76 kwargs: dict
79class IngestPrepData:
80 """A helper base class for `Datastore` ingest implementations.
82 Datastore implementations will generally need a custom implementation of
83 this class.
85 Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
86 import.
88 Parameters
89 ----------
90 refs : iterable of `DatasetRef`
91 References for the datasets that can be ingested by this datastore.
92 """
94 def __init__(self, refs: Iterable[DatasetRef]):
95 self.refs = {ref.id: ref for ref in refs}
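# A minimal sketch of the kind of subclass a concrete datastore might define,
# as the docstring above suggests; the extra ``paths`` mapping and the class
# name are illustrative placeholders, not part of this module.
class _ExampleIngestPrepData(IngestPrepData):
    def __init__(self, refs: Iterable[DatasetRef], paths: dict[Any, str]):
        super().__init__(refs)
        # Per-dataset source paths, keyed by dataset ID, for use by the
        # matching _finishIngest implementation.
        self.paths = paths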
98class DatastoreTransaction:
99 """Keeps a log of `Datastore` activity and allow rollback.
101 Parameters
102 ----------
103 parent : `DatastoreTransaction`, optional
104 The parent transaction (if any)
105 """
107 Event: ClassVar[type] = Event
109 parent: DatastoreTransaction | None
110 """The parent transaction. (`DatastoreTransaction`, optional)"""
112 def __init__(self, parent: DatastoreTransaction | None = None):
113 self.parent = parent
114 self._log: list[Event] = []
116 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
117 """Register event with undo function.
119 Parameters
120 ----------
121 name : `str`
122 Name of the event.
123 undoFunc : func
124 Function to undo this event.
125 args : `tuple`
126 Positional arguments to `undoFunc`.
127 **kwargs
128 Keyword arguments to `undoFunc`.
129 """
130 self._log.append(self.Event(name, undoFunc, args, kwargs))
132 @contextlib.contextmanager
133 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
134 """Register undo function if nested operation succeeds.
136 Calls `registerUndo`.
138 This can be used to wrap individual undo-able statements within a
139 DatastoreTransaction block. Multiple statements that can fail
140 separately should not be part of the same `undoWith` block.
142 All arguments are forwarded directly to `registerUndo`.
143 """
144 try:
145 yield None
146 except BaseException:
147 raise
148 else:
149 self.registerUndo(name, undoFunc, *args, **kwargs)
151 def rollback(self) -> None:
152 """Roll back all events in this transaction."""
153 log = logging.getLogger(__name__)
154 while self._log:
155 ev = self._log.pop()
156 try:
157 log.debug(
158 "Rolling back transaction: %s: %s(%s,%s)",
159 ev.name,
160 ev.undoFunc,
161 ",".join(str(a) for a in ev.args),
162 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()),
163 )
164 except Exception:
165 # In case we had a problem in stringification of arguments
166 log.warning("Rolling back transaction: %s", ev.name)
167 try:
168 ev.undoFunc(*ev.args, **ev.kwargs)
169 except BaseException as e:
170 # Deliberately swallow error that may occur in unrolling
171 log.warning("Exception: %s caught while unrolling: %s", e, ev.name)
172 pass
174 def commit(self) -> None:
175 """Commit this transaction."""
176 if self.parent is None:
177 # Just forget about the events, they have already happened.
178 return
179 else:
180 # We may still want to roll back the events from this transaction
181 # as part of the parent.
182 self.parent._log.extend(self._log)
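# A self-contained sketch of the rollback semantics defined above: undo
# callbacks registered with registerUndo/undoWith run in reverse order on
# rollback, and a child's events fold into its parent on commit.  The
# ``written`` list and ``_undo_write`` helper are illustrative only.
def _transaction_example() -> list[str]:
    written: list[str] = []

    def _undo_write(name: str) -> None:
        written.remove(name)

    outer = DatastoreTransaction()
    written.append("a")
    outer.registerUndo("write a", _undo_write, "a")

    inner = DatastoreTransaction(parent=outer)
    with inner.undoWith("write b", _undo_write, "b"):
        written.append("b")
    inner.commit()  # "write b" is now owned by ``outer``

    outer.rollback()  # undoes "b" then "a"
    return written  # -> []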
185@dataclasses.dataclass
186class DatasetRefURIs(abc.Sequence):
187 """Represents the primary and component ResourcePath(s) associated with a
188 DatasetRef.
190 This is used in places where its members used to be represented as a tuple
191 `(primaryURI, componentURIs)`. To maintain backward compatibility this
192 inherits from Sequence and so instances can be treated as a two-item
193 tuple.
194 """
196 def __init__(
197 self,
198 primaryURI: ResourcePath | None = None,
199 componentURIs: dict[str, ResourcePath] | None = None,
200 ):
201 self.primaryURI = primaryURI
202 """The URI to the primary artifact associated with this dataset. If the
203 dataset was disassembled within the datastore this may be `None`.
204 """
206 self.componentURIs = componentURIs or {}
207 """The URIs to any components associated with the dataset artifact
208 indexed by component name. This can be empty if there are no
209 components.
210 """
212 def __getitem__(self, index: Any) -> Any:
213 """Get primaryURI and componentURIs by index.
215 Provides support for tuple-like access.
216 """
217 if index == 0:
218 return self.primaryURI
219 elif index == 1:
220 return self.componentURIs
221 raise IndexError("list index out of range")
223 def __len__(self) -> int:
224 """Get the number of data members.
226 Provides support for tuple-like access.
227 """
228 return 2
230 def __repr__(self) -> str:
231 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
234class Datastore(metaclass=ABCMeta):
235 """Datastore interface.
237 Parameters
238 ----------
239 config : `DatastoreConfig` or `str`
240 Load configuration either from an existing config instance or by
241 referring to a configuration file.
242 bridgeManager : `DatastoreRegistryBridgeManager`
243 Object that manages the interface between `Registry` and datastores.
244 butlerRoot : `str`, optional
245 New datastore root to use to override the configuration value.
246 """
248 defaultConfigFile: ClassVar[str | None] = None
249 """Path to configuration defaults. Accessed within the ``config`` resource
250 or relative to a search path. Can be None if no defaults specified.
251 """
253 containerKey: ClassVar[str | None] = None
254 """Name of the key containing a list of subconfigurations that also
255 need to be merged with defaults and will likely use different Python
256 datastore classes (but all using DatastoreConfig). Assumed to be a
257 list of configurations that can be represented in a DatastoreConfig
258 and containing a "cls" definition. None indicates that no containers
259 are expected in this Datastore."""
261 isEphemeral: bool = False
262 """Indicate whether this Datastore is ephemeral or not. An ephemeral
263 datastore is one where the contents of the datastore will not exist
264 across process restarts. This value can change per-instance."""
266 config: DatastoreConfig
267 """Configuration used to create Datastore."""
269 name: str
270 """Label associated with this Datastore."""
272 storageClassFactory: StorageClassFactory
273 """Factory for creating storage class instances from name."""
275 constraints: Constraints
276 """Constraints to apply when putting datasets into the datastore."""
278 # MyPy does not like this to be annotated as any kind of type, because
279 # it can't do static checking on type variables that can change at runtime.
280 IngestPrepData: ClassVar[Any] = IngestPrepData
281 """Helper base class for ingest implementations.
282 """
284 @classmethod
285 @abstractmethod
286 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
287 """Set filesystem-dependent config options for this datastore.
289 The options will be appropriate for a new empty repository with the
290 given root.
292 Parameters
293 ----------
294 root : `str`
295 Filesystem path to the root of the data repository.
296 config : `Config`
297 A `Config` to update. Only the subset understood by
298 this component will be updated. Will not expand
299 defaults.
300 full : `Config`
301 A complete config with all defaults expanded that can be
302 converted to a `DatastoreConfig`. Read-only and will not be
303 modified by this method.
304 Repository-specific options that should not be obtained
305 from defaults when Butler instances are constructed
306 should be copied from ``full`` to ``config``.
307 overwrite : `bool`, optional
308 If `False`, do not modify a value in ``config`` if the value
309 already exists. Default is always to overwrite with the provided
310 ``root``.
312 Notes
313 -----
314 If a keyword is explicitly defined in the supplied ``config`` it
315 will not be overridden by this method if ``overwrite`` is `False`.
316 This allows explicit values set in external configs to be retained.
317 """
318 raise NotImplementedError()
320 @staticmethod
321 def fromConfig(
322 config: Config,
323 bridgeManager: DatastoreRegistryBridgeManager,
324 butlerRoot: ResourcePathExpression | None = None,
325 ) -> Datastore:
326 """Create datastore from type specified in config file.
328 Parameters
329 ----------
330 config : `Config` or `~lsst.resources.ResourcePathExpression`
331 Configuration instance.
332 bridgeManager : `DatastoreRegistryBridgeManager`
333 Object that manages the interface between `Registry` and
334 datastores.
335 butlerRoot : `str`, optional
336 Butler root directory.
337 """
338 cls = doImportType(config["datastore", "cls"])
339 if not issubclass(cls, Datastore):
340 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore")
341 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)
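# A hedged usage sketch for fromConfig(): ``butler_config`` and
# ``bridge_manager`` are placeholders for objects obtained elsewhere; the
# concrete class named under the ``datastore.cls`` key in the configuration
# is what actually gets instantiated.
def _example_from_config(butler_config: Config, bridge_manager: DatastoreRegistryBridgeManager) -> Datastore:
    return Datastore.fromConfig(butler_config, bridgeManager=bridge_manager)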
343 def __init__(
344 self,
345 config: Config | ResourcePathExpression,
346 bridgeManager: DatastoreRegistryBridgeManager,
347 butlerRoot: ResourcePathExpression | None = None,
348 ):
349 self.config = DatastoreConfig(config)
350 self.name = "ABCDataStore"
351 self._transaction: DatastoreTransaction | None = None
353 # All Datastores need storage classes and constraints
354 self.storageClassFactory = StorageClassFactory()
356 # And read the constraints list
357 constraintsConfig = self.config.get("constraints")
358 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)
360 def __str__(self) -> str:
361 return self.name
363 def __repr__(self) -> str:
364 return self.name
366 @property
367 def names(self) -> tuple[str, ...]:
368 """Names associated with this datastore returned as a list.
370 Can be different to ``name`` for a chaining datastore.
371 """
372 # Default implementation returns solely the name itself
373 return (self.name,)
375 @contextlib.contextmanager
376 def transaction(self) -> Iterator[DatastoreTransaction]:
377 """Context manager supporting `Datastore` transactions.
379 Transactions can be nested, and are to be used in combination with
380 `Registry.transaction`.
381 """
382 self._transaction = DatastoreTransaction(self._transaction)
383 try:
384 yield self._transaction
385 except BaseException:
386 self._transaction.rollback()
387 raise
388 else:
389 self._transaction.commit()
390 self._transaction = self._transaction.parent
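# A hedged sketch of typical caller code for the transaction context manager
# above; ``datastore``, ``obj`` and ``ref`` are assumed to be created
# elsewhere, and put() is expected to register its own undo actions.
def _example_transactional_put(datastore: Datastore, obj: Any, ref: DatasetRef) -> None:
    with datastore.transaction():
        # Any exception raised before the block exits rolls back everything
        # registered inside the transaction (including nested transactions).
        datastore.put(obj, ref)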
392 @abstractmethod
393 def knows(self, ref: DatasetRef) -> bool:
394 """Check if the dataset is known to the datastore.
396 Does not check for existence of any artifact.
398 Parameters
399 ----------
400 ref : `DatasetRef`
401 Reference to the required dataset.
403 Returns
404 -------
405 exists : `bool`
406 `True` if the dataset is known to the datastore.
407 """
408 raise NotImplementedError()
410 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
411 """Check which of the given datasets are known to this datastore.
413 This is like ``mexists()`` but does not check that the artifacts exist.
415 Parameters
416 ----------
417 refs : iterable of `DatasetRef`
418 The datasets to check.
420 Returns
421 -------
422 exists : `dict`[`DatasetRef`, `bool`]
423 Mapping of dataset to boolean indicating whether the dataset
424 is known to the datastore.
425 """
426 # Non-optimized default calls knows() repeatedly.
427 return {ref: self.knows(ref) for ref in refs}
429 def mexists(
430 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
431 ) -> dict[DatasetRef, bool]:
432 """Check the existence of multiple datasets at once.
434 Parameters
435 ----------
436 refs : iterable of `DatasetRef`
437 The datasets to be checked.
438 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
439 Optional mapping of datastore artifact to existence. Updated by
440 this method with details of all artifacts tested. Can be `None`
441 if the caller is not interested.
443 Returns
444 -------
445 existence : `dict` of [`DatasetRef`, `bool`]
446 Mapping from dataset to boolean indicating existence.
447 """
448 existence: dict[DatasetRef, bool] = {}
449 # Non-optimized default.
450 for ref in refs:
451 existence[ref] = self.exists(ref)
452 return existence
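# A hedged usage sketch for the bulk existence check above; ``datastore`` and
# ``refs`` are assumed to exist, and the shared ``artifact_existence`` cache
# lets repeated calls avoid re-testing the same artifacts.
def _example_count_missing(datastore: Datastore, refs: Iterable[DatasetRef]) -> int:
    artifact_existence: dict[ResourcePath, bool] = {}
    existence = datastore.mexists(refs, artifact_existence=artifact_existence)
    return sum(1 for found in existence.values() if not found)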
454 @abstractmethod
455 def exists(self, datasetRef: DatasetRef) -> bool:
456 """Check if the dataset exists in the datastore.
458 Parameters
459 ----------
460 datasetRef : `DatasetRef`
461 Reference to the required dataset.
463 Returns
464 -------
465 exists : `bool`
466 `True` if the entity exists in the `Datastore`.
467 """
468 raise NotImplementedError("Must be implemented by subclass")
470 @abstractmethod
471 def get(
472 self,
473 datasetRef: DatasetRef,
474 parameters: Mapping[str, Any] | None = None,
475 storageClass: StorageClass | str | None = None,
476 ) -> Any:
477 """Load an `InMemoryDataset` from the store.
479 Parameters
480 ----------
481 datasetRef : `DatasetRef`
482 Reference to the required Dataset.
483 parameters : `dict`
484 `StorageClass`-specific parameters that specify a slice of the
485 Dataset to be loaded.
486 storageClass : `StorageClass` or `str`, optional
487 The storage class to be used to override the Python type
488 returned by this method. By default the returned type matches
489 the dataset type definition for this dataset. Specifying a
490 read `StorageClass` can force a different type to be returned.
491 This type must be compatible with the original type.
493 Returns
494 -------
495 inMemoryDataset : `object`
496 Requested Dataset or slice thereof as an InMemoryDataset.
497 """
498 raise NotImplementedError("Must be implemented by subclass")
500 @abstractmethod
501 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
502 """Write a `InMemoryDataset` with a given `DatasetRef` to the store.
504 Parameters
505 ----------
506 inMemoryDataset : `object`
507 The Dataset to store.
508 datasetRef : `DatasetRef`
509 Reference to the associated Dataset.
510 """
511 raise NotImplementedError("Must be implemented by subclass")
513 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
514 """Allow ingest transfer mode to be defaulted based on datasets.
516 Parameters
517 ----------
518 datasets : `FileDataset`
519 Each positional argument is a struct containing information about
520 a file to be ingested, including its path (either absolute or
521 relative to the datastore root, if applicable), a complete
522 `DatasetRef` (with ``dataset_id not None``), and optionally a
523 formatter class or its fully-qualified string name. If a formatter
524 is not provided, this method should populate that attribute with
525 the formatter the datastore would use for `put`. Subclasses are
526 also permitted to modify the path attribute (typically to put it
527 in what the datastore considers its standard form).
528 transfer : `str`, optional
529 How (and whether) the dataset should be added to the datastore.
530 See `ingest` for details of transfer modes.
532 Returns
533 -------
534 newTransfer : `str`
535 Transfer mode to use. Will be identical to the supplied transfer
536 mode unless "auto" is used.
537 """
538 if transfer != "auto":
539 return transfer
540 raise RuntimeError(f"{transfer} is not allowed without specialization.")
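# A sketch of the specialization a concrete datastore might provide for the
# hook above, mapping "auto" onto whichever mode is most natural for it; the
# choice of "link" here is purely illustrative.
def _example_override_transfer_mode(*datasets: FileDataset, transfer: str | None = None) -> str | None:
    return "link" if transfer == "auto" else transfer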
542 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
543 """Process datasets to identify which ones can be ingested.
545 Parameters
546 ----------
547 datasets : `FileDataset`
548 Each positional argument is a struct containing information about
549 a file to be ingested, including its path (either absolute or
550 relative to the datastore root, if applicable), a complete
551 `DatasetRef` (with ``dataset_id not None``), and optionally a
552 formatter class or its fully-qualified string name. If a formatter
553 is not provided, this method should populate that attribute with
554 the formatter the datastore would use for `put`. Subclasses are
555 also permitted to modify the path attribute (typically to put it
556 in what the datastore considers its standard form).
557 transfer : `str`, optional
558 How (and whether) the dataset should be added to the datastore.
559 See `ingest` for details of transfer modes.
561 Returns
562 -------
563 data : `IngestPrepData`
564 An instance of a subclass of `IngestPrepData`, used to pass
565 arbitrary data from `_prepIngest` to `_finishIngest`. This should
566 include only the datasets this datastore can actually ingest;
567 others should be silently ignored (`Datastore.ingest` will inspect
568 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
569 necessary).
571 Raises
572 ------
573 NotImplementedError
574 Raised if the datastore does not support the given transfer mode
575 (including the case where ingest is not supported at all).
576 FileNotFoundError
577 Raised if one of the given files does not exist.
578 FileExistsError
579 Raised if transfer is not `None` but the (internal) location the
580 file would be moved to is already occupied.
582 Notes
583 -----
584 This method (along with `_finishIngest`) should be implemented by
585 subclasses to provide ingest support instead of implementing `ingest`
586 directly.
588 `_prepIngest` should not modify the data repository or given files in
589 any way; all changes should be deferred to `_finishIngest`.
591 When possible, exceptions should be raised in `_prepIngest` instead of
592 `_finishIngest`. `NotImplementedError` exceptions that indicate that
593 the transfer mode is not supported must be raised by `_prepIngest`
594 instead of `_finishIngest`.
595 """
596 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
598 def _finishIngest(
599 self, prepData: IngestPrepData, *, transfer: str | None = None, record_validation_info: bool = True
600 ) -> None:
601 """Complete an ingest operation.
603 Parameters
604 ----------
605 prepData : `IngestPrepData`
606 An instance of a subclass of `IngestPrepData`. Guaranteed to be
607 the direct result of a call to `_prepIngest` on this datastore.
608 transfer : `str`, optional
609 How (and whether) the dataset should be added to the datastore.
610 See `ingest` for details of transfer modes.
611 record_validation_info : `bool`, optional
612 If `True`, the default, the datastore can record validation
613 information associated with the file. If `False` the datastore
614 will not attempt to track any information such as checksums
615 or file sizes. This can be useful if such information is tracked
616 in an external system or if the file is to be compressed in place.
617 It is up to the datastore whether this parameter is relevant.
619 Raises
620 ------
621 FileNotFoundError
622 Raised if one of the given files does not exist.
623 FileExistsError
624 Raised if transfer is not `None` but the (internal) location the
625 file would be moved to is already occupied.
627 Notes
628 -----
629 This method (along with `_prepIngest`) should be implemented by
630 subclasses to provide ingest support instead of implementing `ingest`
631 directly.
632 """
633 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
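# A compact sketch of the _prepIngest/_finishIngest contract described above,
# for a hypothetical datastore that only supports ``transfer=None`` and
# accepts every dataset type; a real implementation would also apply
# constraints and record file metadata.
def _example_prep_ingest(*datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
    if transfer is not None:
        raise NotImplementedError(f"Transfer mode {transfer!r} is not supported.")
    return IngestPrepData(ref for dataset in datasets for ref in dataset.refs)


def _example_finish_ingest(prepData: IngestPrepData, *, transfer: str | None = None) -> None:
    # With transfer=None the files are already in place; a real datastore
    # would insert its datastore records for ``prepData.refs`` here.
    pass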
635 def ingest(
636 self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True
637 ) -> None:
638 """Ingest one or more files into the datastore.
640 Parameters
641 ----------
642 datasets : `FileDataset`
643 Each positional argument is a struct containing information about
644 a file to be ingested, including its path (either absolute or
645 relative to the datastore root, if applicable), a complete
646 `DatasetRef` (with ``dataset_id not None``), and optionally a
647 formatter class or its fully-qualified string name. If a formatter
648 is not provided, the one the datastore would use for ``put`` on
649 that dataset is assumed.
650 transfer : `str`, optional
651 How (and whether) the dataset should be added to the datastore.
652 If `None` (default), the file must already be in a location
653 appropriate for the datastore (e.g. within its root directory),
654 and will not be modified. Other choices include "move", "copy",
655 "link", "symlink", "relsymlink", and "hardlink". "link" is a
656 special transfer mode that will first try to make a hardlink and
657 if that fails a symlink will be used instead. "relsymlink" creates
658 a relative symlink rather than use an absolute path.
659 Most datastores do not support all transfer modes.
660 "auto" is a special option that will let the
661 data store choose the most natural option for itself.
662 record_validation_info : `bool`, optional
663 If `True`, the default, the datastore can record validation
664 information associated with the file. If `False` the datastore
665 will not attempt to track any information such as checksums
666 or file sizes. This can be useful if such information is tracked
667 in an external system or if the file is to be compressed in place.
668 It is up to the datastore whether this parameter is relevant.
670 Raises
671 ------
672 NotImplementedError
673 Raised if the datastore does not support the given transfer mode
674 (including the case where ingest is not supported at all).
675 DatasetTypeNotSupportedError
676 Raised if one or more files to be ingested have a dataset type that
677 is not supported by the datastore.
678 FileNotFoundError
679 Raised if one of the given files does not exist.
680 FileExistsError
681 Raised if transfer is not `None` but the (internal) location the
682 file would be moved to is already occupied.
684 Notes
685 -----
686 Subclasses should implement `_prepIngest` and `_finishIngest` instead
687 of implementing `ingest` directly. Datastores that hold and
688 delegate to child datastores may want to call those methods as well.
690 Subclasses are encouraged to document their supported transfer modes
691 in their class documentation.
692 """
693 # Allow a datastore to select a default transfer mode
694 transfer = self._overrideTransferMode(*datasets, transfer=transfer)
695 prepData = self._prepIngest(*datasets, transfer=transfer)
696 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
697 if refs.keys() != prepData.refs.keys():
698 unsupported = refs.keys() - prepData.refs.keys()
699 # Group unsupported refs by DatasetType for an informative
700 # but still concise error message.
701 byDatasetType = defaultdict(list)
702 for datasetId in unsupported:
703 ref = refs[datasetId]
704 byDatasetType[ref.datasetType].append(ref)
705 raise DatasetTypeNotSupportedError(
706 "DatasetType(s) not supported in ingest: "
707 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
708 )
709 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info)
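# A hedged usage sketch for ingest(); the path and "copy" transfer mode are
# illustrative, ``refs`` is assumed to hold resolved DatasetRef instances,
# and FileDataset is assumed to accept ``path`` and ``refs`` keywords.
def _example_ingest(datastore: Datastore, refs: list[DatasetRef]) -> None:
    dataset = FileDataset(path="files/exposure.fits", refs=refs)
    datastore.ingest(dataset, transfer="copy")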
711 def transfer_from(
712 self,
713 source_datastore: Datastore,
714 refs: Iterable[DatasetRef],
715 transfer: str = "auto",
716 artifact_existence: dict[ResourcePath, bool] | None = None,
717 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
718 """Transfer dataset artifacts from another datastore to this one.
720 Parameters
721 ----------
722 source_datastore : `Datastore`
723 The datastore from which to transfer artifacts. That datastore
724 must be compatible with this datastore receiving the artifacts.
725 refs : iterable of `DatasetRef`
726 The datasets to transfer from the source datastore.
727 transfer : `str`, optional
728 How (and whether) the dataset should be added to the datastore.
729 Choices include "move", "copy",
730 "link", "symlink", "relsymlink", and "hardlink". "link" is a
731 special transfer mode that will first try to make a hardlink and
732 if that fails a symlink will be used instead. "relsymlink" creates
733 a relative symlink rather than use an absolute path.
734 Most datastores do not support all transfer modes.
735 "auto" (the default) is a special option that will let the
736 data store choose the most natural option for itself.
737 If the source location and transfer location are identical the
738 transfer mode will be ignored.
739 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
740 Optional mapping of datastore artifact to existence. Updated by
741 this method with details of all artifacts tested. Can be `None`
742 if the caller is not interested.
744 Returns
745 -------
746 accepted : `set` [`DatasetRef`]
747 The datasets that were transferred.
748 rejected : `set` [`DatasetRef`]
749 The datasets that were rejected due to a constraints violation.
751 Raises
752 ------
753 TypeError
754 Raised if the two datastores are not compatible.
755 """
756 if type(self) is not type(source_datastore):
757 raise TypeError(
758 f"Datastore mismatch between this datastore ({type(self)}) and the "
759 f"source datastore ({type(source_datastore)})."
760 )
762 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.")
764 def getManyURIs(
765 self,
766 refs: Iterable[DatasetRef],
767 predict: bool = False,
768 allow_missing: bool = False,
769 ) -> dict[DatasetRef, DatasetRefURIs]:
770 """Return URIs associated with many datasets.
772 Parameters
773 ----------
774 refs : iterable of `DatasetRef`
775 References to the required datasets.
776 predict : `bool`, optional
777 If the datastore does not know about a dataset, should it
778 return a predicted URI or not?
779 allow_missing : `bool`
780 If `False`, and `predict` is `False`, will raise if a `DatasetRef`
781 does not exist.
783 Returns
784 -------
785 URIs : `dict` of [`DatasetRef`, `DatasetRefURIs`]
786 A dict of primary and component URIs, indexed by the passed-in
787 refs.
789 Raises
790 ------
791 FileNotFoundError
792 A URI has been requested for a dataset that does not exist and
793 guessing is not allowed.
795 Notes
796 -----
797 In file-based datastores, `getManyURIs` does not check that the files
798 are really there; it assumes that if the datastore is aware of a file
799 then it actually exists.
800 """
801 uris: dict[DatasetRef, DatasetRefURIs] = {}
802 missing_refs = []
803 for ref in refs:
804 try:
805 uris[ref] = self.getURIs(ref, predict=predict)
806 except FileNotFoundError:
807 missing_refs.append(ref)
808 if missing_refs and not allow_missing:
809 raise FileNotFoundError(
810 "Missing {} refs from datastore out of {} and predict=False.".format(
811 num_missing := len(missing_refs), num_missing + len(uris)
812 )
813 )
814 return uris
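# A hedged usage sketch for the bulk URI lookup above, relying on the
# tuple-like behaviour of DatasetRefURIs; ``datastore`` and ``refs`` are
# assumed to exist already.
def _example_collect_uris(datastore: Datastore, refs: Iterable[DatasetRef]) -> list[ResourcePath]:
    artifacts: list[ResourcePath] = []
    for primaryURI, componentURIs in datastore.getManyURIs(refs, allow_missing=True).values():
        if primaryURI is not None:
            artifacts.append(primaryURI)
        artifacts.extend(componentURIs.values())
    return artifacts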
816 @abstractmethod
817 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
818 """Return URIs associated with dataset.
820 Parameters
821 ----------
822 datasetRef : `DatasetRef`
823 Reference to the required dataset.
824 predict : `bool`, optional
825 If the datastore does not know about the dataset, should it
826 return a predicted URI or not?
828 Returns
829 -------
830 uris : `DatasetRefURIs`
831 The URI to the primary artifact associated with this dataset (if
832 the dataset was disassembled within the datastore this may be
833 `None`), and the URIs to any components associated with the dataset
834 artifact (can be empty if there are no components).
835 """
836 raise NotImplementedError()
838 @abstractmethod
839 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
840 """URI to the Dataset.
842 Parameters
843 ----------
844 datasetRef : `DatasetRef`
845 Reference to the required Dataset.
846 predict : `bool`
847 If `True` attempt to predict the URI for a dataset if it does
848 not exist in datastore.
850 Returns
851 -------
852 uri : `lsst.resources.ResourcePath`
853 URI pointing to the Dataset within the datastore. If the
854 Dataset does not exist in the datastore, the URI may be a guess.
855 If the datastore does not have entities that relate well
856 to the concept of a URI the returned URI will be
857 descriptive. The returned URI is not guaranteed to be obtainable.
859 Raises
860 ------
861 FileNotFoundError
862 A URI has been requested for a dataset that does not exist and
863 guessing is not allowed.
864 """
865 raise NotImplementedError("Must be implemented by subclass")
867 @abstractmethod
868 def retrieveArtifacts(
869 self,
870 refs: Iterable[DatasetRef],
871 destination: ResourcePath,
872 transfer: str = "auto",
873 preserve_path: bool = True,
874 overwrite: bool = False,
875 ) -> list[ResourcePath]:
876 """Retrieve the artifacts associated with the supplied refs.
878 Parameters
879 ----------
880 refs : iterable of `DatasetRef`
881 The datasets for which artifacts are to be retrieved.
882 A single ref can result in multiple artifacts. The refs must
883 be resolved.
884 destination : `lsst.resources.ResourcePath`
885 Location to write the artifacts.
886 transfer : `str`, optional
887 Method to use to transfer the artifacts. Must be one of the options
888 supported by `lsst.resources.ResourcePath.transfer_from()`.
889 "move" is not allowed.
890 preserve_path : `bool`, optional
891 If `True` the full path of the artifact within the datastore
892 is preserved. If `False` the final file component of the path
893 is used.
894 overwrite : `bool`, optional
895 If `True` allow transfers to overwrite existing files at the
896 destination.
898 Returns
899 -------
900 targets : `list` of `lsst.resources.ResourcePath`
901 URIs of file artifacts in destination location. Order is not
902 preserved.
904 Notes
905 -----
906 For non-file datastores the artifacts written to the destination
907 may not match the representation inside the datastore. For example
908 a hierarchical data structure in a NoSQL database may well be stored
909 as a JSON file.
910 """
911 raise NotImplementedError()
913 @abstractmethod
914 def remove(self, datasetRef: DatasetRef) -> None:
915 """Indicate to the Datastore that a Dataset can be removed.
917 Parameters
918 ----------
919 datasetRef : `DatasetRef`
920 Reference to the required Dataset.
922 Raises
923 ------
924 FileNotFoundError
925 When Dataset does not exist.
927 Notes
928 -----
929 Some Datastores may implement this method as a silent no-op to
930 disable Dataset deletion through standard interfaces.
931 """
932 raise NotImplementedError("Must be implemented by subclass")
934 @abstractmethod
935 def forget(self, refs: Iterable[DatasetRef]) -> None:
936 """Indicate to the Datastore that it should remove all records of the
937 given datasets, without actually deleting them.
939 Parameters
940 ----------
941 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
942 References to the datasets being forgotten.
944 Notes
945 -----
946 Asking a datastore to forget a `DatasetRef` it does not hold should be
947 a silent no-op, not an error.
948 """
949 raise NotImplementedError("Must be implemented by subclass")
951 @abstractmethod
952 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
953 """Indicate to the Datastore that a Dataset can be moved to the trash.
955 Parameters
956 ----------
957 ref : `DatasetRef` or iterable thereof
958 Reference(s) to the required Dataset.
959 ignore_errors : `bool`, optional
960 Determine whether errors should be ignored. When multiple
961 refs are being trashed there will be no per-ref check.
963 Raises
964 ------
965 FileNotFoundError
966 When Dataset does not exist and errors are not ignored. Only
967 checked if a single ref is supplied (and not in a list).
969 Notes
970 -----
971 Some Datastores may implement this method as a silent no-op to
972 disable Dataset deletion through standard interfaces.
973 """
974 raise NotImplementedError("Must be implemented by subclass")
976 @abstractmethod
977 def emptyTrash(self, ignore_errors: bool = True) -> None:
978 """Remove all datasets from the trash.
980 Parameters
981 ----------
982 ignore_errors : `bool`, optional
983 Determine whether errors should be ignored.
985 Notes
986 -----
987 Some Datastores may implement this method as a silent no-op to
988 disable Dataset deletion through standard interfaces.
989 """
990 raise NotImplementedError("Must be implemented by subclass")
992 @abstractmethod
993 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
994 """Transfer a dataset from another datastore to this datastore.
996 Parameters
997 ----------
998 inputDatastore : `Datastore`
999 The external `Datastore` from which to retrieve the Dataset.
1000 datasetRef : `DatasetRef`
1001 Reference to the required Dataset.
1002 """
1003 raise NotImplementedError("Must be implemented by subclass")
1005 def export(
1006 self,
1007 refs: Iterable[DatasetRef],
1008 *,
1009 directory: ResourcePathExpression | None = None,
1010 transfer: str | None = "auto",
1011 ) -> Iterable[FileDataset]:
1012 """Export datasets for transfer to another data repository.
1014 Parameters
1015 ----------
1016 refs : iterable of `DatasetRef`
1017 Dataset references to be exported.
1018 directory : `str`, optional
1019 Path to a directory that should contain files corresponding to
1020 output datasets. Ignored if ``transfer`` is explicitly `None`.
1021 transfer : `str`, optional
1022 Mode that should be used to move datasets out of the repository.
1023 Valid options are the same as those of the ``transfer`` argument
1024 to ``ingest``, and datastores may similarly signal that a transfer
1025 mode is not supported by raising `NotImplementedError`. If "auto"
1026 is given and no ``directory`` is specified, `None` will be
1027 implied.
1029 Returns
1030 -------
1031 datasets : iterable of `FileDataset`
1032 Structs containing information about the exported datasets, in the
1033 same order as ``refs``.
1035 Raises
1036 ------
1037 NotImplementedError
1038 Raised if the given transfer mode is not supported.
1039 """
1040 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
1042 @abstractmethod
1043 def validateConfiguration(
1044 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
1045 ) -> None:
1046 """Validate some of the configuration for this datastore.
1048 Parameters
1049 ----------
1050 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1051 Entities to test against this configuration. Can be differing
1052 types.
1053 logFailures : `bool`, optional
1054 If `True`, output a log message for every validation error
1055 detected.
1057 Raises
1058 ------
1059 DatastoreValidationError
1060 Raised if there is a validation problem with a configuration.
1062 Notes
1063 -----
1064 Which parts of the configuration are validated is at the discretion
1065 of each Datastore implementation.
1066 """
1067 raise NotImplementedError("Must be implemented by subclass")
1069 @abstractmethod
1070 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
1071 """Validate a specific look up key with supplied entity.
1073 Parameters
1074 ----------
1075 lookupKey : `LookupKey`
1076 Key to use to retrieve information from the datastore
1077 configuration.
1078 entity : `DatasetRef`, `DatasetType`, or `StorageClass`
1079 Entity to compare with configuration retrieved using the
1080 specified lookup key.
1082 Raises
1083 ------
1084 DatastoreValidationError
1085 Raised if there is a problem with the combination of entity
1086 and lookup key.
1088 Notes
1089 -----
1090 Bypasses the normal selection priorities by allowing a key that
1091 would normally not be selected to be validated.
1092 """
1093 raise NotImplementedError("Must be implemented by subclass")
1095 @abstractmethod
1096 def getLookupKeys(self) -> set[LookupKey]:
1097 """Return all the lookup keys relevant to this datastore.
1099 Returns
1100 -------
1101 keys : `set` of `LookupKey`
1102 The keys stored internally for looking up information based
1103 on `DatasetType` name or `StorageClass`.
1104 """
1105 raise NotImplementedError("Must be implemented by subclass")
1107 def needs_expanded_data_ids(
1108 self,
1109 transfer: str | None,
1110 entity: DatasetRef | DatasetType | StorageClass | None = None,
1111 ) -> bool:
1112 """Test whether this datastore needs expanded data IDs to ingest.
1114 Parameters
1115 ----------
1116 transfer : `str` or `None`
1117 Transfer mode for ingest.
1118 entity : `DatasetRef`, `DatasetType`, or `StorageClass`, optional
1119 Object representing what will be ingested. If not provided (or not
1120 specific enough), `True` may be returned even if expanded data
1121 IDs aren't necessary.
1123 Returns
1124 -------
1125 needed : `bool`
1126 If `True`, expanded data IDs may be needed. `False` only if
1127 expansion definitely isn't necessary.
1128 """
1129 return True
1131 @abstractmethod
1132 def import_records(
1133 self,
1134 data: Mapping[str, DatastoreRecordData],
1135 ) -> None:
1136 """Import datastore location and record data from an in-memory data
1137 structure.
1139 Parameters
1140 ----------
1141 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
1142 Datastore records indexed by datastore name. May contain data for
1143 other `Datastore` instances (generally because they are chained to
1144 this one), which should be ignored.
1146 Notes
1147 -----
1148 Implementations should generally not check that any external resources
1149 (e.g. files) referred to by these records actually exist, for
1150 performance reasons; we expect higher-level code to guarantee that they
1151 do.
1153 Implementations are responsible for calling
1154 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations``
1155 where the key is in `names`, as well as loading any opaque table data.
1156 """
1157 raise NotImplementedError()
1159 @abstractmethod
1160 def export_records(
1161 self,
1162 refs: Iterable[DatasetIdRef],
1163 ) -> Mapping[str, DatastoreRecordData]:
1164 """Export datastore records and locations to an in-memory data
1165 structure.
1167 Parameters
1168 ----------
1169 refs : `~collections.abc.Iterable` [ `DatasetIdRef` ]
1170 Datasets to save. This may include datasets not known to this
1171 datastore, which should be ignored.
1173 Returns
1174 -------
1175 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
1176 Exported datastore records indexed by datastore name.
1177 """
1178 raise NotImplementedError()
1180 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
1181 """Specify a method that can be used by datastore to retrieve
1182 registry-defined dataset type.
1184 Parameters
1185 ----------
1186 method : `~collections.abc.Callable` | `None`
1187 Method that takes the name of a dataset type and returns the
1188 corresponding `DatasetType` instance as defined in the Registry. If
1189 the dataset type name is not known to the registry, `None` is returned.
1191 Notes
1192 -----
1193 This method is only needed for a Datastore supporting a "trusted" mode,
1194 when it does not have access to datastore records and needs to
1195 guess the dataset location based on the stored dataset type.
1196 """
1197 pass
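# A hedged sketch of how higher-level code might wire up the hook above;
# ``dataset_types`` is an assumed, pre-fetched mapping of registry dataset
# types by name rather than a live registry query.
def _example_enable_trusted_mode(datastore: Datastore, dataset_types: dict[str, DatasetType]) -> None:
    # dict.get returns None for unknown names, matching the documented
    # contract of the retrieval method.
    datastore.set_retrieve_dataset_type_method(dataset_types.get)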