Coverage for python/lsst/daf/butler/core/datastore.py: 51%
210 statements
coverage.py v7.2.7, created at 2023-06-15 09:13 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Support for generic data stores."""

from __future__ import annotations

__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs")

import contextlib
import dataclasses
import logging
from abc import ABCMeta, abstractmethod
from collections import abc, defaultdict
from collections.abc import Callable, Iterable, Iterator, Mapping
from typing import TYPE_CHECKING, Any, ClassVar

from lsst.utils import doImportType

from .config import Config, ConfigSubset
from .constraints import Constraints
from .exceptions import DatasetTypeNotSupportedError, ValidationError
from .fileDataset import FileDataset
from .storageClass import StorageClassFactory

if TYPE_CHECKING:
    from lsst.resources import ResourcePath, ResourcePathExpression

    from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
    from .configSupport import LookupKey
    from .datasets import DatasetRef, DatasetType
    from .datastoreRecordData import DatastoreRecordData
    from .storageClass import StorageClass


class DatastoreConfig(ConfigSubset):
    """Configuration for Datastores."""

    component = "datastore"
    requiredKeys = ("cls",)
    defaultConfigFile = "datastore.yaml"


class DatastoreValidationError(ValidationError):
    """There is a problem with the Datastore configuration."""

    pass


@dataclasses.dataclass(frozen=True)
class Event:
    __slots__ = {"name", "undoFunc", "args", "kwargs"}
    name: str
    undoFunc: Callable
    args: tuple
    kwargs: dict


class IngestPrepData:
    """A helper base class for `Datastore` ingest implementations.

    Datastore implementations will generally need a custom implementation of
    this class.

    Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
    import.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        References for the datasets that can be ingested by this datastore.
    """

    def __init__(self, refs: Iterable[DatasetRef]):
        self.refs = {ref.id: ref for ref in refs}


class DatastoreTransaction:
    """Keeps a log of `Datastore` activity and allows rollback.

    Parameters
    ----------
    parent : `DatastoreTransaction`, optional
        The parent transaction (if any).
    """

    Event: ClassVar[type] = Event

    parent: DatastoreTransaction | None
    """The parent transaction. (`DatastoreTransaction`, optional)"""

    def __init__(self, parent: DatastoreTransaction | None = None):
        self.parent = parent
        self._log: list[Event] = []

    def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
        """Register event with undo function.

        Parameters
        ----------
        name : `str`
            Name of the event.
        undoFunc : func
            Function to undo this event.
        args : `tuple`
            Positional arguments to `undoFunc`.
        **kwargs
            Keyword arguments to `undoFunc`.
        """
        self._log.append(self.Event(name, undoFunc, args, kwargs))

    @contextlib.contextmanager
    def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
        """Register undo function if nested operation succeeds.

        Calls `registerUndo`.

        This can be used to wrap individual undo-able statements within a
        `DatastoreTransaction` block. Multiple statements that can fail
        separately should not be part of the same `undoWith` block.

        All arguments are forwarded directly to `registerUndo`.
        """
        try:
            yield None
        except BaseException:
            raise
        else:
            self.registerUndo(name, undoFunc, *args, **kwargs)
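
    # Illustrative sketch (editor-added, not part of the original module):
    # ``undoWith`` registers the undo only if the wrapped statement succeeds,
    # so a failure inside the block does not leave a bogus undo action in the
    # log. The ``os.remove`` call, ``path`` variable, and ``write_the_file``
    # helper below are hypothetical stand-ins:
    #
    #     with transaction.undoWith("write file", os.remove, path):
    #         write_the_file(path)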

    def rollback(self) -> None:
        """Roll back all events in this transaction."""
        log = logging.getLogger(__name__)
        while self._log:
            ev = self._log.pop()
            try:
                log.debug(
                    "Rolling back transaction: %s: %s(%s,%s)",
                    ev.name,
                    ev.undoFunc,
                    ",".join(str(a) for a in ev.args),
                    ",".join(f"{k}={v}" for k, v in ev.kwargs.items()),
                )
            except Exception:
                # In case we had a problem in stringification of arguments.
                log.warning("Rolling back transaction: %s", ev.name)
            try:
                ev.undoFunc(*ev.args, **ev.kwargs)
            except BaseException as e:
                # Deliberately swallow any error that may occur while
                # unrolling.
                log.warning("Exception: %s caught while unrolling: %s", e, ev.name)
                pass

    def commit(self) -> None:
        """Commit this transaction."""
        if self.parent is None:
            # Just forget about the events; they have already happened.
            return
        else:
            # We may still want to roll back the events from this transaction
            # as part of the parent, so hand them off to it.
            self.parent._log.extend(self._log)
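

# Illustrative sketch (editor-added, not part of the original module): a
# minimal demonstration of pairing an action with its undo callback so that
# ``rollback`` can restore prior state. The in-memory ``store`` dict and the
# ``_unstore`` helper are hypothetical and exist only for this example.
def _example_transaction_rollback() -> None:
    store: dict[str, bytes] = {}

    def _unstore(key: str) -> None:
        # Undo helper: remove the entry that the "put" wrote.
        store.pop(key, None)

    txn = DatastoreTransaction()
    store["a"] = b"payload"
    txn.registerUndo("put", _unstore, "a")
    # A later step failed; undo everything logged so far.
    txn.rollback()
    assert "a" not in store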


@dataclasses.dataclass
class DatasetRefURIs(abc.Sequence):
    """Represents the primary and component ResourcePath(s) associated with a
    DatasetRef.

    This is used in places where its members used to be represented as a tuple
    `(primaryURI, componentURIs)`. To maintain backward compatibility this
    inherits from Sequence and so instances can be treated as a two-item
    tuple.
    """

    def __init__(
        self,
        primaryURI: ResourcePath | None = None,
        componentURIs: dict[str, ResourcePath] | None = None,
    ):
        self.primaryURI = primaryURI
        """The URI to the primary artifact associated with this dataset. If the
        dataset was disassembled within the datastore this may be `None`.
        """

        self.componentURIs = componentURIs or {}
        """The URIs to any components associated with the dataset artifact
        indexed by component name. This can be empty if there are no
        components.
        """

    def __getitem__(self, index: Any) -> Any:
        """Get primaryURI and componentURIs by index.

        Provides support for tuple-like access.
        """
        if index == 0:
            return self.primaryURI
        elif index == 1:
            return self.componentURIs
        raise IndexError("list index out of range")

    def __len__(self) -> int:
        """Get the number of data members.

        Provides support for tuple-like access.
        """
        return 2

    def __repr__(self) -> str:
        return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
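

# Illustrative sketch (editor-added, not part of the original module):
# ``DatasetRefURIs`` behaves like a two-item sequence, so older code that
# expected a ``(primaryURI, componentURIs)`` tuple keeps working. The URI
# below is a made-up example value.
def _example_dataset_ref_uris_unpacking() -> None:
    from lsst.resources import ResourcePath

    uris = DatasetRefURIs(ResourcePath("file:///tmp/example.fits"), {})
    primary, components = uris  # tuple-style unpacking still works
    assert primary is uris.primaryURI
    assert components == uris.componentURIs
    assert len(uris) == 2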


class Datastore(metaclass=ABCMeta):
    """Datastore interface.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Load configuration either from an existing config instance or by
        referring to a configuration file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.
    """

    defaultConfigFile: ClassVar[str | None] = None
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    containerKey: ClassVar[str | None] = None
    """Name of the key containing a list of subconfigurations that also
    need to be merged with defaults and will likely use different Python
    datastore classes (but all using DatastoreConfig). Assumed to be a
    list of configurations that can be represented in a DatastoreConfig
    and containing a "cls" definition. None indicates that no containers
    are expected in this Datastore."""

    isEphemeral: bool = False
    """Indicate whether this Datastore is ephemeral or not. An ephemeral
    datastore is one where the contents of the datastore will not exist
    across process restarts. This value can change per-instance."""

    config: DatastoreConfig
    """Configuration used to create Datastore."""

    name: str
    """Label associated with this Datastore."""

    storageClassFactory: StorageClassFactory
    """Factory for creating storage class instances from name."""

    constraints: Constraints
    """Constraints to apply when putting datasets into the datastore."""

    # MyPy does not like for this to be annotated as any kind of type, because
    # it can't do static checking on type variables that can change at runtime.
    IngestPrepData: ClassVar[Any] = IngestPrepData
    """Helper base class for ingest implementations.
    """

    @classmethod
    @abstractmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set filesystem-dependent config options for this datastore.

        The options will be appropriate for a new empty repository with the
        given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        raise NotImplementedError()

    @staticmethod
    def fromConfig(
        config: Config,
        bridgeManager: DatastoreRegistryBridgeManager,
        butlerRoot: ResourcePathExpression | None = None,
    ) -> Datastore:
        """Create datastore from type specified in config file.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            Configuration instance.
        bridgeManager : `DatastoreRegistryBridgeManager`
            Object that manages the interface between `Registry` and
            datastores.
        butlerRoot : `str`, optional
            Butler root directory.
        """
        cls = doImportType(config["datastore", "cls"])
        if not issubclass(cls, Datastore):
            raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore")
        return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)
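
    # Illustrative sketch (editor-added, not part of the original module):
    # ``fromConfig`` reads the "datastore.cls" entry, imports that class, and
    # delegates construction to it. The configuration file name,
    # ``bridgeManager`` object, and ``root`` path below are hypothetical and
    # would normally come from the surrounding repository setup:
    #
    #     config = DatastoreConfig("datastore.yaml")
    #     datastore = Datastore.fromConfig(config, bridgeManager, butlerRoot=root)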

    def __init__(
        self,
        config: Config | ResourcePathExpression,
        bridgeManager: DatastoreRegistryBridgeManager,
        butlerRoot: ResourcePathExpression | None = None,
    ):
        self.config = DatastoreConfig(config)
        self.name = "ABCDataStore"
        self._transaction: DatastoreTransaction | None = None

        # All Datastores need storage classes and constraints
        self.storageClassFactory = StorageClassFactory()

        # And read the constraints list
        constraintsConfig = self.config.get("constraints")
        self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return self.name

    @property
    def names(self) -> tuple[str, ...]:
        """Names associated with this datastore returned as a tuple.

        Can be different from ``name`` for a chaining datastore.
        """
        # Default implementation returns solely the name itself
        return (self.name,)

    @contextlib.contextmanager
    def transaction(self) -> Iterator[DatastoreTransaction]:
        """Context manager supporting `Datastore` transactions.

        Transactions can be nested, and are to be used in combination with
        `Registry.transaction`.
        """
        self._transaction = DatastoreTransaction(self._transaction)
        try:
            yield self._transaction
        except BaseException:
            self._transaction.rollback()
            raise
        else:
            self._transaction.commit()
        self._transaction = self._transaction.parent
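
    # Illustrative sketch (editor-added, not part of the original module):
    # a concrete datastore's ``put`` implementation would typically register
    # its undo action inside this context manager. The ``_write_artifact``
    # and ``_delete_artifact`` helpers are hypothetical:
    #
    #     with self.transaction() as txn:
    #         location = self._write_artifact(inMemoryDataset, datasetRef)
    #         txn.registerUndo("put", self._delete_artifact, location)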

    @abstractmethod
    def knows(self, ref: DatasetRef) -> bool:
        """Check if the dataset is known to the datastore.

        Does not check for existence of any artifact.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the dataset is known to the datastore.
        """
        raise NotImplementedError()

    def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
        """Check which of the given datasets are known to this datastore.

        This is like ``mexists()`` but does not check that the file exists.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets to check.

        Returns
        -------
        exists : `dict`[`DatasetRef`, `bool`]
            Mapping of dataset to boolean indicating whether the dataset
            is known to the datastore.
        """
        # Non-optimized default calls knows() repeatedly.
        return {ref: self.knows(ref) for ref in refs}

    def mexists(
        self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
    ) -> dict[DatasetRef, bool]:
        """Check the existence of multiple datasets at once.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets to be checked.
        artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
            Optional mapping of datastore artifact to existence. Updated by
            this method with details of all artifacts tested. Can be `None`
            if the caller is not interested.

        Returns
        -------
        existence : `dict` of [`DatasetRef`, `bool`]
            Mapping from dataset to boolean indicating existence.
        """
        existence: dict[DatasetRef, bool] = {}
        # Non-optimized default.
        for ref in refs:
            existence[ref] = self.exists(ref)
        return existence
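
    # Illustrative sketch (editor-added, not part of the original module):
    # callers checking many datasets can share one artifact-existence cache
    # across calls (or across chained datastores) so each artifact is only
    # tested once. ``datastore`` and ``refs`` below are hypothetical:
    #
    #     artifact_existence: dict[ResourcePath, bool] = {}
    #     existence = datastore.mexists(refs, artifact_existence=artifact_existence)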

    @abstractmethod
    def exists(self, datasetRef: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def get(
        self,
        datasetRef: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        """Load an `InMemoryDataset` from the store.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify a slice of the
            Dataset to be loaded.
        storageClass : `StorageClass` or `str`, optional
            The storage class to be used to override the Python type
            returned by this method. By default the returned type matches
            the dataset type definition for this dataset. Specifying a
            read `StorageClass` can force a different type to be returned.
            This type must be compatible with the original type.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
        """Write an `InMemoryDataset` with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        datasetRef : `DatasetRef`
            Reference to the associated Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
        """Allow ingest transfer mode to be defaulted based on datasets.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`. Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        newTransfer : `str`
            Transfer mode to use. Will be identical to the supplied transfer
            mode unless "auto" is used.
        """
        if transfer != "auto":
            return transfer
        raise RuntimeError(f"{transfer} is not allowed without specialization.")

    def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
        """Process datasets to identify which ones can be ingested.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`. Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        data : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`, used to pass
            arbitrary data from `_prepIngest` to `_finishIngest`. This should
            include only the datasets this datastore can actually ingest;
            others should be silently ignored (`Datastore.ingest` will inspect
            `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
            necessary).

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_finishIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.

        `_prepIngest` should not modify the data repository or given files in
        any way; all changes should be deferred to `_finishIngest`.

        When possible, exceptions should be raised in `_prepIngest` instead of
        `_finishIngest`. `NotImplementedError` exceptions that indicate that
        the transfer mode is not supported must be raised by `_prepIngest`
        instead of `_finishIngest`.
        """
        raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")

    def _finishIngest(
        self, prepData: IngestPrepData, *, transfer: str | None = None, record_validation_info: bool = True
    ) -> None:
        """Complete an ingest operation.

        Parameters
        ----------
        prepData : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`. Guaranteed to be
            the direct result of a call to `_prepIngest` on this datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.
        record_validation_info : `bool`, optional
            If `True`, the default, the datastore can record validation
            information associated with the file. If `False` the datastore
            will not attempt to track any information such as checksums
            or file sizes. This can be useful if such information is tracked
            in an external system or if the file is to be compressed in place.
            It is up to the datastore whether this parameter is relevant.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_prepIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.
        """
        raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")

    def ingest(
        self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True
    ) -> None:
        """Ingest one or more files into the datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name. If a formatter
            is not provided, the one the datastore would use for ``put`` on
            that dataset is assumed.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified. Other choices include "move", "copy",
            "link", "symlink", "relsymlink", and "hardlink". "link" is a
            special transfer mode that will first try to make a hardlink and
            if that fails a symlink will be used instead. "relsymlink" creates
            a relative symlink rather than use an absolute path.
            Most datastores do not support all transfer modes.
            "auto" is a special option that will let the
            data store choose the most natural option for itself.
        record_validation_info : `bool`, optional
            If `True`, the default, the datastore can record validation
            information associated with the file. If `False` the datastore
            will not attempt to track any information such as checksums
            or file sizes. This can be useful if such information is tracked
            in an external system or if the file is to be compressed in place.
            It is up to the datastore whether this parameter is relevant.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type that
            is not supported by the datastore.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        Subclasses should implement `_prepIngest` and `_finishIngest` instead
        of implementing `ingest` directly. Datastores that hold and
        delegate to child datastores may want to call those methods as well.

        Subclasses are encouraged to document their supported transfer modes
        in their class documentation.
        """
        # Allow a datastore to select a default transfer mode
        transfer = self._overrideTransferMode(*datasets, transfer=transfer)
        prepData = self._prepIngest(*datasets, transfer=transfer)
        refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
        if refs.keys() != prepData.refs.keys():
            unsupported = refs.keys() - prepData.refs.keys()
            # Group unsupported refs by DatasetType for an informative
            # but still concise error message.
            byDatasetType = defaultdict(list)
            for datasetId in unsupported:
                ref = refs[datasetId]
                byDatasetType[ref.datasetType].append(ref)
            raise DatasetTypeNotSupportedError(
                "DatasetType(s) not supported in ingest: "
                + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
            )
        self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info)
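
    # Illustrative sketch (editor-added, not part of the original module):
    # a caller ingests existing files by wrapping each one in a
    # ``FileDataset`` carrying a resolved ``DatasetRef``; the concrete
    # datastore supplies the behaviour through ``_prepIngest`` and
    # ``_finishIngest``. The path and ``ref`` below are hypothetical:
    #
    #     datastore.ingest(
    #         FileDataset(path="data/exposure.fits", refs=[ref]),
    #         transfer="copy",
    #     )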

    def transfer_from(
        self,
        source_datastore: Datastore,
        refs: Iterable[DatasetRef],
        transfer: str = "auto",
        artifact_existence: dict[ResourcePath, bool] | None = None,
    ) -> tuple[set[DatasetRef], set[DatasetRef]]:
        """Transfer dataset artifacts from another datastore to this one.

        Parameters
        ----------
        source_datastore : `Datastore`
            The datastore from which to transfer artifacts. That datastore
            must be compatible with this datastore receiving the artifacts.
        refs : iterable of `DatasetRef`
            The datasets to transfer from the source datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            Choices include "move", "copy",
            "link", "symlink", "relsymlink", and "hardlink". "link" is a
            special transfer mode that will first try to make a hardlink and
            if that fails a symlink will be used instead. "relsymlink" creates
            a relative symlink rather than use an absolute path.
            Most datastores do not support all transfer modes.
            "auto" (the default) is a special option that will let the
            data store choose the most natural option for itself.
            If the source location and transfer location are identical the
            transfer mode will be ignored.
        artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
            Optional mapping of datastore artifact to existence. Updated by
            this method with details of all artifacts tested. Can be `None`
            if the caller is not interested.

        Returns
        -------
        accepted : `set` [`DatasetRef`]
            The datasets that were transferred.
        rejected : `set` [`DatasetRef`]
            The datasets that were rejected due to a constraints violation.

        Raises
        ------
        TypeError
            Raised if the two datastores are not compatible.
        """
        if type(self) is not type(source_datastore):
            raise TypeError(
                f"Datastore mismatch between this datastore ({type(self)}) and the "
                f"source datastore ({type(source_datastore)})."
            )

        raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.")

    def getManyURIs(
        self,
        refs: Iterable[DatasetRef],
        predict: bool = False,
        allow_missing: bool = False,
    ) -> dict[DatasetRef, DatasetRefURIs]:
        """Return URIs associated with many datasets.

        Parameters
        ----------
        refs : iterable of `DatasetIdRef`
            References to the required datasets.
        predict : `bool`, optional
            If the datastore does not know about a dataset, should it
            return a predicted URI or not?
        allow_missing : `bool`
            If `False`, and `predict` is `False`, will raise if a `DatasetRef`
            does not exist.

        Returns
        -------
        URIs : `dict` of [`DatasetRef`, `DatasetRefURIs`]
            A dict of primary and component URIs, indexed by the passed-in
            refs.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.

        Notes
        -----
        In file-based datastores, `getManyURIs` does not check that the files
        really exist; it assumes that if the datastore is aware of a file then
        it actually exists.
        """
        uris: dict[DatasetRef, DatasetRefURIs] = {}
        missing_refs = []
        for ref in refs:
            try:
                uris[ref] = self.getURIs(ref, predict=predict)
            except FileNotFoundError:
                missing_refs.append(ref)
        if missing_refs and not allow_missing:
            raise FileNotFoundError(
                "Missing {} refs from datastore out of {} and predict=False.".format(
                    num_missing := len(missing_refs), num_missing + len(uris)
                )
            )
        return uris

    @abstractmethod
    def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
        """Return URIs associated with dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        uris : `DatasetRefURIs`
            The URI to the primary artifact associated with this dataset (if
            the dataset was disassembled within the datastore this may be
            `None`), and the URIs to any components associated with the dataset
            artifact (can be empty if there are no components).
        """
        raise NotImplementedError()

    @abstractmethod
    def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
        """URI to the Dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True` attempt to predict the URI for a dataset if it does
            not exist in datastore.

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, the URI may be a guess.
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def retrieveArtifacts(
        self,
        refs: Iterable[DatasetRef],
        destination: ResourcePath,
        transfer: str = "auto",
        preserve_path: bool = True,
        overwrite: bool = False,
    ) -> list[ResourcePath]:
        """Retrieve the artifacts associated with the supplied refs.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets for which artifacts are to be retrieved.
            A single ref can result in multiple artifacts. The refs must
            be resolved.
        destination : `lsst.resources.ResourcePath`
            Location to write the artifacts.
        transfer : `str`, optional
            Method to use to transfer the artifacts. Must be one of the options
            supported by `lsst.resources.ResourcePath.transfer_from()`.
            "move" is not allowed.
        preserve_path : `bool`, optional
            If `True` the full path of the artifact within the datastore
            is preserved. If `False` the final file component of the path
            is used.
        overwrite : `bool`, optional
            If `True` allow transfers to overwrite existing files at the
            destination.

        Returns
        -------
        targets : `list` of `lsst.resources.ResourcePath`
            URIs of file artifacts in destination location. Order is not
            preserved.

        Notes
        -----
        For non-file datastores the artifacts written to the destination
        may not match the representation inside the datastore. For example
        a hierarchical data structure in a NoSQL database may well be stored
        as a JSON file.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, datasetRef: DatasetRef) -> None:
        """Indicate to the Datastore that a Dataset can be removed.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def forget(self, refs: Iterable[DatasetRef]) -> None:
        """Indicate to the Datastore that it should remove all records of the
        given datasets, without actually deleting them.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the datasets being forgotten.

        Notes
        -----
        Asking a datastore to forget a `DatasetRef` it does not hold should be
        a silent no-op, not an error.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
        """Indicate to the Datastore that a Dataset can be moved to the trash.

        Parameters
        ----------
        ref : `DatasetRef` or iterable thereof
            Reference(s) to the required Dataset.
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored. When multiple
            refs are being trashed there will be no per-ref check.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist and errors are not ignored. Only
            checked if a single ref is supplied (and not in a list).

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def emptyTrash(self, ignore_errors: bool = True) -> None:
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Determine whether errors should be ignored.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
        """Transfer a dataset from another datastore to this datastore.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def export(
        self,
        refs: Iterable[DatasetRef],
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = "auto",
    ) -> Iterable[FileDataset]:
        """Export datasets for transfer to another data repository.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            Dataset references to be exported.
        directory : `str`, optional
            Path to a directory that should contain files corresponding to
            output datasets. Ignored if ``transfer`` is explicitly `None`.
        transfer : `str`, optional
            Mode that should be used to move datasets out of the repository.
            Valid options are the same as those of the ``transfer`` argument
            to ``ingest``, and datastores may similarly signal that a transfer
            mode is not supported by raising `NotImplementedError`. If "auto"
            is given and no ``directory`` is specified, `None` will be
            implied.

        Returns
        -------
        datasets : iterable of `FileDataset`
            Structs containing information about the exported datasets, in the
            same order as ``refs``.

        Raises
        ------
        NotImplementedError
            Raised if the given transfer mode is not supported.
        """
        raise NotImplementedError(f"Transfer mode {transfer} not supported.")

    @abstractmethod
    def validateConfiguration(
        self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
    ) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.

        Notes
        -----
        Which parts of the configuration are validated is at the discretion
        of each Datastore implementation.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
        """Validate a specific look up key with supplied entity.

        Parameters
        ----------
        lookupKey : `LookupKey`
            Key to use to retrieve information from the datastore
            configuration.
        entity : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entity to compare with configuration retrieved using the
            specified lookup key.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a problem with the combination of entity
            and lookup key.

        Notes
        -----
        Bypasses the normal selection priorities by allowing a key that
        would normally not be selected to be validated.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def getLookupKeys(self) -> set[LookupKey]:
        """Return all the lookup keys relevant to this datastore.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys stored internally for looking up information based
            on `DatasetType` name or `StorageClass`.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def needs_expanded_data_ids(
        self,
        transfer: str | None,
        entity: DatasetRef | DatasetType | StorageClass | None = None,
    ) -> bool:
        """Test whether this datastore needs expanded data IDs to ingest.

        Parameters
        ----------
        transfer : `str` or `None`
            Transfer mode for ingest.
        entity, optional
            Object representing what will be ingested. If not provided (or not
            specific enough), `True` may be returned even if expanded data
            IDs aren't necessary.

        Returns
        -------
        needed : `bool`
            If `True`, expanded data IDs may be needed. `False` only if
            expansion definitely isn't necessary.
        """
        return True

    @abstractmethod
    def import_records(
        self,
        data: Mapping[str, DatastoreRecordData],
    ) -> None:
        """Import datastore location and record data from an in-memory data
        structure.

        Parameters
        ----------
        data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
            Datastore records indexed by datastore name. May contain data for
            other `Datastore` instances (generally because they are chained to
            this one), which should be ignored.

        Notes
        -----
        Implementations should generally not check that any external resources
        (e.g. files) referred to by these records actually exist, for
        performance reasons; we expect higher-level code to guarantee that they
        do.

        Implementations are responsible for calling
        `DatastoreRegistryBridge.insert` on all datasets in ``data.locations``
        where the key is in `names`, as well as loading any opaque table data.
        """
        raise NotImplementedError()

    @abstractmethod
    def export_records(
        self,
        refs: Iterable[DatasetIdRef],
    ) -> Mapping[str, DatastoreRecordData]:
        """Export datastore records and locations to an in-memory data
        structure.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetIdRef` ]
            Datasets to save. This may include datasets not known to this
            datastore, which should be ignored.

        Returns
        -------
        data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
            Exported datastore records indexed by datastore name.
        """
        raise NotImplementedError()

    def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
        """Specify a method that can be used by datastore to retrieve
        registry-defined dataset type.

        Parameters
        ----------
        method : `~collections.abc.Callable` | `None`
            Method that takes a name of the dataset type and returns a
            corresponding `DatasetType` instance as defined in Registry. If
            the dataset type name is not known to registry, `None` is
            returned.

        Notes
        -----
        This method is only needed for a Datastore supporting a "trusted" mode
        when it does not have access to datastore records and needs to
        guess dataset location based on its stored dataset type.
        """
        pass