Coverage for python/lsst/daf/butler/core/datastore.py: 51%
213 statements
coverage.py v7.2.7, created at 2023-07-12 10:56 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Support for generic data stores."""
24from __future__ import annotations
26__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs")
28import contextlib
29import dataclasses
30import logging
31from abc import ABCMeta, abstractmethod
32from collections import abc, defaultdict
33from collections.abc import Callable, Iterable, Iterator, Mapping
34from typing import TYPE_CHECKING, Any, ClassVar
36from lsst.utils import doImportType
38from .config import Config, ConfigSubset
39from .constraints import Constraints
40from .exceptions import DatasetTypeNotSupportedError, ValidationError
41from .fileDataset import FileDataset
42from .storageClass import StorageClassFactory
44if TYPE_CHECKING:
45 from lsst.resources import ResourcePath, ResourcePathExpression
47 from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
48 from .configSupport import LookupKey
49 from .datasets import DatasetRef, DatasetType
50 from .datastoreRecordData import DatastoreRecordData
51 from .storageClass import StorageClass
54class DatastoreConfig(ConfigSubset):
55 """Configuration for Datastores."""
57 component = "datastore"
58 requiredKeys = ("cls",)
59 defaultConfigFile = "datastore.yaml"
62class DatastoreValidationError(ValidationError):
63 """There is a problem with the Datastore configuration."""
65 pass
68@dataclasses.dataclass(frozen=True)
69class Event:
70 """Representation of an event that can be rolled back."""
72 __slots__ = {"name", "undoFunc", "args", "kwargs"}
73 name: str
74 undoFunc: Callable
75 args: tuple
76 kwargs: dict
79class IngestPrepData:
80 """A helper base class for `Datastore` ingest implementations.
82 Datastore implementations will generally need a custom implementation of
83 this class.
85 Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
86 import.
88 Parameters
89 ----------
90 refs : iterable of `DatasetRef`
91 References for the datasets that can be ingested by this datastore.
92 """
94 def __init__(self, refs: Iterable[DatasetRef]):
95 self.refs = {ref.id: ref for ref in refs}
98class DatastoreTransaction:
99 """Keeps a log of `Datastore` activity and allow rollback.
101 Parameters
102 ----------
103 parent : `DatastoreTransaction`, optional
104 The parent transaction, if any.
105 """
107 Event: ClassVar[type] = Event
109 parent: DatastoreTransaction | None
110 """The parent transaction. (`DatastoreTransaction`, optional)"""
112 def __init__(self, parent: DatastoreTransaction | None = None):
113 self.parent = parent
114 self._log: list[Event] = []
116 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
117 """Register event with undo function.
119 Parameters
120 ----------
121 name : `str`
122 Name of the event.
123 undoFunc : func
124 Function to undo this event.
125 args : `tuple`
126 Positional arguments to `undoFunc`.
127 **kwargs
128 Keyword arguments to `undoFunc`.
129 """
130 self._log.append(self.Event(name, undoFunc, args, kwargs))
132 @contextlib.contextmanager
133 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
134 """Register undo function if nested operation succeeds.
136 Calls `registerUndo`.
138 This can be used to wrap individual undo-able statements within a
139 DatastoreTransaction block. Multiple statements that can fail
140 separately should not be part of the same `undoWith` block.
142 All arguments are forwarded directly to `registerUndo`.
143 """
144 try:
145 yield None
146 except BaseException:
147 raise
148 else:
149 self.registerUndo(name, undoFunc, *args, **kwargs)
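A minimal usage sketch for ``registerUndo``/``undoWith`` (not part of the original module); the file-writing helper and the use of ``os.remove`` as the undo action are illustrative assumptions.

import os

def write_file_with_undo(txn: DatastoreTransaction, path: str, data: bytes) -> None:
    # If the write succeeds, register os.remove(path) so a later
    # txn.rollback() deletes the file; if it fails, nothing is registered.
    with txn.undoWith(f"write {path}", os.remove, path):
        with open(path, "wb") as stream:
            stream.write(data)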
151 def rollback(self) -> None:
152 """Roll back all events in this transaction."""
153 log = logging.getLogger(__name__)
154 while self._log:
155 ev = self._log.pop()
156 try:
157 log.debug(
158 "Rolling back transaction: %s: %s(%s,%s)",
159 ev.name,
160 ev.undoFunc,
161 ",".join(str(a) for a in ev.args),
162 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()),
163 )
164 except Exception:
165 # In case we had a problem in stringification of arguments
166 log.warning("Rolling back transaction: %s", ev.name)
167 try:
168 ev.undoFunc(*ev.args, **ev.kwargs)
169 except BaseException as e:
170 # Deliberately swallow error that may occur in unrolling
171 log.warning("Exception: %s caught while unrolling: %s", e, ev.name)
172 pass
174 def commit(self) -> None:
175 """Commit this transaction."""
176 if self.parent is None:
177 # Just forget about the events, they have already happened.
178 return
179 else:
180 # We may still want to roll back events from this transaction as
181 # part of the parent.
182 self.parent._log.extend(self._log)
185@dataclasses.dataclass
186class DatasetRefURIs(abc.Sequence):
187 """Represents the primary and component ResourcePath(s) associated with a
188 DatasetRef.
190 This is used in places where its members used to be represented as a tuple
191 `(primaryURI, componentURIs)`. To maintain backward compatibility this
192 inherits from Sequence and so instances can be treated as a two-item
193 tuple.
194 """
196 def __init__(
197 self,
198 primaryURI: ResourcePath | None = None,
199 componentURIs: dict[str, ResourcePath] | None = None,
200 ):
201 self.primaryURI = primaryURI
202 """The URI to the primary artifact associated with this dataset. If the
203 dataset was disassembled within the datastore this may be `None`.
204 """
206 self.componentURIs = componentURIs or {}
207 """The URIs to any components associated with the dataset artifact
208 indexed by component name. This can be empty if there are no
209 components.
210 """
212 def __getitem__(self, index: Any) -> Any:
213 """Get primaryURI and componentURIs by index.
215 Provides support for tuple-like access.
216 """
217 if index == 0:
218 return self.primaryURI
219 elif index == 1:
220 return self.componentURIs
221 raise IndexError("list index out of range")
223 def __len__(self) -> int:
224 """Get the number of data members.
226 Provides support for tuple-like access.
227 """
228 return 2
230 def __repr__(self) -> str:
231 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
234class Datastore(metaclass=ABCMeta):
235 """Datastore interface.
237 Parameters
238 ----------
239 config : `DatastoreConfig` or `str`
240 Load configuration either from an existing config instance or by
241 referring to a configuration file.
242 bridgeManager : `DatastoreRegistryBridgeManager`
243 Object that manages the interface between `Registry` and datastores.
244 butlerRoot : `str`, optional
245 New datastore root to use to override the configuration value.
246 """
248 defaultConfigFile: ClassVar[str | None] = None
249 """Path to configuration defaults. Accessed within the ``config`` resource
250 or relative to a search path. Can be None if no defaults specified.
251 """
253 containerKey: ClassVar[str | None] = None
254 """Name of the key containing a list of subconfigurations that also
255 need to be merged with defaults and will likely use different Python
256 datastore classes (but all using DatastoreConfig). Assumed to be a
257 list of configurations that can be represented in a DatastoreConfig
258 and containing a "cls" definition. None indicates that no containers
259 are expected in this Datastore."""
261 isEphemeral: bool = False
262 """Indicate whether this Datastore is ephemeral or not. An ephemeral
263 datastore is one where the contents of the datastore will not exist
264 across process restarts. This value can change per-instance."""
266 config: DatastoreConfig
267 """Configuration used to create Datastore."""
269 name: str
270 """Label associated with this Datastore."""
272 storageClassFactory: StorageClassFactory
273 """Factory for creating storage class instances from name."""
275 constraints: Constraints
276 """Constraints to apply when putting datasets into the datastore."""
278 # MyPy does not like for this to be annotated as any kind of type, because
279 # it can't do static checking on type variables that can change at runtime.
280 IngestPrepData: ClassVar[Any] = IngestPrepData
281 """Helper base class for ingest implementations.
282 """
284 @classmethod
285 @abstractmethod
286 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
287 """Set filesystem-dependent config options for this datastore.
289 The options will be appropriate for a new empty repository with the
290 given root.
292 Parameters
293 ----------
294 root : `str`
295 Filesystem path to the root of the data repository.
296 config : `Config`
297 A `Config` to update. Only the subset understood by
298 this component will be updated. Will not expand
299 defaults.
300 full : `Config`
301 A complete config with all defaults expanded that can be
302 converted to a `DatastoreConfig`. Read-only and will not be
303 modified by this method.
304 Repository-specific options that should not be obtained
305 from defaults when Butler instances are constructed
306 should be copied from ``full`` to ``config``.
307 overwrite : `bool`, optional
308 If `False`, do not modify a value in ``config`` if the value
309 already exists. Default is always to overwrite with the provided
310 ``root``.
312 Notes
313 -----
314 If a keyword is explicitly defined in the supplied ``config`` it
315 will not be overridden by this method if ``overwrite`` is `False`.
316 This allows explicit values set in external configs to be retained.
317 """
318 raise NotImplementedError()
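A rough sketch of a concrete ``setConfigRoot`` implementation, assuming the datastore keeps its root under a hypothetical ``datastore.root`` key; a real datastore may also copy further repository-specific options from ``full`` into ``config``.

class ExampleDatastore(Datastore):  # remaining abstract methods omitted from this sketch
    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        # Hypothetical key layout; honour an explicit value when overwrite is False.
        key = ("datastore", "root")
        if overwrite or key not in config:
            config[key] = root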
320 @staticmethod
321 def fromConfig(
322 config: Config,
323 bridgeManager: DatastoreRegistryBridgeManager,
324 butlerRoot: ResourcePathExpression | None = None,
325 ) -> Datastore:
326 """Create datastore from type specified in config file.
328 Parameters
329 ----------
330 config : `Config` or `~lsst.resources.ResourcePathExpression`
331 Configuration instance.
332 bridgeManager : `DatastoreRegistryBridgeManager`
333 Object that manages the interface between `Registry` and
334 datastores.
335 butlerRoot : `str`, optional
336 Butler root directory.
337 """
338 cls = doImportType(config["datastore", "cls"])
339 if not issubclass(cls, Datastore):
340 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore")
341 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)
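A sketch of typical construction through ``fromConfig``; the configuration file name and the pre-built ``bridge_manager`` are assumptions made for the example.

config = Config("datastore.yaml")          # must define a "datastore.cls" entry
datastore = Datastore.fromConfig(config, bridgeManager=bridge_manager, butlerRoot="/path/to/repo")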
343 def __init__(
344 self,
345 config: Config | ResourcePathExpression,
346 bridgeManager: DatastoreRegistryBridgeManager,
347 butlerRoot: ResourcePathExpression | None = None,
348 ):
349 self.config = DatastoreConfig(config)
350 self.name = "ABCDataStore"
351 self._transaction: DatastoreTransaction | None = None
353 # All Datastores need storage classes and constraints
354 self.storageClassFactory = StorageClassFactory()
356 # And read the constraints list
357 constraintsConfig = self.config.get("constraints")
358 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)
360 def __str__(self) -> str:
361 return self.name
363 def __repr__(self) -> str:
364 return self.name
366 @property
367 def names(self) -> tuple[str, ...]:
368 """Names associated with this datastore returned as a list.
370 Can be different to ``name`` for a chaining datastore.
371 """
372 # Default implementation returns solely the name itself
373 return (self.name,)
375 @property
376 def roots(self) -> dict[str, ResourcePath | None]:
377 """Return the root URIs for each named datastore.
379 Mapping from datastore name to root URI. The URI can be `None`
380 if a datastore has no concept of a root URI.
381 (`dict` [`str`, `ResourcePath` | `None`])
382 """
383 return {self.name: None}
385 @contextlib.contextmanager
386 def transaction(self) -> Iterator[DatastoreTransaction]:
387 """Context manager supporting `Datastore` transactions.
389 Transactions can be nested, and are to be used in combination with
390 `Registry.transaction`.
391 """
392 self._transaction = DatastoreTransaction(self._transaction)
393 try:
394 yield self._transaction
395 except BaseException:
396 self._transaction.rollback()
397 raise
398 else:
399 self._transaction.commit()
400 self._transaction = self._transaction.parent
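A short sketch of the transaction usage described above; ``datastore``, the in-memory datasets and the refs are assumed to exist, and ``put`` is expected to register its own undo actions.

with datastore.transaction():
    datastore.put(dataset, ref)        # undone automatically if the block raises
    with datastore.transaction():      # nested: its undo log merges into the parent on commit
        datastore.put(other_dataset, other_ref)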
402 @abstractmethod
403 def knows(self, ref: DatasetRef) -> bool:
404 """Check if the dataset is known to the datastore.
406 Does not check for existence of any artifact.
408 Parameters
409 ----------
410 ref : `DatasetRef`
411 Reference to the required dataset.
413 Returns
414 -------
415 exists : `bool`
416 `True` if the dataset is known to the datastore.
417 """
418 raise NotImplementedError()
420 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
421 """Check which of the given datasets are known to this datastore.
423 This is like ``mexists()`` but does not check that the file exists.
425 Parameters
426 ----------
427 refs : iterable of `DatasetRef`
428 The datasets to check.
430 Returns
431 -------
432 exists : `dict`[`DatasetRef`, `bool`]
433 Mapping of dataset to boolean indicating whether the dataset
434 is known to the datastore.
435 """
436 # Non-optimized default calls knows() repeatedly.
437 return {ref: self.knows(ref) for ref in refs}
439 def mexists(
440 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
441 ) -> dict[DatasetRef, bool]:
442 """Check the existence of multiple datasets at once.
444 Parameters
445 ----------
446 refs : iterable of `DatasetRef`
447 The datasets to be checked.
448 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
449 Optional mapping of datastore artifact to existence. Updated by
450 this method with details of all artifacts tested. Can be `None`
451 if the caller is not interested.
453 Returns
454 -------
455 existence : `dict` of [`DatasetRef`, `bool`]
456 Mapping from dataset to boolean indicating existence.
457 """
458 existence: dict[DatasetRef, bool] = {}
459 # Non-optimized default.
460 for ref in refs:
461 existence[ref] = self.exists(ref)
462 return existence
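A sketch of a bulk existence check; ``datastore`` and ``refs`` are assumed, and the shared ``artifact_existence`` cache is optional.

from lsst.resources import ResourcePath

artifact_existence: dict[ResourcePath, bool] = {}
existence = datastore.mexists(refs, artifact_existence=artifact_existence)
missing = [ref for ref, found in existence.items() if not found]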
464 @abstractmethod
465 def exists(self, datasetRef: DatasetRef) -> bool:
466 """Check if the dataset exists in the datastore.
468 Parameters
469 ----------
470 datasetRef : `DatasetRef`
471 Reference to the required dataset.
473 Returns
474 -------
475 exists : `bool`
476 `True` if the entity exists in the `Datastore`.
477 """
478 raise NotImplementedError("Must be implemented by subclass")
480 @abstractmethod
481 def get(
482 self,
483 datasetRef: DatasetRef,
484 parameters: Mapping[str, Any] | None = None,
485 storageClass: StorageClass | str | None = None,
486 ) -> Any:
487 """Load an `InMemoryDataset` from the store.
489 Parameters
490 ----------
491 datasetRef : `DatasetRef`
492 Reference to the required Dataset.
493 parameters : `dict`
494 `StorageClass`-specific parameters that specify a slice of the
495 Dataset to be loaded.
496 storageClass : `StorageClass` or `str`, optional
497 The storage class to be used to override the Python type
498 returned by this method. By default the returned type matches
499 the dataset type definition for this dataset. Specifying a
500 read `StorageClass` can force a different type to be returned.
501 This type must be compatible with the original type.
503 Returns
504 -------
505 inMemoryDataset : `object`
506 Requested Dataset or slice thereof as an InMemoryDataset.
507 """
508 raise NotImplementedError("Must be implemented by subclass")
510 @abstractmethod
511 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
512 """Write a `InMemoryDataset` with a given `DatasetRef` to the store.
514 Parameters
515 ----------
516 inMemoryDataset : `object`
517 The Dataset to store.
518 datasetRef : `DatasetRef`
519 Reference to the associated Dataset.
520 """
521 raise NotImplementedError("Must be implemented by subclass")
523 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
524 """Allow ingest transfer mode to be defaulted based on datasets.
526 Parameters
527 ----------
528 datasets : `FileDataset`
529 Each positional argument is a struct containing information about
530 a file to be ingested, including its path (either absolute or
531 relative to the datastore root, if applicable), a complete
532 `DatasetRef` (with ``dataset_id not None``), and optionally a
533 formatter class or its fully-qualified string name. If a formatter
534 is not provided, this method should populate that attribute with
535 the formatter the datastore would use for `put`. Subclasses are
536 also permitted to modify the path attribute (typically to put it
537 in what the datastore considers its standard form).
538 transfer : `str`, optional
539 How (and whether) the dataset should be added to the datastore.
540 See `ingest` for details of transfer modes.
542 Returns
543 -------
544 newTransfer : `str`
545 Transfer mode to use. Will be identical to the supplied transfer
546 mode unless "auto" is used.
547 """
548 if transfer != "auto":
549 return transfer
550 raise RuntimeError(f"{transfer} is not allowed without specialization.")
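A sketch of how a subclass might resolve the "auto" mode; choosing "copy" as the natural default is an assumption for illustration.

class CopyingDatastore(Datastore):  # remaining abstract methods omitted from this sketch
    def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
        # Map the generic "auto" request onto this datastore's preferred mode.
        return "copy" if transfer == "auto" else transfer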
552 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
553 """Process datasets to identify which ones can be ingested.
555 Parameters
556 ----------
557 datasets : `FileDataset`
558 Each positional argument is a struct containing information about
559 a file to be ingested, including its path (either absolute or
560 relative to the datastore root, if applicable), a complete
561 `DatasetRef` (with ``dataset_id not None``), and optionally a
562 formatter class or its fully-qualified string name. If a formatter
563 is not provided, this method should populate that attribute with
564 the formatter the datastore would use for `put`. Subclasses are
565 also permitted to modify the path attribute (typically to put it
566 in what the datastore considers its standard form).
567 transfer : `str`, optional
568 How (and whether) the dataset should be added to the datastore.
569 See `ingest` for details of transfer modes.
571 Returns
572 -------
573 data : `IngestPrepData`
574 An instance of a subclass of `IngestPrepData`, used to pass
575 arbitrary data from `_prepIngest` to `_finishIngest`. This should
576 include only the datasets this datastore can actually ingest;
577 others should be silently ignored (`Datastore.ingest` will inspect
578 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
579 necessary).
581 Raises
582 ------
583 NotImplementedError
584 Raised if the datastore does not support the given transfer mode
585 (including the case where ingest is not supported at all).
586 FileNotFoundError
587 Raised if one of the given files does not exist.
588 FileExistsError
589 Raised if transfer is not `None` but the (internal) location the
590 file would be moved to is already occupied.
592 Notes
593 -----
594 This method (along with `_finishIngest`) should be implemented by
595 subclasses to provide ingest support instead of implementing `ingest`
596 directly.
598 `_prepIngest` should not modify the data repository or given files in
599 any way; all changes should be deferred to `_finishIngest`.
601 When possible, exceptions should be raised in `_prepIngest` instead of
602 `_finishIngest`. `NotImplementedError` exceptions that indicate that
603 the transfer mode is not supported must be raised by `_prepIngest`
604 instead of `_finishIngest`.
605 """
606 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
608 def _finishIngest(
609 self, prepData: IngestPrepData, *, transfer: str | None = None, record_validation_info: bool = True
610 ) -> None:
611 """Complete an ingest operation.
613 Parameters
614 ----------
615 data : `IngestPrepData`
616 An instance of a subclass of `IngestPrepData`. Guaranteed to be
617 the direct result of a call to `_prepIngest` on this datastore.
618 transfer : `str`, optional
619 How (and whether) the dataset should be added to the datastore.
620 See `ingest` for details of transfer modes.
621 record_validation_info : `bool`, optional
622 If `True`, the default, the datastore can record validation
623 information associated with the file. If `False` the datastore
624 will not attempt to track any information such as checksums
625 or file sizes. This can be useful if such information is tracked
626 in an external system or if the file is to be compressed in place.
627 It is up to the datastore whether this parameter is relevant.
629 Raises
630 ------
631 FileNotFoundError
632 Raised if one of the given files does not exist.
633 FileExistsError
634 Raised if transfer is not `None` but the (internal) location the
635 file would be moved to is already occupied.
637 Notes
638 -----
639 This method (along with `_prepIngest`) should be implemented by
640 subclasses to provide ingest support instead of implementing `ingest`
641 directly.
642 """
643 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
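A condensed sketch of the two-phase ingest split described above: ``_prepIngest`` only filters and validates, while ``_finishIngest`` does the work. The use of ``Constraints.isAcceptable`` and the ``_transfer_file`` helper are illustrative assumptions.

class SketchIngestDatastore(Datastore):  # remaining abstract methods omitted from this sketch
    def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
        if transfer not in (None, "copy", "move"):
            raise NotImplementedError(f"Transfer mode {transfer} is not supported.")
        acceptable = []
        for dataset in datasets:
            # Silently drop refs this datastore is not configured to accept.
            acceptable.extend(ref for ref in dataset.refs if self.constraints.isAcceptable(ref))
        return self.IngestPrepData(acceptable)

    def _finishIngest(
        self, prepData: IngestPrepData, *, transfer: str | None = None, record_validation_info: bool = True
    ) -> None:
        for ref in prepData.refs.values():
            self._transfer_file(ref, transfer)  # hypothetical helper performing the actual I/O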
645 def ingest(
646 self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True
647 ) -> None:
648 """Ingest one or more files into the datastore.
650 Parameters
651 ----------
652 datasets : `FileDataset`
653 Each positional argument is a struct containing information about
654 a file to be ingested, including its path (either absolute or
655 relative to the datastore root, if applicable), a complete
656 `DatasetRef` (with ``dataset_id not None``), and optionally a
657 formatter class or its fully-qualified string name. If a formatter
658 is not provided, the one the datastore would use for ``put`` on
659 that dataset is assumed.
660 transfer : `str`, optional
661 How (and whether) the dataset should be added to the datastore.
662 If `None` (default), the file must already be in a location
663 appropriate for the datastore (e.g. within its root directory),
664 and will not be modified. Other choices include "move", "copy",
665 "link", "symlink", "relsymlink", and "hardlink". "link" is a
666 special transfer mode that will first try to make a hardlink and
667 if that fails a symlink will be used instead. "relsymlink" creates
668 a relative symlink rather than use an absolute path.
669 Most datastores do not support all transfer modes.
670 "auto" is a special option that will let the
671 data store choose the most natural option for itself.
672 record_validation_info : `bool`, optional
673 If `True`, the default, the datastore can record validation
674 information associated with the file. If `False` the datastore
675 will not attempt to track any information such as checksums
676 or file sizes. This can be useful if such information is tracked
677 in an external system or if the file is to be compressed in place.
678 It is up to the datastore whether this parameter is relevant.
680 Raises
681 ------
682 NotImplementedError
683 Raised if the datastore does not support the given transfer mode
684 (including the case where ingest is not supported at all).
685 DatasetTypeNotSupportedError
686 Raised if one or more files to be ingested have a dataset type that
687 is not supported by the datastore.
688 FileNotFoundError
689 Raised if one of the given files does not exist.
690 FileExistsError
691 Raised if transfer is not `None` but the (internal) location the
692 file would be moved to is already occupied.
694 Notes
695 -----
696 Subclasses should implement `_prepIngest` and `_finishIngest` instead
697 of implementing `ingest` directly. Datastores that hold and
698 delegate to child datastores may want to call those methods as well.
700 Subclasses are encouraged to document their supported transfer modes
701 in their class documentation.
702 """
703 # Allow a datastore to select a default transfer mode
704 transfer = self._overrideTransferMode(*datasets, transfer=transfer)
705 prepData = self._prepIngest(*datasets, transfer=transfer)
706 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
707 if refs.keys() != prepData.refs.keys():
708 unsupported = refs.keys() - prepData.refs.keys()
709 # Group unsupported refs by DatasetType for an informative
710 # but still concise error message.
711 byDatasetType = defaultdict(list)
712 for datasetId in unsupported:
713 ref = refs[datasetId]
714 byDatasetType[ref.datasetType].append(ref)
715 raise DatasetTypeNotSupportedError(
716 "DatasetType(s) not supported in ingest: "
717 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
718 )
719 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info)
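A caller-side sketch of ``ingest``; the external file path and the already-resolved ``ref`` are assumptions.

dataset = FileDataset(path="external/image.fits", refs=ref)   # ref must already be resolved
datastore.ingest(dataset, transfer="copy")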
721 def transfer_from(
722 self,
723 source_datastore: Datastore,
724 refs: Iterable[DatasetRef],
725 transfer: str = "auto",
726 artifact_existence: dict[ResourcePath, bool] | None = None,
727 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
728 """Transfer dataset artifacts from another datastore to this one.
730 Parameters
731 ----------
732 source_datastore : `Datastore`
733 The datastore from which to transfer artifacts. That datastore
734 must be compatible with this datastore receiving the artifacts.
735 refs : iterable of `DatasetRef`
736 The datasets to transfer from the source datastore.
737 transfer : `str`, optional
738 How (and whether) the dataset should be added to the datastore.
739 Choices include "move", "copy",
740 "link", "symlink", "relsymlink", and "hardlink". "link" is a
741 special transfer mode that will first try to make a hardlink and
742 if that fails a symlink will be used instead. "relsymlink" creates
743 a relative symlink rather than use an absolute path.
744 Most datastores do not support all transfer modes.
745 "auto" (the default) is a special option that will let the
746 data store choose the most natural option for itself.
747 If the source location and transfer location are identical the
748 transfer mode will be ignored.
749 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
750 Optional mapping of datastore artifact to existence. Updated by
751 this method with details of all artifacts tested. Can be `None`
752 if the caller is not interested.
754 Returns
755 -------
756 accepted : `set` [`DatasetRef`]
757 The datasets that were transferred.
758 rejected : `set` [`DatasetRef`]
759 The datasets that were rejected due to a constraints violation.
761 Raises
762 ------
763 TypeError
764 Raised if the two datastores are not compatible.
765 """
766 if type(self) is not type(source_datastore):
767 raise TypeError(
768 f"Datastore mismatch between this datastore ({type(self)}) and the "
769 f"source datastore ({type(source_datastore)})."
770 )
772 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.")
774 def getManyURIs(
775 self,
776 refs: Iterable[DatasetRef],
777 predict: bool = False,
778 allow_missing: bool = False,
779 ) -> dict[DatasetRef, DatasetRefURIs]:
780 """Return URIs associated with many datasets.
782 Parameters
783 ----------
784 refs : iterable of `DatasetRef`
785 References to the required datasets.
786 predict : `bool`, optional
787 If `True`, allow URIs to be returned of datasets that have not
788 been written.
789 allow_missing : `bool`
790 If `False`, and ``predict`` is `False`, will raise if a
791 `DatasetRef` does not exist.
793 Returns
794 -------
795 URIs : `dict` [`DatasetRef`, `DatasetRefURIs`]
796 A dict of primary and component URIs, indexed by the passed-in
797 refs.
799 Raises
800 ------
801 FileNotFoundError
802 A URI has been requested for a dataset that does not exist and
803 guessing is not allowed.
805 Notes
806 -----
807 In file-based datastores, `getManyURIs` does not check that the files
808 are really there; it assumes that if the datastore is aware of a file
809 then the file exists.
810 """
811 uris: dict[DatasetRef, DatasetRefURIs] = {}
812 missing_refs = []
813 for ref in refs:
814 try:
815 uris[ref] = self.getURIs(ref, predict=predict)
816 except FileNotFoundError:
817 missing_refs.append(ref)
818 if missing_refs and not allow_missing:
819 raise FileNotFoundError(
820 "Missing {} refs from datastore out of {} and predict=False.".format(
821 num_missing := len(missing_refs), num_missing + len(uris)
822 )
823 )
824 return uris
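A sketch of the bulk URI lookup; ``datastore`` and ``refs`` are assumed, and ``predict=True`` permits URIs for datasets not yet written.

uris = datastore.getManyURIs(refs, predict=True, allow_missing=True)
for ref, ref_uris in uris.items():
    if ref_uris.primaryURI is not None:
        print(ref, ref_uris.primaryURI.geturl())
    for component, uri in ref_uris.componentURIs.items():
        print(ref, component, uri.geturl())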
826 @abstractmethod
827 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
828 """Return URIs associated with dataset.
830 Parameters
831 ----------
832 datasetRef : `DatasetRef`
833 Reference to the required dataset.
834 predict : `bool`, optional
835 If the datastore does not know about the dataset, should it
836 return a predicted URI or not?
838 Returns
839 -------
840 uris : `DatasetRefURIs`
841 The URI to the primary artifact associated with this dataset (if
842 the dataset was disassembled within the datastore this may be
843 `None`), and the URIs to any components associated with the dataset
844 artifact. (can be empty if there are no components).
845 """
846 raise NotImplementedError()
848 @abstractmethod
849 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
850 """URI to the Dataset.
852 Parameters
853 ----------
854 datasetRef : `DatasetRef`
855 Reference to the required Dataset.
856 predict : `bool`
857 If `True` attempt to predict the URI for a dataset if it does
858 not exist in datastore.
860 Returns
861 -------
862 uri : `lsst.resources.ResourcePath`
863 URI pointing to the Dataset within the datastore. If the
864 Dataset does not exist in the datastore, the URI may be a guess.
865 If the datastore does not have entities that relate well
866 to the concept of a URI the returned URI string will be
867 descriptive. The returned URI is not guaranteed to be obtainable.
869 Raises
870 ------
871 FileNotFoundError
872 A URI has been requested for a dataset that does not exist and
873 guessing is not allowed.
874 """
875 raise NotImplementedError("Must be implemented by subclass")
877 @abstractmethod
878 def retrieveArtifacts(
879 self,
880 refs: Iterable[DatasetRef],
881 destination: ResourcePath,
882 transfer: str = "auto",
883 preserve_path: bool = True,
884 overwrite: bool = False,
885 ) -> list[ResourcePath]:
886 """Retrieve the artifacts associated with the supplied refs.
888 Parameters
889 ----------
890 refs : iterable of `DatasetRef`
891 The datasets for which artifacts are to be retrieved.
892 A single ref can result in multiple artifacts. The refs must
893 be resolved.
894 destination : `lsst.resources.ResourcePath`
895 Location to write the artifacts.
896 transfer : `str`, optional
897 Method to use to transfer the artifacts. Must be one of the options
898 supported by `lsst.resources.ResourcePath.transfer_from()`.
899 "move" is not allowed.
900 preserve_path : `bool`, optional
901 If `True` the full path of the artifact within the datastore
902 is preserved. If `False` the final file component of the path
903 is used.
904 overwrite : `bool`, optional
905 If `True` allow transfers to overwrite existing files at the
906 destination.
908 Returns
909 -------
910 targets : `list` of `lsst.resources.ResourcePath`
911 URIs of file artifacts in destination location. Order is not
912 preserved.
914 Notes
915 -----
916 For non-file datastores the artifacts written to the destination
917 may not match the representation inside the datastore. For example
918 a hierarchical data structure in a NoSQL database may well be stored
919 as a JSON file.
920 """
921 raise NotImplementedError()
923 @abstractmethod
924 def remove(self, datasetRef: DatasetRef) -> None:
925 """Indicate to the Datastore that a Dataset can be removed.
927 Parameters
928 ----------
929 datasetRef : `DatasetRef`
930 Reference to the required Dataset.
932 Raises
933 ------
934 FileNotFoundError
935 When Dataset does not exist.
937 Notes
938 -----
939 Some Datastores may implement this method as a silent no-op to
940 disable Dataset deletion through standard interfaces.
941 """
942 raise NotImplementedError("Must be implemented by subclass")
944 @abstractmethod
945 def forget(self, refs: Iterable[DatasetRef]) -> None:
946 """Indicate to the Datastore that it should remove all records of the
947 given datasets, without actually deleting them.
949 Parameters
950 ----------
951 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
952 References to the datasets being forgotten.
954 Notes
955 -----
956 Asking a datastore to forget a `DatasetRef` it does not hold should be
957 a silent no-op, not an error.
958 """
959 raise NotImplementedError("Must be implemented by subclass")
961 @abstractmethod
962 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
963 """Indicate to the Datastore that a Dataset can be moved to the trash.
965 Parameters
966 ----------
967 ref : `DatasetRef` or iterable thereof
968 Reference(s) to the required Dataset.
969 ignore_errors : `bool`, optional
970 Determine whether errors should be ignored. When multiple
971 refs are being trashed there will be no per-ref check.
973 Raises
974 ------
975 FileNotFoundError
976 When Dataset does not exist and errors are not ignored. Only
977 checked if a single ref is supplied (and not in a list).
979 Notes
980 -----
981 Some Datastores may implement this method as a silent no-op to
982 disable Dataset deletion through standard interfaces.
983 """
984 raise NotImplementedError("Must be implemented by subclass")
986 @abstractmethod
987 def emptyTrash(self, ignore_errors: bool = True) -> None:
988 """Remove all datasets from the trash.
990 Parameters
991 ----------
992 ignore_errors : `bool`, optional
993 Determine whether errors should be ignored.
995 Notes
996 -----
997 Some Datastores may implement this method as a silent no-op to
998 disable Dataset deletion through standard interfaces.
999 """
1000 raise NotImplementedError("Must be implemented by subclass")
1002 @abstractmethod
1003 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
1004 """Transfer a dataset from another datastore to this datastore.
1006 Parameters
1007 ----------
1008 inputDatastore : `Datastore`
1009 The external `Datastore` from which to retrieve the Dataset.
1010 datasetRef : `DatasetRef`
1011 Reference to the required Dataset.
1012 """
1013 raise NotImplementedError("Must be implemented by subclass")
1015 def export(
1016 self,
1017 refs: Iterable[DatasetRef],
1018 *,
1019 directory: ResourcePathExpression | None = None,
1020 transfer: str | None = "auto",
1021 ) -> Iterable[FileDataset]:
1022 """Export datasets for transfer to another data repository.
1024 Parameters
1025 ----------
1026 refs : iterable of `DatasetRef`
1027 Dataset references to be exported.
1028 directory : `str`, optional
1029 Path to a directory that should contain files corresponding to
1030 output datasets. Ignored if ``transfer`` is explicitly `None`.
1031 transfer : `str`, optional
1032 Mode that should be used to move datasets out of the repository.
1033 Valid options are the same as those of the ``transfer`` argument
1034 to ``ingest``, and datastores may similarly signal that a transfer
1035 mode is not supported by raising `NotImplementedError`. If "auto"
1036 is given and no ``directory`` is specified, `None` will be
1037 implied.
1039 Returns
1040 -------
1041 datasets : iterable of `FileDataset`
1042 Structs containing information about the exported datasets, in the
1043 same order as ``refs``.
1045 Raises
1046 ------
1047 NotImplementedError
1048 Raised if the given transfer mode is not supported.
1049 """
1050 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
1052 @abstractmethod
1053 def validateConfiguration(
1054 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
1055 ) -> None:
1056 """Validate some of the configuration for this datastore.
1058 Parameters
1059 ----------
1060 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1061 Entities to test against this configuration. Can be differing
1062 types.
1063 logFailures : `bool`, optional
1064 If `True`, output a log message for every validation error
1065 detected.
1067 Raises
1068 ------
1069 DatastoreValidationError
1070 Raised if there is a validation problem with a configuration.
1072 Notes
1073 -----
1074 Which parts of the configuration are validated is at the discretion
1075 of each Datastore implementation.
1076 """
1077 raise NotImplementedError("Must be implemented by subclass")
1079 @abstractmethod
1080 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
1081 """Validate a specific look up key with supplied entity.
1083 Parameters
1084 ----------
1085 lookupKey : `LookupKey`
1086 Key to use to retrieve information from the datastore
1087 configuration.
1088 entity : `DatasetRef`, `DatasetType`, or `StorageClass`
1089 Entity to compare with configuration retrieved using the
1090 specified lookup key.
1092 Raises
1093 ------
1094 DatastoreValidationError
1095 Raised if there is a problem with the combination of entity
1096 and lookup key.
1098 Notes
1099 -----
1100 Bypasses the normal selection priorities by allowing a key that
1101 would normally not be selected to be validated.
1102 """
1103 raise NotImplementedError("Must be implemented by subclass")
1105 @abstractmethod
1106 def getLookupKeys(self) -> set[LookupKey]:
1107 """Return all the lookup keys relevant to this datastore.
1109 Returns
1110 -------
1111 keys : `set` of `LookupKey`
1112 The keys stored internally for looking up information based
1113 on `DatasetType` name or `StorageClass`.
1114 """
1115 raise NotImplementedError("Must be implemented by subclass")
1117 def needs_expanded_data_ids(
1118 self,
1119 transfer: str | None,
1120 entity: DatasetRef | DatasetType | StorageClass | None = None,
1121 ) -> bool:
1122 """Test whether this datastore needs expanded data IDs to ingest.
1124 Parameters
1125 ----------
1126 transfer : `str` or `None`
1127 Transfer mode for ingest.
1128 entity : `DatasetRef`, `DatasetType`, or `StorageClass`, optional
1129 Object representing what will be ingested. If not provided (or not
1130 specific enough), `True` may be returned even if expanded data
1131 IDs aren't necessary.
1133 Returns
1134 -------
1135 needed : `bool`
1136 If `True`, expanded data IDs may be needed. `False` only if
1137 expansion definitely isn't necessary.
1138 """
1139 return True
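A sketch of a subclass whose artifact layout depends only on dataset IDs and therefore never needs expanded data IDs; the class is hypothetical.

class IdKeyedDatastore(Datastore):  # remaining abstract methods omitted from this sketch
    def needs_expanded_data_ids(
        self,
        transfer: str | None,
        entity: DatasetRef | DatasetType | StorageClass | None = None,
    ) -> bool:
        # File templates here use only dataset IDs, so dimension records
        # attached to the data ID are never required for ingest.
        return False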
1141 @abstractmethod
1142 def import_records(
1143 self,
1144 data: Mapping[str, DatastoreRecordData],
1145 ) -> None:
1146 """Import datastore location and record data from an in-memory data
1147 structure.
1149 Parameters
1150 ----------
1151 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
1152 Datastore records indexed by datastore name. May contain data for
1153 other `Datastore` instances (generally because they are chained to
1154 this one), which should be ignored.
1156 Notes
1157 -----
1158 Implementations should generally not check that any external resources
1159 (e.g. files) referred to by these records actually exist, for
1160 performance reasons; we expect higher-level code to guarantee that they
1161 do.
1163 Implementations are responsible for calling
1164 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations``
1165 where the key is in `names`, as well as loading any opaque table data.
1166 """
1167 raise NotImplementedError()
1169 @abstractmethod
1170 def export_records(
1171 self,
1172 refs: Iterable[DatasetIdRef],
1173 ) -> Mapping[str, DatastoreRecordData]:
1174 """Export datastore records and locations to an in-memory data
1175 structure.
1177 Parameters
1178 ----------
1179 refs : `~collections.abc.Iterable` [ `DatasetIdRef` ]
1180 Datasets to save. This may include datasets not known to this
1181 datastore, which should be ignored.
1183 Returns
1184 -------
1185 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ]
1186 Exported datastore records indexed by datastore name.
1187 """
1188 raise NotImplementedError()
1190 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
1191 """Specify a method that can be used by datastore to retrieve
1192 registry-defined dataset type.
1194 Parameters
1195 ----------
1196 method : `~collections.abc.Callable` | `None`
1197 Method that takes a name of the dataset type and returns a
1198 corresponding `DatasetType` instance as defined in Registry. If
199 the dataset type name is not known to the registry, `None` is returned.
1201 Notes
1202 -----
1203 This method is only needed for a Datastore supporting a "trusted" mode
204 when it does not have access to datastore records and needs to
1205 guess dataset location based on its stored dataset type.
1206 """
1207 pass
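A sketch of wiring a registry lookup into a trusted-mode datastore; the ``registry`` object, its ``getDatasetType`` method, and the error handling are assumptions made for the example.

def retrieve_dataset_type(name: str) -> DatasetType | None:
    try:
        return registry.getDatasetType(name)   # assumed Registry lookup
    except KeyError:
        return None

datastore.set_retrieve_dataset_type_method(retrieve_dataset_type)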