Coverage for python/lsst/daf/butler/core/datastore.py: 42%
244 statements
coverage.py v7.2.5, created at 2023-05-02 18:18 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Support for generic data stores."""
24from __future__ import annotations
26__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs")
28import contextlib
29import dataclasses
30import logging
31from abc import ABCMeta, abstractmethod
32from collections import abc, defaultdict
33from typing import (
34 TYPE_CHECKING,
35 Any,
36 Callable,
37 ClassVar,
38 Dict,
39 Iterable,
40 Iterator,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.utils import doImportType
52from .config import Config, ConfigSubset
53from .constraints import Constraints
54from .exceptions import DatasetTypeNotSupportedError, ValidationError
55from .fileDataset import FileDataset
56from .storageClass import StorageClassFactory
58if TYPE_CHECKING:
59 from lsst.resources import ResourcePath, ResourcePathExpression
61 from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
62 from .configSupport import LookupKey
63 from .datasets import DatasetRef, DatasetType
64 from .datastoreRecordData import DatastoreRecordData
65 from .storageClass import StorageClass
68class DatastoreConfig(ConfigSubset):
69 """Configuration for Datastores."""
71 component = "datastore"
72 requiredKeys = ("cls",)
73 defaultConfigFile = "datastore.yaml"
76class DatastoreValidationError(ValidationError):
77 """There is a problem with the Datastore configuration."""
79 pass
82@dataclasses.dataclass(frozen=True)
83class Event:
84 __slots__ = {"name", "undoFunc", "args", "kwargs"}
85 name: str
86 undoFunc: Callable
87 args: tuple
88 kwargs: dict
91class IngestPrepData:
92 """A helper base class for `Datastore` ingest implementations.
94 Datastore implementations will generally need a custom implementation of
95 this class.
97 Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
98 import.
100 Parameters
101 ----------
102 refs : iterable of `DatasetRef`
103 References for the datasets that can be ingested by this datastore.
104 """
106 def __init__(self, refs: Iterable[DatasetRef]):
107 self.refs = {ref.id: ref for ref in refs}
110class DatastoreTransaction:
111 """Keeps a log of `Datastore` activity and allows rollback.
113 Parameters
114 ----------
115 parent : `DatastoreTransaction`, optional
116 The parent transaction (if any)
117 """
119 Event: ClassVar[Type] = Event
121 parent: Optional[DatastoreTransaction]
122 """The parent transaction. (`DatastoreTransaction`, optional)"""
124 def __init__(self, parent: Optional[DatastoreTransaction] = None):
125 self.parent = parent
126 self._log: List[Event] = []
128 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
129 """Register an event with an undo function.
131 Parameters
132 ----------
133 name : `str`
134 Name of the event.
135 undoFunc : `Callable`
136 Function to undo this event.
137 args : `tuple`
138 Positional arguments to `undoFunc`.
139 **kwargs
140 Keyword arguments to `undoFunc`.
141 """
142 self._log.append(self.Event(name, undoFunc, args, kwargs))
144 @contextlib.contextmanager
145 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
146 """Register undo function if nested operation succeeds.
148 Calls `registerUndo`.
150 This can be used to wrap individual undo-able statements within a
151 DatastoreTransaction block. Multiple statements that can fail
152 separately should not be part of the same `undoWith` block.
154 All arguments are forwarded directly to `registerUndo`.
155 """
156 try:
157 yield None
158 except BaseException:
159 raise
160 else:
161 self.registerUndo(name, undoFunc, *args, **kwargs)
163 def rollback(self) -> None:
164 """Roll back all events in this transaction."""
165 log = logging.getLogger(__name__)
166 while self._log:
167 ev = self._log.pop()
168 try:
169 log.debug(
170 "Rolling back transaction: %s: %s(%s,%s)",
171 ev.name,
172 ev.undoFunc,
173 ",".join(str(a) for a in ev.args),
174 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()),
175 )
176 except Exception:
177 # In case we had a problem in stringification of arguments
178 log.warning("Rolling back transaction: %s", ev.name)
179 try:
180 ev.undoFunc(*ev.args, **ev.kwargs)
181 except BaseException as e:
182 # Deliberately swallow error that may occur in unrolling
183 log.warning("Exception: %s caught while unrolling: %s", e, ev.name)
184 pass
186 def commit(self) -> None:
187 """Commit this transaction."""
188 if self.parent is None:
189 # Just forget about the events, they have already happened.
190 return
191 else:
192 # We may still want to propagate the events from this transaction
193 # to the parent.
194 self.parent._log.extend(self._log)
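# Illustrative sketch (not part of the original module): how a caller might
# combine ``registerUndo`` and ``undoWith`` so that a failure rolls back any
# completed steps. ``write_file`` and ``delete_file`` are hypothetical
# callables supplied by the caller.
def _sketch_transaction_usage(write_file: Callable[[str], None], delete_file: Callable[[str], None]) -> None:
    transaction = DatastoreTransaction()
    try:
        # The undo action is registered only if the wrapped statement succeeds.
        with transaction.undoWith("write a", delete_file, "a.fits"):
            write_file("a.fits")
        # An undo action can also be registered explicitly after the fact.
        write_file("b.fits")
        transaction.registerUndo("write b", delete_file, "b.fits")
    except BaseException:
        # Undo functions run in reverse order; exceptions raised while
        # unrolling are logged and swallowed by ``rollback``.
        transaction.rollback()
        raise
    else:
        # With no parent transaction ``commit`` simply discards the log;
        # with a parent it hands the log over to the parent instead.
        transaction.commit()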
197@dataclasses.dataclass
198class DatasetRefURIs(abc.Sequence):
199 """Represents the primary and component ResourcePath(s) associated with a
200 DatasetRef.
202 This is used in places where its members used to be represented as a tuple
203 `(primaryURI, componentURIs)`. To maintain backward compatibility this
204 inherits from Sequence and so instances can be treated as a two-item
205 tuple.
206 """
208 def __init__(
209 self,
210 primaryURI: Optional[ResourcePath] = None,
211 componentURIs: Optional[Dict[str, ResourcePath]] = None,
212 ):
213 self.primaryURI = primaryURI
214 """The URI to the primary artifact associated with this dataset. If the
215 dataset was disassembled within the datastore this may be `None`.
216 """
218 self.componentURIs = componentURIs or {}
219 """The URIs to any components associated with the dataset artifact
220 indexed by component name. This can be empty if there are no
221 components.
222 """
224 def __getitem__(self, index: Any) -> Any:
225 """Get primaryURI and componentURIs by index.
227 Provides support for tuple-like access.
228 """
229 if index == 0:
230 return self.primaryURI
231 elif index == 1:
232 return self.componentURIs
233 raise IndexError("list index out of range")
235 def __len__(self) -> int:
236 """Get the number of data members.
238 Provides support for tuple-like access.
239 """
240 return 2
242 def __repr__(self) -> str:
243 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
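# Illustrative sketch (not part of the original module): because
# ``DatasetRefURIs`` inherits from ``Sequence``, callers written against the
# old ``(primaryURI, componentURIs)`` tuple keep working alongside the named
# attributes.
def _sketch_dataset_ref_uris_access(uris: DatasetRefURIs) -> None:
    # Old-style tuple unpacking still works via __getitem__/__len__.
    primary, components = uris
    assert primary is uris.primaryURI
    assert components is uris.componentURIs
    assert len(uris) == 2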
246class Datastore(metaclass=ABCMeta):
247 """Datastore interface.
249 Parameters
250 ----------
251 config : `DatastoreConfig` or `str`
252 Load configuration either from an existing config instance or by
253 referring to a configuration file.
254 bridgeManager : `DatastoreRegistryBridgeManager`
255 Object that manages the interface between `Registry` and datastores.
256 butlerRoot : `str`, optional
257 New datastore root to use to override the configuration value.
258 """
260 defaultConfigFile: ClassVar[Optional[str]] = None
261 """Path to configuration defaults. Accessed within the ``config`` resource
262 or relative to a search path. Can be None if no defaults specified.
263 """
265 containerKey: ClassVar[Optional[str]] = None
266 """Name of the key containing a list of subconfigurations that also
267 need to be merged with defaults and will likely use different Python
268 datastore classes (but all using DatastoreConfig). Assumed to be a
269 list of configurations that can be represented in a DatastoreConfig
270 and containing a "cls" definition. None indicates that no containers
271 are expected in this Datastore."""
273 isEphemeral: bool = False
274 """Indicate whether this Datastore is ephemeral or not. An ephemeral
275 datastore is one where the contents of the datastore will not exist
276 across process restarts. This value can change per-instance."""
278 config: DatastoreConfig
279 """Configuration used to create Datastore."""
281 name: str
282 """Label associated with this Datastore."""
284 storageClassFactory: StorageClassFactory
285 """Factory for creating storage class instances from name."""
287 constraints: Constraints
288 """Constraints to apply when putting datasets into the datastore."""
290 # MyPy does not like for this to be annotated as any kind of type, because
291 # it can't do static checking on type variables that can change at runtime.
292 IngestPrepData: ClassVar[Any] = IngestPrepData
293 """Helper base class for ingest implementations.
294 """
296 @classmethod
297 @abstractmethod
298 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
299 """Set filesystem-dependent config options for this datastore.
301 The options will be appropriate for a new empty repository with the
302 given root.
304 Parameters
305 ----------
306 root : `str`
307 Filesystem path to the root of the data repository.
308 config : `Config`
309 A `Config` to update. Only the subset understood by
310 this component will be updated. Will not expand
311 defaults.
312 full : `Config`
313 A complete config with all defaults expanded that can be
314 converted to a `DatastoreConfig`. Read-only and will not be
315 modified by this method.
316 Repository-specific options that should not be obtained
317 from defaults when Butler instances are constructed
318 should be copied from ``full`` to ``config``.
319 overwrite : `bool`, optional
320 If `False`, do not modify a value in ``config`` if the value
321 already exists. Default is always to overwrite with the provided
322 ``root``.
324 Notes
325 -----
326 If a keyword is explicitly defined in the supplied ``config`` it
327 will not be overridden by this method if ``overwrite`` is `False`.
328 This allows explicit values set in external configs to be retained.
329 """
330 raise NotImplementedError()
332 @staticmethod
333 def fromConfig(
334 config: Config,
335 bridgeManager: DatastoreRegistryBridgeManager,
336 butlerRoot: Optional[ResourcePathExpression] = None,
337 ) -> "Datastore":
338 """Create datastore from type specified in config file.
340 Parameters
341 ----------
342 config : `Config`
343 Configuration instance.
344 bridgeManager : `DatastoreRegistryBridgeManager`
345 Object that manages the interface between `Registry` and
346 datastores.
347 butlerRoot : `str`, optional
348 Butler root directory.
349 """
350 cls = doImportType(config["datastore", "cls"])
351 if not issubclass(cls, Datastore):
352 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore")
353 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)
355 def __init__(
356 self,
357 config: Union[Config, str],
358 bridgeManager: DatastoreRegistryBridgeManager,
359 butlerRoot: Optional[ResourcePathExpression] = None,
360 ):
361 self.config = DatastoreConfig(config)
362 self.name = "ABCDataStore"
363 self._transaction: Optional[DatastoreTransaction] = None
365 # All Datastores need storage classes and constraints
366 self.storageClassFactory = StorageClassFactory()
368 # And read the constraints list
369 constraintsConfig = self.config.get("constraints")
370 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)
372 def __str__(self) -> str:
373 return self.name
375 def __repr__(self) -> str:
376 return self.name
378 @property
379 def names(self) -> Tuple[str, ...]:
380 """Names associated with this datastore returned as a tuple.
382 Can be different to ``name`` for a chaining datastore.
383 """
384 # Default implementation returns solely the name itself
385 return (self.name,)
387 @contextlib.contextmanager
388 def transaction(self) -> Iterator[DatastoreTransaction]:
389 """Context manager supporting `Datastore` transactions.
391 Transactions can be nested, and are to be used in combination with
392 `Registry.transaction`.
393 """
394 self._transaction = DatastoreTransaction(self._transaction)
395 try:
396 yield self._transaction
397 except BaseException:
398 self._transaction.rollback()
399 raise
400 else:
401 self._transaction.commit()
402 self._transaction = self._transaction.parent
404 @abstractmethod
405 def knows(self, ref: DatasetRef) -> bool:
406 """Check if the dataset is known to the datastore.
408 Does not check for existence of any artifact.
410 Parameters
411 ----------
412 ref : `DatasetRef`
413 Reference to the required dataset.
415 Returns
416 -------
417 exists : `bool`
418 `True` if the dataset is known to the datastore.
419 """
420 raise NotImplementedError()
422 def mexists(
423 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
424 ) -> Dict[DatasetRef, bool]:
425 """Check the existence of multiple datasets at once.
427 Parameters
428 ----------
429 refs : iterable of `DatasetRef`
430 The datasets to be checked.
431 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
432 Optional mapping of datastore artifact to existence. Updated by
433 this method with details of all artifacts tested. Can be `None`
434 if the caller is not interested.
436 Returns
437 -------
438 existence : `dict` of [`DatasetRef`, `bool`]
439 Mapping from dataset to boolean indicating existence.
440 """
441 existence: Dict[DatasetRef, bool] = {}
442 # Non-optimized default.
443 for ref in refs:
444 existence[ref] = self.exists(ref)
445 return existence
447 @abstractmethod
448 def exists(self, datasetRef: DatasetRef) -> bool:
449 """Check if the dataset exists in the datastore.
451 Parameters
452 ----------
453 datasetRef : `DatasetRef`
454 Reference to the required dataset.
456 Returns
457 -------
458 exists : `bool`
459 `True` if the entity exists in the `Datastore`.
460 """
461 raise NotImplementedError("Must be implemented by subclass")
463 @abstractmethod
464 def get(self, datasetRef: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
465 """Load an `InMemoryDataset` from the store.
467 Parameters
468 ----------
469 datasetRef : `DatasetRef`
470 Reference to the required Dataset.
471 parameters : `dict`
472 `StorageClass`-specific parameters that specify a slice of the
473 Dataset to be loaded.
475 Returns
476 -------
477 inMemoryDataset : `object`
478 Requested Dataset or slice thereof as an InMemoryDataset.
479 """
480 raise NotImplementedError("Must be implemented by subclass")
482 @abstractmethod
483 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
484 """Write an `InMemoryDataset` with a given `DatasetRef` to the store.
486 Parameters
487 ----------
488 inMemoryDataset : `object`
489 The Dataset to store.
490 datasetRef : `DatasetRef`
491 Reference to the associated Dataset.
492 """
493 raise NotImplementedError("Must be implemented by subclass")
495 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
496 """Allow ingest transfer mode to be defaulted based on datasets.
498 Parameters
499 ----------
500 datasets : `FileDataset`
501 Each positional argument is a struct containing information about
502 a file to be ingested, including its path (either absolute or
503 relative to the datastore root, if applicable), a complete
504 `DatasetRef` (with ``dataset_id`` not `None`), and optionally a
505 formatter class or its fully-qualified string name. If a formatter
506 is not provided, this method should populate that attribute with
507 the formatter the datastore would use for `put`. Subclasses are
508 also permitted to modify the path attribute (typically to put it
509 in what the datastore considers its standard form).
510 transfer : `str`, optional
511 How (and whether) the dataset should be added to the datastore.
512 See `ingest` for details of transfer modes.
514 Returns
515 -------
516 newTransfer : `str`
517 Transfer mode to use. Will be identical to the supplied transfer
518 mode unless "auto" is used.
519 """
520 if transfer != "auto":
521 return transfer
522 raise RuntimeError(f"{transfer} is not allowed without specialization.")
524 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
525 """Process datasets to identify which ones can be ingested.
527 Parameters
528 ----------
529 datasets : `FileDataset`
530 Each positional argument is a struct containing information about
531 a file to be ingested, including its path (either absolute or
532 relative to the datastore root, if applicable), a complete
533 `DatasetRef` (with ``dataset_id`` not `None`), and optionally a
534 formatter class or its fully-qualified string name. If a formatter
535 is not provided, this method should populate that attribute with
536 the formatter the datastore would use for `put`. Subclasses are
537 also permitted to modify the path attribute (typically to put it
538 in what the datastore considers its standard form).
539 transfer : `str`, optional
540 How (and whether) the dataset should be added to the datastore.
541 See `ingest` for details of transfer modes.
543 Returns
544 -------
545 data : `IngestPrepData`
546 An instance of a subclass of `IngestPrepData`, used to pass
547 arbitrary data from `_prepIngest` to `_finishIngest`. This should
548 include only the datasets this datastore can actually ingest;
549 others should be silently ignored (`Datastore.ingest` will inspect
550 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
551 necessary).
553 Raises
554 ------
555 NotImplementedError
556 Raised if the datastore does not support the given transfer mode
557 (including the case where ingest is not supported at all).
558 FileNotFoundError
559 Raised if one of the given files does not exist.
560 FileExistsError
561 Raised if transfer is not `None` but the (internal) location the
562 file would be moved to is already occupied.
564 Notes
565 -----
566 This method (along with `_finishIngest`) should be implemented by
567 subclasses to provide ingest support instead of implementing `ingest`
568 directly.
570 `_prepIngest` should not modify the data repository or given files in
571 any way; all changes should be deferred to `_finishIngest`.
573 When possible, exceptions should be raised in `_prepIngest` instead of
574 `_finishIngest`. `NotImplementedError` exceptions that indicate that
575 the transfer mode is not supported must be raised by `_prepIngest`
576 instead of `_finishIngest`.
577 """
578 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
580 def _finishIngest(
581 self, prepData: IngestPrepData, *, transfer: Optional[str] = None, record_validation_info: bool = True
582 ) -> None:
583 """Complete an ingest operation.
585 Parameters
586 ----------
587 prepData : `IngestPrepData`
588 An instance of a subclass of `IngestPrepData`. Guaranteed to be
589 the direct result of a call to `_prepIngest` on this datastore.
590 transfer : `str`, optional
591 How (and whether) the dataset should be added to the datastore.
592 See `ingest` for details of transfer modes.
593 record_validation_info : `bool`, optional
594 If `True`, the default, the datastore can record validation
595 information associated with the file. If `False` the datastore
596 will not attempt to track any information such as checksums
597 or file sizes. This can be useful if such information is tracked
598 in an external system or if the file is to be compressed in place.
599 It is up to the datastore whether this parameter is relevant.
601 Raises
602 ------
603 FileNotFoundError
604 Raised if one of the given files does not exist.
605 FileExistsError
606 Raised if transfer is not `None` but the (internal) location the
607 file would be moved to is already occupied.
609 Notes
610 -----
611 This method (along with `_prepIngest`) should be implemented by
612 subclasses to provide ingest support instead of implementing `ingest`
613 directly.
614 """
615 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
617 def ingest(
618 self, *datasets: FileDataset, transfer: Optional[str] = None, record_validation_info: bool = True
619 ) -> None:
620 """Ingest one or more files into the datastore.
622 Parameters
623 ----------
624 datasets : `FileDataset`
625 Each positional argument is a struct containing information about
626 a file to be ingested, including its path (either absolute or
627 relative to the datastore root, if applicable), a complete
628 `DatasetRef` (with ``dataset_id`` not `None`), and optionally a
629 formatter class or its fully-qualified string name. If a formatter
630 is not provided, the one the datastore would use for ``put`` on
631 that dataset is assumed.
632 transfer : `str`, optional
633 How (and whether) the dataset should be added to the datastore.
634 If `None` (default), the file must already be in a location
635 appropriate for the datastore (e.g. within its root directory),
636 and will not be modified. Other choices include "move", "copy",
637 "link", "symlink", "relsymlink", and "hardlink". "link" is a
638 special transfer mode that will first try to make a hardlink and
639 if that fails a symlink will be used instead. "relsymlink" creates
640 a relative symlink rather than use an absolute path.
641 Most datastores do not support all transfer modes.
642 "auto" is a special option that will let the
643 data store choose the most natural option for itself.
644 record_validation_info : `bool`, optional
645 If `True`, the default, the datastore can record validation
646 information associated with the file. If `False` the datastore
647 will not attempt to track any information such as checksums
648 or file sizes. This can be useful if such information is tracked
649 in an external system or if the file is to be compressed in place.
650 It is up to the datastore whether this parameter is relevant.
652 Raises
653 ------
654 NotImplementedError
655 Raised if the datastore does not support the given transfer mode
656 (including the case where ingest is not supported at all).
657 DatasetTypeNotSupportedError
658 Raised if one or more files to be ingested have a dataset type that
659 is not supported by the datastore.
660 FileNotFoundError
661 Raised if one of the given files does not exist.
662 FileExistsError
663 Raised if transfer is not `None` but the (internal) location the
664 file would be moved to is already occupied.
666 Notes
667 -----
668 Subclasses should implement `_prepIngest` and `_finishIngest` instead
669 of implementing `ingest` directly. Datastores that hold and
670 delegate to child datastores may want to call those methods as well.
672 Subclasses are encouraged to document their supported transfer modes
673 in their class documentation.
674 """
675 # Allow a datastore to select a default transfer mode
676 transfer = self._overrideTransferMode(*datasets, transfer=transfer)
677 prepData = self._prepIngest(*datasets, transfer=transfer)
678 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
679 if None in refs:
680 # Find the file for the error message. There may be multiple
681 # bad refs so look for all of them.
682 unresolved_paths = {}
683 for dataset in datasets:
684 unresolved = []
685 for ref in dataset.refs:
686 if ref.id is None:
687 unresolved.append(ref)
688 if unresolved:
689 unresolved_paths[dataset.path] = unresolved
690 raise RuntimeError(
691 "Attempt to ingest unresolved DatasetRef from: "
692 + ",".join(f"{p}: ({[str(r) for r in ref]})" for p, ref in unresolved_paths.items())
693 )
694 if refs.keys() != prepData.refs.keys():
695 unsupported = refs.keys() - prepData.refs.keys()
696 # Group unsupported refs by DatasetType for an informative
697 # but still concise error message.
698 byDatasetType = defaultdict(list)
699 for datasetId in unsupported:
700 ref = refs[datasetId]
701 byDatasetType[ref.datasetType].append(ref)
702 raise DatasetTypeNotSupportedError(
703 "DatasetType(s) not supported in ingest: "
704 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
705 )
706 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info)
708 def transfer_from(
709 self,
710 source_datastore: Datastore,
711 refs: Iterable[DatasetRef],
712 local_refs: Optional[Iterable[DatasetRef]] = None,
713 transfer: str = "auto",
714 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
715 ) -> None:
716 """Transfer dataset artifacts from another datastore to this one.
718 Parameters
719 ----------
720 source_datastore : `Datastore`
721 The datastore from which to transfer artifacts. That datastore
722 must be compatible with this datastore receiving the artifacts.
723 refs : iterable of `DatasetRef`
724 The datasets to transfer from the source datastore.
725 local_refs : iterable of `DatasetRef`, optional
726 The dataset refs associated with the registry associated with
727 this datastore. Can be `None` if the source and target datastore
728 are using UUIDs.
729 transfer : `str`, optional
730 How (and whether) the dataset should be added to the datastore.
731 Choices include "move", "copy",
732 "link", "symlink", "relsymlink", and "hardlink". "link" is a
733 special transfer mode that will first try to make a hardlink and
734 if that fails a symlink will be used instead. "relsymlink" creates
735 a relative symlink rather than use an absolute path.
736 Most datastores do not support all transfer modes.
737 "auto" (the default) is a special option that will let the
738 data store choose the most natural option for itself.
739 If the source location and transfer location are identical the
740 transfer mode will be ignored.
741 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
742 Optional mapping of datastore artifact to existence. Updated by
743 this method with details of all artifacts tested. Can be `None`
744 if the caller is not interested.
746 Raises
747 ------
748 TypeError
749 Raised if the two datastores are not compatible.
750 """
751 if type(self) is not type(source_datastore):
752 raise TypeError(
753 f"Datastore mismatch between this datastore ({type(self)}) and the "
754 f"source datastore ({type(source_datastore)})."
755 )
757 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.")
759 def getManyURIs(
760 self,
761 refs: Iterable[DatasetRef],
762 predict: bool = False,
763 allow_missing: bool = False,
764 ) -> Dict[DatasetRef, DatasetRefURIs]:
765 """Return URIs associated with many datasets.
767 Parameters
768 ----------
769 refs : iterable of `DatasetRef`
770 References to the required datasets.
771 predict : `bool`, optional
772 If the datastore does not know about a dataset, should it
773 return a predicted URI or not?
774 allow_missing : `bool`
775 If `False`, and `predict` is `False`, will raise if a `DatasetRef`
776 does not exist.
778 Returns
779 -------
780 URIs : `dict` of [`DatasetRef`, `DatasetRefURIs`]
781 A dict of primary and component URIs, indexed by the passed-in
782 refs.
784 Raises
785 ------
786 FileNotFoundError
787 A URI has been requested for a dataset that does not exist and
788 guessing is not allowed.
790 Notes
791 -----
792 In file-based datastores, `getManyURIs` does not check that the files
793 really exist; it assumes that if the datastore is aware of a file
794 then the file exists.
795 """
796 uris: Dict[DatasetRef, DatasetRefURIs] = {}
797 missing_refs = []
798 for ref in refs:
799 try:
800 uris[ref] = self.getURIs(ref, predict=predict)
801 except FileNotFoundError:
802 missing_refs.append(ref)
803 if missing_refs and not allow_missing:
804 raise FileNotFoundError(
805 "Missing {} refs from datastore out of {} and predict=False.".format(
806 num_missing := len(missing_refs), num_missing + len(uris)
807 )
808 )
809 return uris
811 @abstractmethod
812 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
813 """Return URIs associated with dataset.
815 Parameters
816 ----------
817 datasetRef : `DatasetRef`
818 Reference to the required dataset.
819 predict : `bool`, optional
820 If the datastore does not know about the dataset, should it
821 return a predicted URI or not?
823 Returns
824 -------
825 uris : `DatasetRefURIs`
826 The URI to the primary artifact associated with this dataset (if
827 the dataset was disassembled within the datastore this may be
828 `None`), and the URIs to any components associated with the dataset
829 artifact. (can be empty if there are no components).
830 """
831 raise NotImplementedError()
833 @abstractmethod
834 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
835 """URI to the Dataset.
837 Parameters
838 ----------
839 datasetRef : `DatasetRef`
840 Reference to the required Dataset.
841 predict : `bool`
842 If `True` attempt to predict the URI for a dataset if it does
843 not exist in datastore.
845 Returns
846 -------
847 uri : `lsst.resources.ResourcePath`
848 URI pointing to the Dataset within the datastore. If the
849 Dataset does not exist in the datastore, the URI may be a guess.
850 If the datastore does not have entities that relate well
851 to the concept of a URI the returned URI string will be
852 descriptive. The returned URI is not guaranteed to be obtainable.
854 Raises
855 ------
856 FileNotFoundError
857 A URI has been requested for a dataset that does not exist and
858 guessing is not allowed.
859 """
860 raise NotImplementedError("Must be implemented by subclass")
862 @abstractmethod
863 def retrieveArtifacts(
864 self,
865 refs: Iterable[DatasetRef],
866 destination: ResourcePath,
867 transfer: str = "auto",
868 preserve_path: bool = True,
869 overwrite: bool = False,
870 ) -> List[ResourcePath]:
871 """Retrieve the artifacts associated with the supplied refs.
873 Parameters
874 ----------
875 refs : iterable of `DatasetRef`
876 The datasets for which artifacts are to be retrieved.
877 A single ref can result in multiple artifacts. The refs must
878 be resolved.
879 destination : `lsst.resources.ResourcePath`
880 Location to write the artifacts.
881 transfer : `str`, optional
882 Method to use to transfer the artifacts. Must be one of the options
883 supported by `lsst.resources.ResourcePath.transfer_from()`.
884 "move" is not allowed.
885 preserve_path : `bool`, optional
886 If `True` the full path of the artifact within the datastore
887 is preserved. If `False` the final file component of the path
888 is used.
889 overwrite : `bool`, optional
890 If `True` allow transfers to overwrite existing files at the
891 destination.
893 Returns
894 -------
895 targets : `list` of `lsst.resources.ResourcePath`
896 URIs of file artifacts in destination location. Order is not
897 preserved.
899 Notes
900 -----
901 For non-file datastores the artifacts written to the destination
902 may not match the representation inside the datastore. For example
903 a hierarchical data structure in a NoSQL database may well be stored
904 as a JSON file.
905 """
906 raise NotImplementedError()
908 @abstractmethod
909 def remove(self, datasetRef: DatasetRef) -> None:
910 """Indicate to the Datastore that a Dataset can be removed.
912 Parameters
913 ----------
914 datasetRef : `DatasetRef`
915 Reference to the required Dataset.
917 Raises
918 ------
919 FileNotFoundError
920 When Dataset does not exist.
922 Notes
923 -----
924 Some Datastores may implement this method as a silent no-op to
925 disable Dataset deletion through standard interfaces.
926 """
927 raise NotImplementedError("Must be implemented by subclass")
929 @abstractmethod
930 def forget(self, refs: Iterable[DatasetRef]) -> None:
931 """Indicate to the Datastore that it should remove all records of the
932 given datasets, without actually deleting them.
934 Parameters
935 ----------
936 refs : `Iterable` [ `DatasetRef` ]
937 References to the datasets being forgotten.
939 Notes
940 -----
941 Asking a datastore to forget a `DatasetRef` it does not hold should be
942 a silent no-op, not an error.
943 """
944 raise NotImplementedError("Must be implemented by subclass")
946 @abstractmethod
947 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
948 """Indicate to the Datastore that a Dataset can be moved to the trash.
950 Parameters
951 ----------
952 ref : `DatasetRef` or iterable thereof
953 Reference(s) to the required Dataset.
954 ignore_errors : `bool`, optional
955 Determine whether errors should be ignored. When multiple
956 refs are being trashed there will be no per-ref check.
958 Raises
959 ------
960 FileNotFoundError
961 When Dataset does not exist and errors are not ignored. Only
962 checked if a single ref is supplied (and not in a list).
964 Notes
965 -----
966 Some Datastores may implement this method as a silent no-op to
967 disable Dataset deletion through standard interfaces.
968 """
969 raise NotImplementedError("Must be implemented by subclass")
971 @abstractmethod
972 def emptyTrash(self, ignore_errors: bool = True) -> None:
973 """Remove all datasets from the trash.
975 Parameters
976 ----------
977 ignore_errors : `bool`, optional
978 Determine whether errors should be ignored.
980 Notes
981 -----
982 Some Datastores may implement this method as a silent no-op to
983 disable Dataset deletion through standard interfaces.
984 """
985 raise NotImplementedError("Must be implemented by subclass")
987 @abstractmethod
988 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
989 """Transfer a dataset from another datastore to this datastore.
991 Parameters
992 ----------
993 inputDatastore : `Datastore`
994 The external `Datastore` from which to retrieve the Dataset.
995 datasetRef : `DatasetRef`
996 Reference to the required Dataset.
997 """
998 raise NotImplementedError("Must be implemented by subclass")
1000 def export(
1001 self, refs: Iterable[DatasetRef], *, directory: Optional[str] = None, transfer: Optional[str] = None
1002 ) -> Iterable[FileDataset]:
1003 """Export datasets for transfer to another data repository.
1005 Parameters
1006 ----------
1007 refs : iterable of `DatasetRef`
1008 Dataset references to be exported.
1009 directory : `str`, optional
1010 Path to a directory that should contain files corresponding to
1011 output datasets. Ignored if ``transfer`` is `None`.
1012 transfer : `str`, optional
1013 Mode that should be used to move datasets out of the repository.
1014 Valid options are the same as those of the ``transfer`` argument
1015 to ``ingest``, and datastores may similarly signal that a transfer
1016 mode is not supported by raising `NotImplementedError`.
1018 Returns
1019 -------
1020 datasets : iterable of `FileDataset`
1021 Structs containing information about the exported datasets, in the
1022 same order as ``refs``.
1024 Raises
1025 ------
1026 NotImplementedError
1027 Raised if the given transfer mode is not supported.
1028 """
1029 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
1031 @abstractmethod
1032 def validateConfiguration(
1033 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
1034 ) -> None:
1035 """Validate some of the configuration for this datastore.
1037 Parameters
1038 ----------
1039 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1040 Entities to test against this configuration. Can be differing
1041 types.
1042 logFailures : `bool`, optional
1043 If `True`, output a log message for every validation error
1044 detected.
1046 Raises
1047 ------
1048 DatastoreValidationError
1049 Raised if there is a validation problem with a configuration.
1051 Notes
1052 -----
1053 Which parts of the configuration are validated is at the discretion
1054 of each Datastore implementation.
1055 """
1056 raise NotImplementedError("Must be implemented by subclass")
1058 @abstractmethod
1059 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1060 """Validate a specific look up key with supplied entity.
1062 Parameters
1063 ----------
1064 lookupKey : `LookupKey`
1065 Key to use to retrieve information from the datastore
1066 configuration.
1067 entity : `DatasetRef`, `DatasetType`, or `StorageClass`
1068 Entity to compare with configuration retrieved using the
1069 specified lookup key.
1071 Raises
1072 ------
1073 DatastoreValidationError
1074 Raised if there is a problem with the combination of entity
1075 and lookup key.
1077 Notes
1078 -----
1079 Bypasses the normal selection priorities by allowing a key that
1080 would normally not be selected to be validated.
1081 """
1082 raise NotImplementedError("Must be implemented by subclass")
1084 @abstractmethod
1085 def getLookupKeys(self) -> Set[LookupKey]:
1086 """Return all the lookup keys relevant to this datastore.
1088 Returns
1089 -------
1090 keys : `set` of `LookupKey`
1091 The keys stored internally for looking up information based
1092 on `DatasetType` name or `StorageClass`.
1093 """
1094 raise NotImplementedError("Must be implemented by subclass")
1096 def needs_expanded_data_ids(
1097 self,
1098 transfer: Optional[str],
1099 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
1100 ) -> bool:
1101 """Test whether this datastore needs expanded data IDs to ingest.
1103 Parameters
1104 ----------
1105 transfer : `str` or `None`
1106 Transfer mode for ingest.
1107 entity : `DatasetRef`, `DatasetType`, or `StorageClass`, optional
1108 Object representing what will be ingested. If not provided (or not
1109 specific enough), `True` may be returned even if expanded data
1110 IDs aren't necessary.
1112 Returns
1113 -------
1114 needed : `bool`
1115 If `True`, expanded data IDs may be needed. `False` only if
1116 expansion definitely isn't necessary.
1117 """
1118 return True
1120 @abstractmethod
1121 def import_records(
1122 self,
1123 data: Mapping[str, DatastoreRecordData],
1124 ) -> None:
1125 """Import datastore location and record data from an in-memory data
1126 structure.
1128 Parameters
1129 ----------
1130 data : `Mapping` [ `str`, `DatastoreRecordData` ]
1131 Datastore records indexed by datastore name. May contain data for
1132 other `Datastore` instances (generally because they are chained to
1133 this one), which should be ignored.
1135 Notes
1136 -----
1137 Implementations should generally not check that any external resources
1138 (e.g. files) referred to by these records actually exist, for
1139 performance reasons; we expect higher-level code to guarantee that they
1140 do.
1142 Implementations are responsible for calling
1143 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations``
1144 where the key is in `names`, as well as loading any opaque table data.
1145 """
1146 raise NotImplementedError()
1148 @abstractmethod
1149 def export_records(
1150 self,
1151 refs: Iterable[DatasetIdRef],
1152 ) -> Mapping[str, DatastoreRecordData]:
1153 """Export datastore records and locations to an in-memory data
1154 structure.
1156 Parameters
1157 ----------
1158 refs : `Iterable` [ `DatasetIdRef` ]
1159 Datasets to save. This may include datasets not known to this
1160 datastore, which should be ignored.
1162 Returns
1163 -------
1164 data : `Mapping` [ `str`, `DatastoreRecordData` ]
1165 Exported datastore records indexed by datastore name.
1166 """
1167 raise NotImplementedError()
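# Illustrative sketch (not part of the original module): a minimal ingest-only
# subclass showing how ``_prepIngest`` and ``_finishIngest`` plug into
# ``Datastore.ingest``. Only the ingest hooks are sketched; a real datastore
# must also implement the remaining abstract methods, and the transfer modes,
# constraint checks, and record keeping shown here are simplified assumptions.
class _SketchIngestDatastore(Datastore):
    """Hypothetical ingest-only datastore sketch."""

    _supportedTransfers = frozenset({None, "copy", "move"})

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
        # Problems with the transfer mode must be raised here rather than in
        # _finishIngest, before anything has been modified.
        if transfer not in self._supportedTransfers:
            raise NotImplementedError(f"Transfer mode {transfer!r} is not supported by this sketch.")
        # Collect the refs this datastore is willing to accept; a real
        # implementation would typically consult ``self.constraints`` here.
        # ``Datastore.ingest`` raises DatasetTypeNotSupportedError for any
        # refs that are silently left out of the returned IngestPrepData.
        accepted = [ref for dataset in datasets for ref in dataset.refs]
        return self.IngestPrepData(accepted)

    def _finishIngest(
        self, prepData: IngestPrepData, *, transfer: Optional[str] = None, record_validation_info: bool = True
    ) -> None:
        # All repository modifications are deferred to this method.
        for ref in prepData.refs.values():
            # e.g. transfer the artifact and record its location; omitted here.
            pass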