Coverage for python/lsst/daf/butler/core/datastore.py: 42%
248 statements
coverage.py v6.5.0, created at 2023-03-04 02:04 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Support for generic data stores."""
24from __future__ import annotations
26__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs")
28import contextlib
29import dataclasses
30import logging
31from abc import ABCMeta, abstractmethod
32from collections import abc, defaultdict
33from typing import (
34 TYPE_CHECKING,
35 Any,
36 Callable,
37 ClassVar,
38 Dict,
39 Iterable,
40 Iterator,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.utils import doImportType
52from .config import Config, ConfigSubset
53from .constraints import Constraints
54from .exceptions import DatasetTypeNotSupportedError, ValidationError
55from .fileDataset import FileDataset
56from .storageClass import StorageClassFactory
58if TYPE_CHECKING:
59 from lsst.resources import ResourcePath, ResourcePathExpression
61 from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
62 from .configSupport import LookupKey
63 from .datasets import DatasetRef, DatasetType
64 from .datastoreRecordData import DatastoreRecordData
65 from .storageClass import StorageClass
68class DatastoreConfig(ConfigSubset):
69 """Configuration for Datastores."""
71 component = "datastore"
72 requiredKeys = ("cls",)
73 defaultConfigFile = "datastore.yaml"
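# A minimal sketch of the configuration shape ``DatastoreConfig`` expects: a
# "datastore" component containing at least the required "cls" key naming the
# Datastore implementation to load.  The FileDatastore path below is a real
# daf_butler class; the extra "root" entry is purely illustrative.
_EXAMPLE_DATASTORE_CONFIG = {
    "datastore": {
        "cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore",
        "root": "<butlerRoot>",
    }
}
# ``DatastoreConfig(_EXAMPLE_DATASTORE_CONFIG)`` would merge these values with
# the defaults read from ``datastore.yaml``.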
76class DatastoreValidationError(ValidationError):
77 """There is a problem with the Datastore configuration."""
79 pass
82@dataclasses.dataclass(frozen=True)
83class Event:
84 __slots__ = {"name", "undoFunc", "args", "kwargs"}
85 name: str
86 undoFunc: Callable
87 args: tuple
88 kwargs: dict
91class IngestPrepData:
92 """A helper base class for `Datastore` ingest implementations.
94 Datastore implementations will generally need a custom implementation of
95 this class.
97 Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
98 import.
100 Parameters
101 ----------
102 refs : iterable of `DatasetRef`
103 References for the datasets that can be ingested by this datastore.
104 """
106 def __init__(self, refs: Iterable[DatasetRef]):
107 self.refs = {ref.id: ref for ref in refs}
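# A sketch of how a concrete datastore might subclass ``IngestPrepData`` to
# carry extra per-dataset state from ``_prepIngest`` to ``_finishIngest``.
# The ``paths`` attribute is hypothetical and not part of the base class.
class _ExampleIngestPrepData(IngestPrepData):
    def __init__(self, refs: Iterable[DatasetRef], paths: Dict[Any, str]):
        super().__init__(refs)
        # Source paths to transfer in ``_finishIngest``, keyed by dataset ID.
        self.paths = paths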
110class DatastoreTransaction:
111 """Keeps a log of `Datastore` activity and allow rollback.
113 Parameters
114 ----------
115 parent : `DatastoreTransaction`, optional
116 The parent transaction (if any)
117 """
119 Event: ClassVar[Type] = Event
121 parent: Optional[DatastoreTransaction]
122 """The parent transaction. (`DatastoreTransaction`, optional)"""
124 def __init__(self, parent: Optional[DatastoreTransaction] = None):
125 self.parent = parent
126 self._log: List[Event] = []
128 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
129 """Register event with undo function.
131 Parameters
132 ----------
133 name : `str`
134 Name of the event.
135 undoFunc : func
136 Function to undo this event.
137 args : `tuple`
138 Positional arguments to `undoFunc`.
139 **kwargs
140 Keyword arguments to `undoFunc`.
141 """
142 self._log.append(self.Event(name, undoFunc, args, kwargs))
144 @contextlib.contextmanager
145 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
146 """Register undo function if nested operation succeeds.
148 Calls `registerUndo`.
150 This can be used to wrap individual undo-able statements within a
151 DatastoreTransaction block. Multiple statements that can fail
152 separately should not be part of the same `undoWith` block.
154 All arguments are forwarded directly to `registerUndo`.
155 """
156 try:
157 yield None
158 except BaseException:
159 raise
160 else:
161 self.registerUndo(name, undoFunc, *args, **kwargs)
163 def rollback(self) -> None:
164 """Roll back all events in this transaction."""
165 log = logging.getLogger(__name__)
166 while self._log:
167 ev = self._log.pop()
168 try:
169 log.debug(
170 "Rolling back transaction: %s: %s(%s,%s)",
171 ev.name,
172 ev.undoFunc,
173 ",".join(str(a) for a in ev.args),
174 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()),
175 )
176 except Exception:
177 # In case we had a problem in stringification of arguments
178 log.warning("Rolling back transaction: %s", ev.name)
179 try:
180 ev.undoFunc(*ev.args, **ev.kwargs)
181 except BaseException as e:
182 # Deliberately swallow error that may occur in unrolling
183 log.warning("Exception: %s caught while unrolling: %s", e, ev.name)
184 pass
186 def commit(self) -> None:
187 """Commit this transaction."""
188 if self.parent is None:
189 # Just forget about the events, they have already happened.
190 return
191 else:
192 # We may still want to be able to undo events from this transaction
193 # as part of the parent.
194 self.parent._log.extend(self._log)
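def _example_transaction_usage() -> None:
    # Illustration only: shows how ``undoWith`` records an undo action only
    # when the wrapped statement succeeds, and how committing a nested
    # transaction hands its undo log to the parent.
    written: List[str] = []
    parent = DatastoreTransaction()
    child = DatastoreTransaction(parent)
    with child.undoWith("append", written.remove, "artifact"):
        written.append("artifact")
    # Committing the child does not discard the undo action; it becomes part
    # of the parent transaction's log.
    child.commit()
    # Rolling back the parent runs the registered undo functions in reverse
    # order, so the append above is undone.
    parent.rollback()
    assert written == []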
197@dataclasses.dataclass
198class DatasetRefURIs(abc.Sequence):
199 """Represents the primary and component ResourcePath(s) associated with a
200 DatasetRef.
202 This is used in places where its members used to be represented as a tuple
203 `(primaryURI, componentURIs)`. To maintain backward compatibility this
204 inherits from Sequence and so instances can be treated as a two-item
205 tuple.
206 """
208 def __init__(
209 self,
210 primaryURI: Optional[ResourcePath] = None,
211 componentURIs: Optional[Dict[str, ResourcePath]] = None,
212 ):
213 self.primaryURI = primaryURI
214 """The URI to the primary artifact associated with this dataset. If the
215 dataset was disassembled within the datastore this may be `None`.
216 """
218 self.componentURIs = componentURIs or {}
219 """The URIs to any components associated with the dataset artifact
220 indexed by component name. This can be empty if there are no
221 components.
222 """
224 def __getitem__(self, index: Any) -> Any:
225 """Get primaryURI and componentURIs by index.
227 Provides support for tuple-like access.
228 """
229 if index == 0:
230 return self.primaryURI
231 elif index == 1:
232 return self.componentURIs
233 raise IndexError("list index out of range")
235 def __len__(self) -> int:
236 """Get the number of data members.
238 Provides support for tuple-like access.
239 """
240 return 2
242 def __repr__(self) -> str:
243 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
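def _example_dataset_ref_uris() -> None:
    # Illustration only: ``DatasetRefURIs`` still unpacks like the two-item
    # tuple it replaced, while also exposing named attributes.
    uris = DatasetRefURIs(primaryURI=None, componentURIs={})
    primary, components = uris
    assert primary is uris.primaryURI
    assert components == uris.componentURIs and len(uris) == 2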
246class Datastore(metaclass=ABCMeta):
247 """Datastore interface.
249 Parameters
250 ----------
251 config : `DatastoreConfig` or `str`
252 Load configuration either from an existing config instance or by
253 referring to a configuration file.
254 bridgeManager : `DatastoreRegistryBridgeManager`
255 Object that manages the interface between `Registry` and datastores.
256 butlerRoot : `str`, optional
257 New datastore root to use to override the configuration value.
258 """
260 defaultConfigFile: ClassVar[Optional[str]] = None
261 """Path to configuration defaults. Accessed within the ``config`` resource
262 or relative to a search path. Can be None if no defaults specified.
263 """
265 containerKey: ClassVar[Optional[str]] = None
266 """Name of the key containing a list of subconfigurations that also
267 need to be merged with defaults and will likely use different Python
268 datastore classes (but all using DatastoreConfig). Assumed to be a
269 list of configurations that can be represented in a DatastoreConfig
270 and containing a "cls" definition. None indicates that no containers
271 are expected in this Datastore."""
273 isEphemeral: bool = False
274 """Indicate whether this Datastore is ephemeral or not. An ephemeral
275 datastore is one where the contents of the datastore will not exist
276 across process restarts. This value can change per-instance."""
278 config: DatastoreConfig
279 """Configuration used to create Datastore."""
281 name: str
282 """Label associated with this Datastore."""
284 storageClassFactory: StorageClassFactory
285 """Factory for creating storage class instances from name."""
287 constraints: Constraints
288 """Constraints to apply when putting datasets into the datastore."""
290 # MyPy does not like for this to be annotated as any kind of type, because
291 # it can't do static checking on type variables that can change at runtime.
292 IngestPrepData: ClassVar[Any] = IngestPrepData
293 """Helper base class for ingest implementations.
294 """
296 @classmethod
297 @abstractmethod
298 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
299 """Set filesystem-dependent config options for this datastore.
301 The options will be appropriate for a new empty repository with the
302 given root.
304 Parameters
305 ----------
306 root : `str`
307 Filesystem path to the root of the data repository.
308 config : `Config`
309 A `Config` to update. Only the subset understood by
310 this component will be updated. Will not expand
311 defaults.
312 full : `Config`
313 A complete config with all defaults expanded that can be
314 converted to a `DatastoreConfig`. Read-only and will not be
315 modified by this method.
316 Repository-specific options that should not be obtained
317 from defaults when Butler instances are constructed
318 should be copied from ``full`` to ``config``.
319 overwrite : `bool`, optional
320 If `False`, do not modify a value in ``config`` if the value
321 already exists. Default is always to overwrite with the provided
322 ``root``.
324 Notes
325 -----
326 If a keyword is explicitly defined in the supplied ``config`` it
327 will not be overridden by this method if ``overwrite`` is `False`.
328 This allows explicit values set in external configs to be retained.
329 """
330 raise NotImplementedError()
332 @staticmethod
333 def fromConfig(
334 config: Config,
335 bridgeManager: DatastoreRegistryBridgeManager,
336 butlerRoot: Optional[ResourcePathExpression] = None,
337 ) -> "Datastore":
338 """Create datastore from type specified in config file.
340 Parameters
341 ----------
342 config : `Config`
343 Configuration instance.
344 bridgeManager : `DatastoreRegistryBridgeManager`
345 Object that manages the interface between `Registry` and
346 datastores.
347 butlerRoot : `str`, optional
348 Butler root directory.
349 """
350 cls = doImportType(config["datastore", "cls"])
351 if not issubclass(cls, Datastore):
352 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore")
353 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot)
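    # A sketch of typical ``fromConfig`` usage when assembling a butler by
    # hand; ``butler_config`` and ``bridge_manager`` are assumed to exist and
    # the config must carry a ``datastore.cls`` entry:
    #
    #     datastore = Datastore.fromConfig(
    #         butler_config, bridgeManager=bridge_manager, butlerRoot=root
    #     )
    #
    # The imported class is checked to be a ``Datastore`` subclass before it
    # is instantiated.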
355 def __init__(
356 self,
357 config: Union[Config, str],
358 bridgeManager: DatastoreRegistryBridgeManager,
359 butlerRoot: Optional[ResourcePathExpression] = None,
360 ):
361 self.config = DatastoreConfig(config)
362 self.name = "ABCDataStore"
363 self._transaction: Optional[DatastoreTransaction] = None
365 # All Datastores need storage classes and constraints
366 self.storageClassFactory = StorageClassFactory()
368 # And read the constraints list
369 constraintsConfig = self.config.get("constraints")
370 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe)
372 def __str__(self) -> str:
373 return self.name
375 def __repr__(self) -> str:
376 return self.name
378 @property
379 def names(self) -> Tuple[str, ...]:
380 """Names associated with this datastore returned as a list.
382 Can be different to ``name`` for a chaining datastore.
383 """
384 # Default implementation returns solely the name itself
385 return (self.name,)
387 @contextlib.contextmanager
388 def transaction(self) -> Iterator[DatastoreTransaction]:
389 """Context manager supporting `Datastore` transactions.
391 Transactions can be nested, and are to be used in combination with
392 `Registry.transaction`.
393 """
394 self._transaction = DatastoreTransaction(self._transaction)
395 try:
396 yield self._transaction
397 except BaseException:
398 self._transaction.rollback()
399 raise
400 else:
401 self._transaction.commit()
402 self._transaction = self._transaction.parent
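    # A sketch of how nested ``transaction`` blocks are typically combined
    # with ``Registry.transaction`` so that datastore writes are rolled back
    # if the surrounding registry operation fails (``registry``, ``datastore``
    # and ``ref`` are assumed to exist):
    #
    #     with registry.transaction():
    #         with datastore.transaction():
    #             datastore.put(inMemoryDataset, ref)
    #             # Any exception raised here rolls back the put as well.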
404 @abstractmethod
405 def knows(self, ref: DatasetRef) -> bool:
406 """Check if the dataset is known to the datastore.
408 Does not check for existence of any artifact.
410 Parameters
411 ----------
412 ref : `DatasetRef`
413 Reference to the required dataset.
415 Returns
416 -------
417 exists : `bool`
418 `True` if the dataset is known to the datastore.
419 """
420 raise NotImplementedError()
422 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
423 """Check which of the given datasets are known to this datastore.
425 This is like ``mexists()`` but does not check that the file exists.
427 Parameters
428 ----------
429 refs : iterable `DatasetRef`
430 The datasets to check.
432 Returns
433 -------
434 exists : `dict`[`DatasetRef`, `bool`]
435 Mapping of dataset to boolean indicating whether the dataset
436 is known to the datastore.
437 """
438 # Non-optimized default calls knows() repeatedly.
439 return {ref: self.knows(ref) for ref in refs}
441 def mexists(
442 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
443 ) -> Dict[DatasetRef, bool]:
444 """Check the existence of multiple datasets at once.
446 Parameters
447 ----------
448 refs : iterable of `DatasetRef`
449 The datasets to be checked.
450 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
451 Optional mapping of datastore artifact to existence. Updated by
452 this method with details of all artifacts tested. Can be `None`
453 if the caller is not interested.
455 Returns
456 -------
457 existence : `dict` of [`DatasetRef`, `bool`]
458 Mapping from dataset to boolean indicating existence.
459 """
460 existence: Dict[DatasetRef, bool] = {}
461 # Non-optimized default.
462 for ref in refs:
463 existence[ref] = self.exists(ref)
464 return existence
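    # A sketch of how a caller can share the ``artifact_existence`` cache
    # across several ``mexists`` calls to avoid re-testing the same artifacts
    # (``datastore`` and the ref iterables are assumed to exist):
    #
    #     artifact_existence: Dict[ResourcePath, bool] = {}
    #     first = datastore.mexists(refs_batch_1, artifact_existence)
    #     second = datastore.mexists(refs_batch_2, artifact_existence)
    #
    # This non-optimized base implementation ignores the cache and simply
    # calls ``exists`` per ref; subclasses may make use of it.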
466 @abstractmethod
467 def exists(self, datasetRef: DatasetRef) -> bool:
468 """Check if the dataset exists in the datastore.
470 Parameters
471 ----------
472 datasetRef : `DatasetRef`
473 Reference to the required dataset.
475 Returns
476 -------
477 exists : `bool`
478 `True` if the entity exists in the `Datastore`.
479 """
480 raise NotImplementedError("Must be implemented by subclass")
482 @abstractmethod
483 def get(
484 self,
485 datasetRef: DatasetRef,
486 parameters: Mapping[str, Any] | None = None,
487 storageClass: Optional[Union[StorageClass, str]] = None,
488 ) -> Any:
489 """Load an `InMemoryDataset` from the store.
491 Parameters
492 ----------
493 datasetRef : `DatasetRef`
494 Reference to the required Dataset.
495 parameters : `dict`
496 `StorageClass`-specific parameters that specify a slice of the
497 Dataset to be loaded.
498 storageClass : `StorageClass` or `str`, optional
499 The storage class to be used to override the Python type
500 returned by this method. By default the returned type matches
501 the dataset type definition for this dataset. Specifying a
502 read `StorageClass` can force a different type to be returned.
503 This type must be compatible with the original type.
505 Returns
506 -------
507 inMemoryDataset : `object`
508 Requested Dataset or slice thereof as an InMemoryDataset.
509 """
510 raise NotImplementedError("Must be implemented by subclass")
512 @abstractmethod
513 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
514 """Write a `InMemoryDataset` with a given `DatasetRef` to the store.
516 Parameters
517 ----------
518 inMemoryDataset : `object`
519 The Dataset to store.
520 datasetRef : `DatasetRef`
521 Reference to the associated Dataset.
522 """
523 raise NotImplementedError("Must be implemented by subclass")
525 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
526 """Allow ingest transfer mode to be defaulted based on datasets.
528 Parameters
529 ----------
530 datasets : `FileDataset`
531 Each positional argument is a struct containing information about
532 a file to be ingested, including its path (either absolute or
533 relative to the datastore root, if applicable), a complete
534 `DatasetRef` (with ``dataset_id not None``), and optionally a
535 formatter class or its fully-qualified string name. If a formatter
536 is not provided, this method should populate that attribute with
537 the formatter the datastore would use for `put`. Subclasses are
538 also permitted to modify the path attribute (typically to put it
539 in what the datastore considers its standard form).
540 transfer : `str`, optional
541 How (and whether) the dataset should be added to the datastore.
542 See `ingest` for details of transfer modes.
544 Returns
545 -------
546 newTransfer : `str`
547 Transfer mode to use. Will be identical to the supplied transfer
548 mode unless "auto" is used.
549 """
550 if transfer != "auto":
551 return transfer
552 raise RuntimeError(f"{transfer} is not allowed without specialization.")
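    # A sketch of how a subclass might specialize ``_overrideTransferMode`` to
    # resolve "auto" itself (the choice of "copy" here is illustrative only):
    #
    #     def _overrideTransferMode(self, *datasets, transfer=None):
    #         if transfer != "auto":
    #             return transfer
    #         # Fall back to a mode this datastore always supports.
    #         return "copy"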
554 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
555 """Process datasets to identify which ones can be ingested.
557 Parameters
558 ----------
559 datasets : `FileDataset`
560 Each positional argument is a struct containing information about
561 a file to be ingested, including its path (either absolute or
562 relative to the datastore root, if applicable), a complete
563 `DatasetRef` (with ``dataset_id not None``), and optionally a
564 formatter class or its fully-qualified string name. If a formatter
565 is not provided, this method should populate that attribute with
566 the formatter the datastore would use for `put`. Subclasses are
567 also permitted to modify the path attribute (typically to put it
568 in what the datastore considers its standard form).
569 transfer : `str`, optional
570 How (and whether) the dataset should be added to the datastore.
571 See `ingest` for details of transfer modes.
573 Returns
574 -------
575 data : `IngestPrepData`
576 An instance of a subclass of `IngestPrepData`, used to pass
577 arbitrary data from `_prepIngest` to `_finishIngest`. This should
578 include only the datasets this datastore can actually ingest;
579 others should be silently ignored (`Datastore.ingest` will inspect
580 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
581 necessary).
583 Raises
584 ------
585 NotImplementedError
586 Raised if the datastore does not support the given transfer mode
587 (including the case where ingest is not supported at all).
588 FileNotFoundError
589 Raised if one of the given files does not exist.
590 FileExistsError
591 Raised if transfer is not `None` but the (internal) location the
592 file would be moved to is already occupied.
594 Notes
595 -----
596 This method (along with `_finishIngest`) should be implemented by
597 subclasses to provide ingest support instead of implementing `ingest`
598 directly.
600 `_prepIngest` should not modify the data repository or given files in
601 any way; all changes should be deferred to `_finishIngest`.
603 When possible, exceptions should be raised in `_prepIngest` instead of
604 `_finishIngest`. `NotImplementedError` exceptions that indicate that
605 the transfer mode is not supported must be raised by `_prepIngest`
606 instead of `_finishIngest`.
607 """
608 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
610 def _finishIngest(
611 self, prepData: IngestPrepData, *, transfer: Optional[str] = None, record_validation_info: bool = True
612 ) -> None:
613 """Complete an ingest operation.
615 Parameters
616 ----------
617 prepData : `IngestPrepData`
618 An instance of a subclass of `IngestPrepData`. Guaranteed to be
619 the direct result of a call to `_prepIngest` on this datastore.
620 transfer : `str`, optional
621 How (and whether) the dataset should be added to the datastore.
622 See `ingest` for details of transfer modes.
623 record_validation_info : `bool`, optional
624 If `True`, the default, the datastore can record validation
625 information associated with the file. If `False` the datastore
626 will not attempt to track any information such as checksums
627 or file sizes. This can be useful if such information is tracked
628 in an external system or if the file is to be compressed in place.
629 It is up to the datastore whether this parameter is relevant.
631 Raises
632 ------
633 FileNotFoundError
634 Raised if one of the given files does not exist.
635 FileExistsError
636 Raised if transfer is not `None` but the (internal) location the
637 file would be moved to is already occupied.
639 Notes
640 -----
641 This method (along with `_prepIngest`) should be implemented by
642 subclasses to provide ingest support instead of implementing `ingest`
643 directly.
644 """
645 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.")
647 def ingest(
648 self, *datasets: FileDataset, transfer: Optional[str] = None, record_validation_info: bool = True
649 ) -> None:
650 """Ingest one or more files into the datastore.
652 Parameters
653 ----------
654 datasets : `FileDataset`
655 Each positional argument is a struct containing information about
656 a file to be ingested, including its path (either absolute or
657 relative to the datastore root, if applicable), a complete
658 `DatasetRef` (with ``dataset_id not None``), and optionally a
659 formatter class or its fully-qualified string name. If a formatter
660 is not provided, the one the datastore would use for ``put`` on
661 that dataset is assumed.
662 transfer : `str`, optional
663 How (and whether) the dataset should be added to the datastore.
664 If `None` (default), the file must already be in a location
665 appropriate for the datastore (e.g. within its root directory),
666 and will not be modified. Other choices include "move", "copy",
667 "link", "symlink", "relsymlink", and "hardlink". "link" is a
668 special transfer mode that will first try to make a hardlink and
669 if that fails a symlink will be used instead. "relsymlink" creates
670 a relative symlink rather than use an absolute path.
671 Most datastores do not support all transfer modes.
672 "auto" is a special option that will let the
673 data store choose the most natural option for itself.
674 record_validation_info : `bool`, optional
675 If `True`, the default, the datastore can record validation
676 information associated with the file. If `False` the datastore
677 will not attempt to track any information such as checksums
678 or file sizes. This can be useful if such information is tracked
679 in an external system or if the file is to be compressed in place.
680 It is up to the datastore whether this parameter is relevant.
682 Raises
683 ------
684 NotImplementedError
685 Raised if the datastore does not support the given transfer mode
686 (including the case where ingest is not supported at all).
687 DatasetTypeNotSupportedError
688 Raised if one or more files to be ingested have a dataset type that
689 is not supported by the datastore.
690 FileNotFoundError
691 Raised if one of the given files does not exist.
692 FileExistsError
693 Raised if transfer is not `None` but the (internal) location the
694 file would be moved to is already occupied.
696 Notes
697 -----
698 Subclasses should implement `_prepIngest` and `_finishIngest` instead
699 of implementing `ingest` directly. Datastores that hold and
700 delegate to child datastores may want to call those methods as well.
702 Subclasses are encouraged to document their supported transfer modes
703 in their class documentation.
704 """
705 # Allow a datastore to select a default transfer mode
706 transfer = self._overrideTransferMode(*datasets, transfer=transfer)
707 prepData = self._prepIngest(*datasets, transfer=transfer)
708 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
709 if None in refs:
710 # Find the file for the error message. There may be multiple
711 # bad refs so look for all of them.
712 unresolved_paths = {}
713 for dataset in datasets:
714 unresolved = []
715 for ref in dataset.refs:
716 if ref.id is None:
717 unresolved.append(ref)
718 if unresolved:
719 unresolved_paths[dataset.path] = unresolved
720 raise RuntimeError(
721 "Attempt to ingest unresolved DatasetRef from: "
722 + ",".join(f"{p}: ({[str(r) for r in ref]})" for p, ref in unresolved_paths.items())
723 )
724 if refs.keys() != prepData.refs.keys():
725 unsupported = refs.keys() - prepData.refs.keys()
726 # Group unsupported refs by DatasetType for an informative
727 # but still concise error message.
728 byDatasetType = defaultdict(list)
729 for datasetId in unsupported:
730 ref = refs[datasetId]
731 byDatasetType[ref.datasetType].append(ref)
732 raise DatasetTypeNotSupportedError(
733 "DatasetType(s) not supported in ingest: "
734 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
735 )
736 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info)
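    # A sketch of the two hooks a subclass implements instead of ``ingest``
    # itself.  ``_can_accept`` and ``_transfer_file`` are hypothetical helpers
    # used only for illustration:
    #
    #     def _prepIngest(self, *datasets, transfer=None):
    #         accepted = [
    #             ref
    #             for dataset in datasets
    #             for ref in dataset.refs
    #             if self._can_accept(ref)
    #         ]
    #         return self.IngestPrepData(accepted)
    #
    #     def _finishIngest(self, prepData, *, transfer=None,
    #                       record_validation_info=True):
    #         for ref in prepData.refs.values():
    #             self._transfer_file(ref, transfer)
    #
    # ``ingest`` then verifies that every resolved ref was accepted and raises
    # ``DatasetTypeNotSupportedError`` for any that were not.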
738 def transfer_from(
739 self,
740 source_datastore: Datastore,
741 refs: Iterable[DatasetRef],
742 local_refs: Optional[Iterable[DatasetRef]] = None,
743 transfer: str = "auto",
744 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
745 ) -> None:
746 """Transfer dataset artifacts from another datastore to this one.
748 Parameters
749 ----------
750 source_datastore : `Datastore`
751 The datastore from which to transfer artifacts. That datastore
752 must be compatible with this datastore receiving the artifacts.
753 refs : iterable of `DatasetRef`
754 The datasets to transfer from the source datastore.
755 local_refs : iterable of `DatasetRef`, optional
756 The dataset refs associated with the registry associated with
757 this datastore. Can be `None` if the source and target datastore
758 are using UUIDs.
759 transfer : `str`, optional
760 How (and whether) the dataset should be added to the datastore.
761 Choices include "move", "copy",
762 "link", "symlink", "relsymlink", and "hardlink". "link" is a
763 special transfer mode that will first try to make a hardlink and
764 if that fails a symlink will be used instead. "relsymlink" creates
765 a relative symlink rather than use an absolute path.
766 Most datastores do not support all transfer modes.
767 "auto" (the default) is a special option that will let the
768 data store choose the most natural option for itself.
769 If the source location and transfer location are identical the
770 transfer mode will be ignored.
771 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
772 Optional mapping of datastore artifact to existence. Updated by
773 this method with details of all artifacts tested. Can be `None`
774 if the caller is not interested.
776 Raises
777 ------
778 TypeError
779 Raised if the two datastores are not compatible.
780 """
781 if type(self) is not type(source_datastore):
782 raise TypeError(
783 f"Datastore mismatch between this datastore ({type(self)}) and the "
784 f"source datastore ({type(source_datastore)})."
785 )
787 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.")
789 def getManyURIs(
790 self,
791 refs: Iterable[DatasetRef],
792 predict: bool = False,
793 allow_missing: bool = False,
794 ) -> Dict[DatasetRef, DatasetRefURIs]:
795 """Return URIs associated with many datasets.
797 Parameters
798 ----------
799 refs : iterable of `DatasetIdRef`
800 References to the required datasets.
801 predict : `bool`, optional
802 If the datastore does not know about a dataset, should it
803 return a predicted URI or not?
804 allow_missing : `bool`
805 If `False`, and `predict` is `False`, will raise if a `DatasetRef`
806 does not exist.
808 Returns
809 -------
810 URIs : `dict` of [`DatasetRef`, `DatasetRefURIs`]
811 A dict of primary and component URIs, indexed by the passed-in
812 refs.
814 Raises
815 ------
816 FileNotFoundError
817 A URI has been requested for a dataset that does not exist and
818 guessing is not allowed.
820 Notes
821 -----
822 In file-based datastores, ``getManyURIs`` does not check that the files
823 are really there; it assumes that if the datastore is aware of a file
824 then it actually exists.
825 """
826 uris: Dict[DatasetRef, DatasetRefURIs] = {}
827 missing_refs = []
828 for ref in refs:
829 try:
830 uris[ref] = self.getURIs(ref, predict=predict)
831 except FileNotFoundError:
832 missing_refs.append(ref)
833 if missing_refs and not allow_missing:
834 raise FileNotFoundError(
835 "Missing {} refs from datastore out of {} and predict=False.".format(
836 num_missing := len(missing_refs), num_missing + len(uris)
837 )
838 )
839 return uris
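    # A sketch of calling ``getManyURIs`` while tolerating unknown datasets;
    # each value is a ``DatasetRefURIs`` that unpacks as (primary, components)
    # (``datastore`` and ``refs`` are assumed to exist):
    #
    #     uris = datastore.getManyURIs(refs, predict=False, allow_missing=True)
    #     for ref, (primary, components) in uris.items():
    #         ...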
841 @abstractmethod
842 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
843 """Return URIs associated with dataset.
845 Parameters
846 ----------
847 datasetRef : `DatasetRef`
848 Reference to the required dataset.
849 predict : `bool`, optional
850 If the datastore does not know about the dataset, should it
851 return a predicted URI or not?
853 Returns
854 -------
855 uris : `DatasetRefURIs`
856 The URI to the primary artifact associated with this dataset (if
857 the dataset was disassembled within the datastore this may be
858 `None`), and the URIs to any components associated with the dataset
859 artifact. (can be empty if there are no components).
860 """
861 raise NotImplementedError()
863 @abstractmethod
864 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
865 """URI to the Dataset.
867 Parameters
868 ----------
869 datasetRef : `DatasetRef`
870 Reference to the required Dataset.
871 predict : `bool`
872 If `True` attempt to predict the URI for a dataset if it does
873 not exist in datastore.
875 Returns
876 -------
877 uri : `lsst.resources.ResourcePath`
878 URI pointing to the Dataset within the datastore. If the
879 Dataset does not exist in the datastore, the URI may be a guess.
880 If the datastore does not have entities that relate well
881 to the concept of a URI the returned URI string will be
882 descriptive. The returned URI is not guaranteed to be obtainable.
884 Raises
885 ------
886 FileNotFoundError
887 A URI has been requested for a dataset that does not exist and
888 guessing is not allowed.
889 """
890 raise NotImplementedError("Must be implemented by subclass")
892 @abstractmethod
893 def retrieveArtifacts(
894 self,
895 refs: Iterable[DatasetRef],
896 destination: ResourcePath,
897 transfer: str = "auto",
898 preserve_path: bool = True,
899 overwrite: bool = False,
900 ) -> List[ResourcePath]:
901 """Retrieve the artifacts associated with the supplied refs.
903 Parameters
904 ----------
905 refs : iterable of `DatasetRef`
906 The datasets for which artifacts are to be retrieved.
907 A single ref can result in multiple artifacts. The refs must
908 be resolved.
909 destination : `lsst.resources.ResourcePath`
910 Location to write the artifacts.
911 transfer : `str`, optional
912 Method to use to transfer the artifacts. Must be one of the options
913 supported by `lsst.resources.ResourcePath.transfer_from()`.
914 "move" is not allowed.
915 preserve_path : `bool`, optional
916 If `True` the full path of the artifact within the datastore
917 is preserved. If `False` the final file component of the path
918 is used.
919 overwrite : `bool`, optional
920 If `True` allow transfers to overwrite existing files at the
921 destination.
923 Returns
924 -------
925 targets : `list` of `lsst.resources.ResourcePath`
926 URIs of file artifacts in destination location. Order is not
927 preserved.
929 Notes
930 -----
931 For non-file datastores the artifacts written to the destination
932 may not match the representation inside the datastore. For example
933 a hierarchical data structure in a NoSQL database may well be stored
934 as a JSON file.
935 """
936 raise NotImplementedError()
938 @abstractmethod
939 def remove(self, datasetRef: DatasetRef) -> None:
940 """Indicate to the Datastore that a Dataset can be removed.
942 Parameters
943 ----------
944 datasetRef : `DatasetRef`
945 Reference to the required Dataset.
947 Raises
948 ------
949 FileNotFoundError
950 When Dataset does not exist.
952 Notes
953 -----
954 Some Datastores may implement this method as a silent no-op to
955 disable Dataset deletion through standard interfaces.
956 """
957 raise NotImplementedError("Must be implemented by subclass")
959 @abstractmethod
960 def forget(self, refs: Iterable[DatasetRef]) -> None:
961 """Indicate to the Datastore that it should remove all records of the
962 given datasets, without actually deleting them.
964 Parameters
965 ----------
966 refs : `Iterable` [ `DatasetRef` ]
967 References to the datasets being forgotten.
969 Notes
970 -----
971 Asking a datastore to forget a `DatasetRef` it does not hold should be
972 a silent no-op, not an error.
973 """
974 raise NotImplementedError("Must be implemented by subclass")
976 @abstractmethod
977 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
978 """Indicate to the Datastore that a Dataset can be moved to the trash.
980 Parameters
981 ----------
982 ref : `DatasetRef` or iterable thereof
983 Reference(s) to the required Dataset.
984 ignore_errors : `bool`, optional
985 Determine whether errors should be ignored. When multiple
986 refs are being trashed there will be no per-ref check.
988 Raises
989 ------
990 FileNotFoundError
991 When Dataset does not exist and errors are not ignored. Only
992 checked if a single ref is supplied (and not in a list).
994 Notes
995 -----
996 Some Datastores may implement this method as a silent no-op to
997 disable Dataset deletion through standard interfaces.
998 """
999 raise NotImplementedError("Must be implemented by subclass")
1001 @abstractmethod
1002 def emptyTrash(self, ignore_errors: bool = True) -> None:
1003 """Remove all datasets from the trash.
1005 Parameters
1006 ----------
1007 ignore_errors : `bool`, optional
1008 Determine whether errors should be ignored.
1010 Notes
1011 -----
1012 Some Datastores may implement this method as a silent no-op to
1013 disable Dataset deletion through standard interfaces.
1014 """
1015 raise NotImplementedError("Must be implemented by subclass")
1017 @abstractmethod
1018 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
1019 """Transfer a dataset from another datastore to this datastore.
1021 Parameters
1022 ----------
1023 inputDatastore : `Datastore`
1024 The external `Datastore` from which to retrieve the Dataset.
1025 datasetRef : `DatasetRef`
1026 Reference to the required Dataset.
1027 """
1028 raise NotImplementedError("Must be implemented by subclass")
1030 def export(
1031 self,
1032 refs: Iterable[DatasetRef],
1033 *,
1034 directory: Optional[ResourcePathExpression] = None,
1035 transfer: Optional[str] = "auto",
1036 ) -> Iterable[FileDataset]:
1037 """Export datasets for transfer to another data repository.
1039 Parameters
1040 ----------
1041 refs : iterable of `DatasetRef`
1042 Dataset references to be exported.
1043 directory : `str`, optional
1044 Path to a directory that should contain files corresponding to
1045 output datasets. Ignored if ``transfer`` is explicitly `None`.
1046 transfer : `str`, optional
1047 Mode that should be used to move datasets out of the repository.
1048 Valid options are the same as those of the ``transfer`` argument
1049 to ``ingest``, and datastores may similarly signal that a transfer
1050 mode is not supported by raising `NotImplementedError`. If "auto"
1051 is given and no ``directory`` is specified, `None` will be
1052 implied.
1054 Returns
1055 -------
1056 dataset : iterable of `FileDataset`
1057 Structs containing information about the exported datasets, in the
1058 same order as ``refs``.
1060 Raises
1061 ------
1062 NotImplementedError
1063 Raised if the given transfer mode is not supported.
1064 """
1065 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
1067 @abstractmethod
1068 def validateConfiguration(
1069 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
1070 ) -> None:
1071 """Validate some of the configuration for this datastore.
1073 Parameters
1074 ----------
1075 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1076 Entities to test against this configuration. Can be differing
1077 types.
1078 logFailures : `bool`, optional
1079 If `True`, output a log message for every validation error
1080 detected.
1082 Raises
1083 ------
1084 DatastoreValidationError
1085 Raised if there is a validation problem with a configuration.
1087 Notes
1088 -----
1089 Which parts of the configuration are validated is at the discretion
1090 of each Datastore implementation.
1091 """
1092 raise NotImplementedError("Must be implemented by subclass")
1094 @abstractmethod
1095 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1096 """Validate a specific look up key with supplied entity.
1098 Parameters
1099 ----------
1100 lookupKey : `LookupKey`
1101 Key to use to retrieve information from the datastore
1102 configuration.
1103 entity : `DatasetRef`, `DatasetType`, or `StorageClass`
1104 Entity to compare with configuration retrieved using the
1105 specified lookup key.
1107 Raises
1108 ------
1109 DatastoreValidationError
1110 Raised if there is a problem with the combination of entity
1111 and lookup key.
1113 Notes
1114 -----
1115 Bypasses the normal selection priorities by allowing a key that
1116 would normally not be selected to be validated.
1117 """
1118 raise NotImplementedError("Must be implemented by subclass")
1120 @abstractmethod
1121 def getLookupKeys(self) -> Set[LookupKey]:
1122 """Return all the lookup keys relevant to this datastore.
1124 Returns
1125 -------
1126 keys : `set` of `LookupKey`
1127 The keys stored internally for looking up information based
1128 on `DatasetType` name or `StorageClass`.
1129 """
1130 raise NotImplementedError("Must be implemented by subclass")
1132 def needs_expanded_data_ids(
1133 self,
1134 transfer: Optional[str],
1135 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
1136 ) -> bool:
1137 """Test whether this datastore needs expanded data IDs to ingest.
1139 Parameters
1140 ----------
1141 transfer : `str` or `None`
1142 Transfer mode for ingest.
1143 entity : `DatasetRef` or `DatasetType` or `StorageClass`, optional
1144 Object representing what will be ingested. If not provided (or not
1145 specific enough), `True` may be returned even if expanded data
1146 IDs aren't necessary.
1148 Returns
1149 -------
1150 needed : `bool`
1151 If `True`, expanded data IDs may be needed. `False` only if
1152 expansion definitely isn't necessary.
1153 """
1154 return True
1156 @abstractmethod
1157 def import_records(
1158 self,
1159 data: Mapping[str, DatastoreRecordData],
1160 ) -> None:
1161 """Import datastore location and record data from an in-memory data
1162 structure.
1164 Parameters
1165 ----------
1166 data : `Mapping` [ `str`, `DatastoreRecordData` ]
1167 Datastore records indexed by datastore name. May contain data for
1168 other `Datastore` instances (generally because they are chained to
1169 this one), which should be ignored.
1171 Notes
1172 -----
1173 Implementations should generally not check that any external resources
1174 (e.g. files) referred to by these records actually exist, for
1175 performance reasons; we expect higher-level code to guarantee that they
1176 do.
1178 Implementations are responsible for calling
1179 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations``
1180 where the key is in `names`, as well as loading any opaque table data.
1181 """
1182 raise NotImplementedError()
1184 @abstractmethod
1185 def export_records(
1186 self,
1187 refs: Iterable[DatasetIdRef],
1188 ) -> Mapping[str, DatastoreRecordData]:
1189 """Export datastore records and locations to an in-memory data
1190 structure.
1192 Parameters
1193 ----------
1194 refs : `Iterable` [ `DatasetIdRef` ]
1195 Datasets to save. This may include datasets not known to this
1196 datastore, which should be ignored.
1198 Returns
1199 -------
1200 data : `Mapping` [ `str`, `DatastoreRecordData` ]
1201 Exported datastore records indexed by datastore name.
1202 """
1203 raise NotImplementedError()
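    # A sketch of how ``export_records``/``import_records`` can move datastore
    # bookkeeping between repositories without touching the artifacts
    # themselves (``source`` and ``target`` datastores and ``refs`` assumed):
    #
    #     record_data = source.export_records(refs)
    #     target.import_records(record_data)
    #
    # Entries keyed by datastore names the target does not recognise are
    # expected to be ignored by the implementation.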
1205 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
1206 """Specify a method that can be used by datastore to retrieve
1207 registry-defined dataset type.
1209 Parameters
1210 ----------
1211 method : `~collections.abc.Callable` | `None`
1212 Method that takes a name of the dataset type and returns a
1213 corresponding `DatasetType` instance as defined in Registry. If
1214 dataset type name is not known to registry `None` is returned.
1216 Notes
1217 -----
1218 This method is only needed for a Datastore supporting a "trusted" mode
1219 when it does not have access to datastore records and needs to
1220 guess dataset location based on its stored dataset type.
1221 """
1222 pass
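# A sketch of wiring a registry-backed lookup into a datastore that supports
# "trusted" mode; ``registry`` is assumed to provide a ``getDatasetType``
# method that raises when the name is unknown (the wrapper below is
# hypothetical, for illustration only).
def _example_retrieve_dataset_type_hookup(datastore: Datastore, registry: Any) -> None:
    def _lookup(name: str) -> Optional[DatasetType]:
        try:
            return registry.getDatasetType(name)
        except KeyError:
            # Unknown dataset type names map to None per the method contract.
            return None

    datastore.set_retrieve_dataset_type_method(_lookup)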