Coverage for python/lsst/daf/butler/core/datastore.py : 47%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Support for generic data stores.
24"""
26from __future__ import annotations
28__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError")
30import contextlib
31import logging
32from collections import defaultdict
33from typing import TYPE_CHECKING, Optional, Type, Callable, ClassVar, Any, Generator, Iterable
34from dataclasses import dataclass
35from abc import ABCMeta, abstractmethod
37from lsst.utils import doImport
38from .config import ConfigSubset, Config
39from .exceptions import ValidationError, DatasetTypeNotSupportedError
40from .constraints import Constraints
41from .storageClass import StorageClassFactory
43if TYPE_CHECKING: 43 ↛ 44line 43 didn't jump to line 44, because the condition on line 43 was never true
44 from ..registry import Registry
45 from .datasets import DatasetRef
46 from .repoTransfer import FileDataset
class DatastoreConfig(ConfigSubset):
    """Configuration subset used to construct a `Datastore`.

    NOTE(review): how these attributes are interpreted is defined by
    `ConfigSubset`, which is not visible here; the comments below are
    read from the names and values only -- confirm against that class.
    """

    # Name of this subset; `Datastore.fromConfig` reads the class to
    # instantiate from the ("datastore", "cls") entry.
    component = "datastore"

    # Presumably the keys that must be present for the config to be valid.
    requiredKeys = ("cls",)

    # Presumably the file from which default values are loaded.
    defaultConfigFile = "datastore.yaml"
class DatastoreValidationError(ValidationError):
    """There is a problem with the Datastore configuration.

    Raised by `Datastore.validateConfiguration` and `Datastore.validateKey`
    implementations.
    """
@dataclass(frozen=True)
class Event:
    """Immutable record of one undoable operation.

    Instances are created by `DatastoreTransaction.registerUndo`; on
    rollback the transaction calls ``undoFunc(*args, **kwargs)``.
    """

    # Slots are declared explicitly so instances carry no ``__dict__``.
    __slots__ = {"name", "undoFunc", "args", "kwargs"}

    # Name of the event (used in the rollback warning message).
    name: str
    # Callable that reverses the event.
    undoFunc: Callable
    # Positional arguments passed to ``undoFunc``.
    args: tuple
    # Keyword arguments passed to ``undoFunc``.
    kwargs: dict
class IngestPrepData:
    """A helper base class for `Datastore` ingest implementations.

    Datastore implementations will generally need a custom implementation of
    this class.

    Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
    import.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        References for the datasets that can be ingested by this datastore.
    """

    def __init__(self, refs: Iterable["DatasetRef"]):
        # Index the references by dataset ID; `Datastore.ingest` compares
        # these keys against the IDs it was asked to ingest.  A later ref
        # with the same ID replaces an earlier one.
        byId = {}
        for datasetRef in refs:
            byId[datasetRef.id] = datasetRef
        self.refs = byId
class DatastoreTransaction:
    """Keep a log of `Datastore` activity and allow it to be rolled back.

    Every registered event carries a callable that reverses it.  `rollback`
    invokes those callables most-recent-first; `commit` either discards the
    events (no parent) or appends them to the parent's log.

    Parameters
    ----------
    parent : `DatastoreTransaction`, optional
        The parent transaction (if any).
    """

    Event: ClassVar[Type] = Event

    parent: Optional['DatastoreTransaction']
    """The parent transaction. (`DatastoreTransaction`, optional)"""

    def __init__(self, parent=None):
        # Events are appended in the order they happen and popped from the
        # end during rollback.
        self._log = []
        self.parent = parent

    def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
        """Record an event together with the function that undoes it.

        Parameters
        ----------
        name : `str`
            Name of the event.
        undoFunc : func
            Function to undo this event.
        args : `tuple`
            Positional arguments to `undoFunc`.
        kwargs : `dict`
            Keyword arguments to `undoFunc`.
        """
        event = self.Event(name, undoFunc, args, kwargs)
        self._log.append(event)

    @contextlib.contextmanager
    def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Generator:
        """Context manager that calls `registerUndo` only if the wrapped
        operation completes without raising.

        Use this to wrap a single undo-able statement inside a
        DatastoreTransaction block.  Statements that can fail independently
        of each other should each get their own `undoWith` block.

        All arguments are forwarded directly to `registerUndo`.
        """
        # An exception raised by the wrapped block is thrown in at the
        # ``yield`` and propagates straight out, skipping the registration.
        yield
        self.registerUndo(name, undoFunc, *args, **kwargs)

    def rollback(self) -> None:
        """Undo every event in this transaction, most recent first.

        Failures of individual undo functions are logged as warnings and
        otherwise ignored so that the remaining events are still undone.
        """
        while self._log:
            event = self._log.pop()
            try:
                event.undoFunc(*event.args, **event.kwargs)
            except BaseException as exc:
                # Deliberately swallow any error raised while unrolling;
                # it is reported but must not stop the rollback.
                logging.getLogger(__name__).warning(
                    "Exception: %s caught while unrolling: %s", exc, event.name
                )

    def commit(self) -> None:
        """Commit this transaction.

        Without a parent the events are simply forgotten (they have already
        happened).  With a parent they are appended to the parent's log so
        that the parent can still roll them back.
        """
        if self.parent is not None:
            self.parent._log.extend(self._log)
class Datastore(metaclass=ABCMeta):
    """Datastore interface.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Load configuration either from an existing config instance or by
        referring to a configuration file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.
    """

    defaultConfigFile: ClassVar[Optional[str]] = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    containerKey: ClassVar[Optional[str]] = None
    """Name of the key containing a list of subconfigurations that also
    need to be merged with defaults and will likely use different Python
    datastore classes (but all using DatastoreConfig).  Assumed to be a
    list of configurations that can be represented in a DatastoreConfig
    and containing a "cls" definition. None indicates that no containers
    are expected in this Datastore."""

    isEphemeral: ClassVar[bool] = False
    """Indicate whether this Datastore is ephemeral or not.  An ephemeral
    datastore is one where the contents of the datastore will not exist
    across process restarts."""

    config: DatastoreConfig
    """Configuration used to create Datastore."""

    registry: Registry
    """`Registry` to use when recording the writing of Datasets."""

    name: str
    """Label associated with this Datastore."""

    names: list
    """List of names associated with this Datastore.  Can be different to
    ``name`` for a chaining datastore."""

    storageClassFactory: StorageClassFactory
    """Factory for creating storage class instances from name."""

    constraints: Constraints
    """Constraints to apply when putting datasets into the datastore."""

    IngestPrepData: ClassVar[Type] = IngestPrepData
    """Helper base class for ingest implementations.
    """

    @classmethod
    @abstractmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists.  Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        raise NotImplementedError()

    @staticmethod
    def fromConfig(config: Config, registry: Registry, butlerRoot: Optional[str] = None) -> 'Datastore':
        """Create datastore from type specified in config file.

        Parameters
        ----------
        config : `Config`
            Configuration instance.
        registry : `Registry`
            Registry to be used by the Datastore for internal data.
        butlerRoot : `str`, optional
            Butler root directory.

        Returns
        -------
        datastore : `Datastore`
            Instance of the class named by the ``("datastore", "cls")``
            entry of ``config``.
        """
        cls = doImport(config["datastore", "cls"])
        return cls(config=config, registry=registry, butlerRoot=butlerRoot)

    def __init__(self, config, registry, butlerRoot=None):
        self.config = DatastoreConfig(config)
        self.registry = registry
        self.name = "ABCDataStore"
        # No transaction is active until `transaction` is entered.
        self._transaction = None

        # All Datastores need storage classes and constraints
        self.storageClassFactory = StorageClassFactory()

        # And read the constraints list
        constraintsConfig = self.config.get("constraints")
        self.constraints = Constraints(constraintsConfig, universe=self.registry.dimensions)

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    @property
    def names(self):
        """Names associated with this datastore returned as a list.

        Some datastores can have child datastores.
        """
        # Default implementation returns solely the name itself
        return [self.name]

    @contextlib.contextmanager
    def transaction(self):
        """Context manager supporting `Datastore` transactions.

        Transactions can be nested, and are to be used in combination with
        `Registry.transaction`.

        A new `DatastoreTransaction` whose parent is the currently active
        transaction (if any) is made current for the duration of the block.
        If the block raises, the new transaction is rolled back and the
        exception is re-raised; otherwise it is committed.  In both cases
        the previously active transaction is made current again on exit.
        """
        self._transaction = DatastoreTransaction(self._transaction)
        try:
            yield self._transaction
        except BaseException:
            self._transaction.rollback()
            raise
        else:
            self._transaction.commit()
        finally:
            # Always restore the parent, including on failure.  Otherwise an
            # enclosing transaction would find the (already rolled back)
            # child still current and roll *it* back instead of itself, and
            # later operations would register undo events on a dead
            # transaction.
            self._transaction = self._transaction.parent

    @abstractmethod
    def exists(self, datasetRef):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def get(self, datasetRef, parameters=None):
        """Load an `InMemoryDataset` from the store.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify a slice of the
            Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def put(self, inMemoryDataset, datasetRef):
        """Write a `InMemoryDataset` with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `InMemoryDataset`
            The Dataset to store.
        datasetRef : `DatasetRef`
            Reference to the associated Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        """Allow ingest transfer mode to be defaulted based on datasets.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`.  Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        newTransfer : `str` or `None`
            Transfer mode to use. Will be identical to the supplied transfer
            mode (including `None`) unless "auto" is used.

        Raises
        ------
        RuntimeError
            Raised by this base implementation if ``transfer`` is "auto",
            since resolving "auto" requires a subclass override.
        """
        if transfer != "auto":
            return transfer
        raise RuntimeError(f"{transfer} is not allowed without specialization.")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
        """Process datasets to identify which ones can be ingested into this
        Datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`.  Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        data : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`, used to pass
            arbitrary data from `_prepIngest` to `_finishIngest`.  This should
            include only the datasets this datastore can actually ingest;
            others should be silently ignored (`Datastore.ingest` will inspect
            `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
            necessary).

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_finishIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.

        `_prepIngest` should not modify the data repository or given files in
        any way; all changes should be deferred to `_finishIngest`.

        When possible, exceptions should be raised in `_prepIngest` instead of
        `_finishIngest`.  `NotImplementedError` exceptions that indicate that
        the transfer mode is not supported must be raised by `_prepIngest`
        instead of `_finishIngest`.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def _finishIngest(self, prepData: IngestPrepData, *, transfer: Optional[str] = None):
        """Complete an ingest operation.

        Parameters
        ----------
        prepData : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`.  Guaranteed to be
            the direct result of a call to `_prepIngest` on this datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_prepIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None):
        """Ingest one or more files into the datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a formatter
            is not provided, the one the datastore would use for ``put`` on
            that dataset is assumed.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified.  Other choices include "move", "copy",
            "link", "symlink", and "hardlink".  "link" is a special transfer
            mode that will first try to make a hardlink and if that fails
            a symlink will be used instead. Most datastores do not support all
            transfer modes.  "auto" is a special option that will let the
            data store choose the most natural option for itself.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type that
            is not supported by the datastore.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        Subclasses should implement `_prepIngest` and `_finishIngest` instead
        of implementing `ingest` directly.  Datastores that hold and
        delegate to child datastores may want to call those methods as well.

        Subclasses are encouraged to document their supported transfer modes
        in their class documentation.
        """
        # Allow a datastore to select a default transfer mode
        transfer = self._overrideTransferMode(*datasets, transfer=transfer)
        prepData = self._prepIngest(*datasets, transfer=transfer)
        # Every ref that was requested, keyed by dataset ID, for comparison
        # with the refs the datastore reported it can ingest.
        refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
        if refs.keys() != prepData.refs.keys():
            unsupported = refs.keys() - prepData.refs.keys()
            # Group unsupported refs by DatasetType for an informative
            # but still concise error message.
            byDatasetType = defaultdict(list)
            for datasetId in unsupported:
                ref = refs[datasetId]
                byDatasetType[ref.datasetType].append(ref)
            raise DatasetTypeNotSupportedError(
                "DatasetType(s) not supported in ingest: "
                + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
            )
        self._finishIngest(prepData, transfer=transfer)

    @abstractmethod
    def getUri(self, datasetRef):
        """URI to the Dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, the URI may be a guess.
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def remove(self, datasetRef):
        """Indicate to the Datastore that a Dataset can be removed.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def transfer(self, inputDatastore, datasetRef):
        """Retrieve a Dataset from an input `Datastore`, and store the result
        in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        """Export datasets for transfer to another data repository.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            Dataset references to be exported.
        directory : `str`, optional
            Path to a directory that should contain files corresponding to
            output datasets.  Ignored if ``transfer`` is `None`.
        transfer : `str`, optional
            Mode that should be used to move datasets out of the repository.
            Valid options are the same as those of the ``transfer`` argument
            to ``ingest``, and datastores may similarly signal that a transfer
            mode is not supported by raising `NotImplementedError`.

        Returns
        -------
        dataset : iterable of `FileDataset`
            Structs containing information about the exported datasets, in the
            same order as ``refs``.

        Raises
        ------
        NotImplementedError
            Raised if the given transfer mode is not supported.  This base
            implementation supports no transfer mode and always raises.
        """
        raise NotImplementedError(f"Transfer mode {transfer} not supported.")

    @abstractmethod
    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration.  Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.

        Notes
        -----
        Which parts of the configuration are validated is at the discretion
        of each Datastore implementation.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def validateKey(self, lookupKey, entity, logFailures=False):
        """Validate a specific look up key with supplied entity.

        Parameters
        ----------
        lookupKey : `LookupKey`
            Key to use to retrieve information from the datastore
            configuration.
        entity : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entity to compare with configuration retrieved using the
            specified lookup key.
        logFailures : `bool`, optional
            NOTE(review): undocumented in the original; presumably has the
            same meaning as in `validateConfiguration` -- confirm against
            the concrete implementations.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a problem with the combination of entity
            and lookup key.

        Notes
        -----
        Bypasses the normal selection priorities by allowing a key that
        would normally not be selected to be validated.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def getLookupKeys(self):
        """Return all the lookup keys relevant to this datastore.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys stored internally for looking up information based
            on `DatasetType` name or `StorageClass`.
        """
        raise NotImplementedError("Must be implemented by subclass")