Coverage for python/lsst/daf/butler/core/datastore.py : 49%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Support for generic data stores.
24"""
26from __future__ import annotations
28__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError")
30import contextlib
31import logging
32from collections import defaultdict
33from typing import TYPE_CHECKING, Optional, Type, Callable, ClassVar, Any, Generator, Iterable
34from dataclasses import dataclass
35from abc import ABCMeta, abstractmethod
37from lsst.utils import doImport
38from .config import ConfigSubset, Config
39from .exceptions import ValidationError, DatasetTypeNotSupportedError
40from .constraints import Constraints
41from .storageClass import StorageClassFactory
43if TYPE_CHECKING: 43 ↛ 44line 43 didn't jump to line 44, because the condition on line 43 was never true
44 from ..registry import Registry
45 from .datasets import DatasetRef
46 from .repoTransfer import FileDataset
class DatastoreConfig(ConfigSubset):
    """Configuration subset describing a `Datastore`.

    The class attributes below parameterize the `ConfigSubset` base class
    (defined elsewhere in the package).
    """

    # Name of the component this configuration subset belongs to.
    component = "datastore"

    # Keys that must be present; "cls" is the key `Datastore.fromConfig`
    # reads to find the datastore class to import.
    requiredKeys = ("cls",)

    # File holding the default values for this configuration subset.
    defaultConfigFile = "datastore.yaml"
class DatastoreValidationError(ValidationError):
    """There is a problem with the Datastore configuration.

    Documented as the exception raised by `Datastore.validateConfiguration`
    and `Datastore.validateKey` when validation fails.
    """
@dataclass(frozen=True)
class Event:
    """Immutable record of one undoable action.

    Instances are created by `DatastoreTransaction.registerUndo` and are
    replayed by `DatastoreTransaction.rollback`, which calls
    ``undoFunc(*args, **kwargs)``.
    """

    # Slots keep instances free of a per-instance ``__dict__``.
    __slots__ = {"name", "undoFunc", "args", "kwargs"}

    # Human-readable label of the event; used in rollback log messages.
    name: str
    # Callable that reverts the event.
    undoFunc: Callable
    # Positional arguments forwarded to ``undoFunc``.
    args: tuple
    # Keyword arguments forwarded to ``undoFunc``.
    kwargs: dict
class IngestPrepData:
    """A helper base class for `Datastore` ingest implementations.

    Datastore implementations will generally need a custom implementation of
    this class.

    Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
    import.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        References for the datasets that can be ingested by this datastore.

    Notes
    -----
    The references are stored in the ``refs`` attribute as a `dict` keyed
    by each reference's ``id``.  If two references share an ``id`` the later
    one replaces the earlier one.
    """

    def __init__(self, refs: Iterable[DatasetRef]):
        byId = {}
        for datasetRef in refs:
            byId[datasetRef.id] = datasetRef
        self.refs = byId
class DatastoreTransaction:
    """Keeps a log of `Datastore` activity and allows rollback.

    Parameters
    ----------
    parent : `DatastoreTransaction`, optional
        The parent transaction (if any).
    """
    Event: ClassVar[Type] = Event

    parent: Optional['DatastoreTransaction']
    """The parent transaction. (`DatastoreTransaction`, optional)"""

    def __init__(self, parent=None):
        self.parent = parent
        # Events in the order they were registered; rollback consumes this
        # list from the end.
        self._log = []

    def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
        """Register event with undo function.

        Parameters
        ----------
        name : `str`
            Name of the event.
        undoFunc : func
            Function to undo this event.
        args : `tuple`
            Positional arguments to `undoFunc`.
        kwargs : `dict`
            Keyword arguments to `undoFunc`.
        """
        entry = self.Event(name, undoFunc, args, kwargs)
        self._log.append(entry)

    @contextlib.contextmanager
    def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Generator:
        """A context manager that calls `registerUndo` if the nested operation
        does not raise an exception.

        This can be used to wrap individual undo-able statements within a
        DatastoreTransaction block.  Multiple statements that can fail
        separately should not be part of the same `undoWith` block.

        All arguments are forwarded directly to `registerUndo`.
        """
        # An exception raised inside the ``with`` body is re-raised at the
        # ``yield`` and propagates out of this generator, so the line after
        # it only runs when the body completed normally.
        yield None
        self.registerUndo(name, undoFunc, *args, **kwargs)

    def rollback(self) -> None:
        """Roll back all events in this transaction.

        Events are undone most-recent first.  A failure in one undo function
        is logged as a warning and does not stop the remaining events from
        being undone.
        """
        while self._log:
            event = self._log.pop()
            try:
                event.undoFunc(*event.args, **event.kwargs)
            except BaseException as exc:
                # Deliberately swallow error that may occur in unrolling
                logging.getLogger(__name__).warning(
                    "Exception: %s caught while unrolling: %s", exc, event.name
                )

    def commit(self) -> None:
        """Commit this transaction.

        For a top-level transaction the events are simply forgotten, since
        they have already happened.  For a nested transaction the events are
        appended to the parent's log so that the parent can still roll them
        back.
        """
        if self.parent is not None:
            self.parent._log.extend(self._log)
class Datastore(metaclass=ABCMeta):
    """Datastore interface.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Load configuration either from an existing config instance or by
        referring to a configuration file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.
    """

    defaultConfigFile: ClassVar[Optional[str]] = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    containerKey: ClassVar[Optional[str]] = None
    """Name of the key containing a list of subconfigurations that also
    need to be merged with defaults and will likely use different Python
    datastore classes (but all using DatastoreConfig).  Assumed to be a
    list of configurations that can be represented in a DatastoreConfig
    and containing a "cls" definition. None indicates that no containers
    are expected in this Datastore."""

    isEphemeral: ClassVar[bool] = False
    """Indicate whether this Datastore is ephemeral or not.  An ephemeral
    datastore is one where the contents of the datastore will not exist
    across process restarts."""

    config: DatastoreConfig
    """Configuration used to create Datastore."""

    registry: Registry
    """`Registry` to use when recording the writing of Datasets."""

    name: str
    """Label associated with this Datastore."""

    names: list
    """List of names associated with this Datastore.  Can be different to
    ``name`` for a chaining datastore."""

    storageClassFactory: StorageClassFactory
    """Factory for creating storage class instances from name."""

    constraints: Constraints
    """Constraints to apply when putting datasets into the datastore."""

    IngestPrepData: ClassVar[Type] = IngestPrepData
    """Helper base class for ingest implementations.
    """

    @classmethod
    @abstractmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists.  Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        raise NotImplementedError()

    @staticmethod
    def fromConfig(config: Config, registry: Registry, butlerRoot: Optional[str] = None) -> 'Datastore':
        """Create datastore from type specified in config file.

        Parameters
        ----------
        config : `Config`
            Configuration instance.  The class to instantiate is read from
            the ``("datastore", "cls")`` entry.
        registry : `Registry`
            Registry to be used by the Datastore for internal data.
        butlerRoot : `str`, optional
            Butler root directory.

        Returns
        -------
        datastore : `Datastore`
            Instance of the configured class, constructed with the given
            ``config``, ``registry`` and ``butlerRoot``.
        """
        cls = doImport(config["datastore", "cls"])
        return cls(config=config, registry=registry, butlerRoot=butlerRoot)

    def __init__(self, config, registry, butlerRoot=None):
        # ``butlerRoot`` is accepted for interface compatibility but is not
        # used by this base initializer.
        self.config = DatastoreConfig(config)
        self.registry = registry
        self.name = "ABCDataStore"
        # Innermost active transaction, or `None` outside any transaction.
        self._transaction = None

        # All Datastores need storage classes and constraints
        self.storageClassFactory = StorageClassFactory()

        # And read the constraints list
        constraintsConfig = self.config.get("constraints")
        self.constraints = Constraints(constraintsConfig, universe=self.registry.dimensions)

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    @property
    def names(self):
        """Names associated with this datastore returned as a list.

        Some datastores can have child datastores.
        """
        # Default implementation returns solely the name itself
        return [self.name]

    @contextlib.contextmanager
    def transaction(self):
        """Context manager supporting `Datastore` transactions.

        Transactions can be nested, and are to be used in combination with
        `Registry.transaction`.

        Yields
        ------
        transaction : `DatastoreTransaction`
            The newly-opened transaction, whose parent is the transaction
            that was active when the block was entered (or `None`).

        Notes
        -----
        If the block raises, the transaction is rolled back and the
        exception is re-raised; otherwise the transaction is committed.
        In both cases the previously active transaction is made current
        again on exit, so that an enclosing ``transaction`` block rolls back
        (or commits) its own events rather than those of the finished inner
        transaction.
        """
        # Keep a local reference so that this block always finalizes the
        # transaction it opened, whatever ``self._transaction`` points to.
        txn = DatastoreTransaction(self._transaction)
        self._transaction = txn
        try:
            yield txn
        except BaseException:
            txn.rollback()
            raise
        else:
            txn.commit()
        finally:
            # Restore the parent on failure as well as on success.
            self._transaction = txn.parent

    @abstractmethod
    def exists(self, datasetRef):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def get(self, datasetRef, parameters=None):
        """Load an `InMemoryDataset` from the store.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify a slice of the
            Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def put(self, inMemoryDataset, datasetRef):
        """Write a `InMemoryDataset` with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `InMemoryDataset`
            The Dataset to store.
        datasetRef : `DatasetRef`
            Reference to the associated Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
        """Process datasets to identify which ones can be ingested into this
        Datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a formatter
            is not provided, this method should populate that attribute with
            the formatter the datastore would use for `put`.  Subclasses are
            also permitted to modify the path attribute (typically to put it
            in what the datastore considers its standard form).
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified.  Other choices include "move", "copy",
            "symlink", and "hardlink".  Most datastores do not support all
            transfer modes.

        Returns
        -------
        data : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`, used to pass
            arbitrary data from `_prepIngest` to `_finishIngest`.  This should
            include only the datasets this datastore can actually ingest;
            others should be silently ignored (`Datastore.ingest` will inspect
            `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if
            necessary).

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_finishIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.

        `_prepIngest` should not modify the data repository or given files in
        any way; all changes should be deferred to `_finishIngest`.

        When possible, exceptions should be raised in `_prepIngest` instead of
        `_finishIngest`.  `NotImplementedError` exceptions that indicate that
        the transfer mode is not supported must be raised by `_prepIngest`
        instead of `_finishIngest`.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def _finishIngest(self, prepData: IngestPrepData, *, transfer: Optional[str] = None):
        """Complete an ingest operation.

        Parameters
        ----------
        prepData : `IngestPrepData`
            An instance of a subclass of `IngestPrepData`.  Guaranteed to be
            the direct result of a call to `_prepIngest` on this datastore.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified.  Other choices include "move", "copy",
            "symlink", and "hardlink".  Most datastores do not support all
            transfer modes.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This method (along with `_prepIngest`) should be implemented by
        subclasses to provide ingest support instead of implementing `ingest`
        directly.
        """
        raise NotImplementedError(
            "Datastore does not support direct file-based ingest."
        )

    def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None):
        """Ingest one or more files into the datastore.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a complete
            `DatasetRef` (with ``dataset_id not None``), and optionally a
            formatter class or its fully-qualified string name.  If a formatter
            is not provided, the one the datastore would use for ``put`` on
            that dataset is assumed.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            If `None` (default), the file must already be in a location
            appropriate for the datastore (e.g. within its root directory),
            and will not be modified.  Other choices include "move", "copy",
            "symlink", and "hardlink".  Most datastores do not support all
            transfer modes.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type that
            is not supported by the datastore.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        Subclasses should implement `_prepIngest` and `_finishIngest` instead
        of implementing `ingest` directly.  Datastores that hold and
        delegate to child datastores may want to call those methods as well.

        Subclasses are encouraged to document their supported transfer modes
        in their class documentation.
        """
        prepData = self._prepIngest(*datasets, transfer=transfer)
        # Every ref that was requested, keyed by dataset ID, for comparison
        # with the refs that ``_prepIngest`` accepted.
        refs = {ref.id: ref for dataset in datasets for ref in dataset.refs}
        if refs.keys() != prepData.refs.keys():
            unsupported = refs.keys() - prepData.refs.keys()
            # Group unsupported refs by DatasetType for an informative
            # but still concise error message.
            byDatasetType = defaultdict(list)
            for datasetId in unsupported:
                ref = refs[datasetId]
                byDatasetType[ref.datasetType].append(ref)
            raise DatasetTypeNotSupportedError(
                "DatasetType(s) not supported in ingest: "
                + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items())
            )
        self._finishIngest(prepData, transfer=transfer)

    @abstractmethod
    def getUri(self, datasetRef):
        """URI to the Dataset.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, the URI may be a guess.
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def remove(self, datasetRef):
        """Indicate to the Datastore that a Dataset can be removed.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            When Dataset does not exist.

        Notes
        -----
        Some Datastores may implement this method as a silent no-op to
        disable Dataset deletion through standard interfaces.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def transfer(self, inputDatastore, datasetRef):
        """Retrieve a Dataset from an input `Datastore`, and store the result
        in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        datasetRef : `DatasetRef`
            Reference to the required Dataset.
        """
        raise NotImplementedError("Must be implemented by subclass")

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        """Export datasets for transfer to another data repository.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            Dataset references to be exported.
        directory : `str`, optional
            Path to a directory that should contain files corresponding to
            output datasets.  Ignored if ``transfer`` is `None`.
        transfer : `str`, optional
            Mode that should be used to move datasets out of the repository.
            Valid options are the same as those of the ``transfer`` argument
            to ``ingest``, and datastores may similarly signal that a transfer
            mode is not supported by raising `NotImplementedError`.

        Returns
        -------
        dataset : iterable of `FileDataset`
            Structs containing information about the exported datasets, in the
            same order as ``refs``.

        Raises
        ------
        NotImplementedError
            Raised if the given transfer mode is not supported.  This base
            implementation always raises it.
        """
        raise NotImplementedError(f"Transfer mode {transfer} not supported.")

    @abstractmethod
    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration.  Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.

        Notes
        -----
        Which parts of the configuration are validated is at the discretion
        of each Datastore implementation.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def validateKey(self, lookupKey, entity, logFailures=False):
        """Validate a specific look up key with supplied entity.

        Parameters
        ----------
        lookupKey : `LookupKey`
            Key to use to retrieve information from the datastore
            configuration.
        entity : `DatasetRef`, `DatasetType`, or `StorageClass`
            Entity to compare with configuration retrieved using the
            specified lookup key.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a problem with the combination of entity
            and lookup key.

        Notes
        -----
        Bypasses the normal selection priorities by allowing a key that
        would normally not be selected to be validated.
        """
        raise NotImplementedError("Must be implemented by subclass")

    @abstractmethod
    def getLookupKeys(self):
        """Return all the lookup keys relevant to this datastore.

        Returns
        -------
        keys : `set` of `LookupKey`
            The keys stored internally for looking up information based
            on `DatasetType` name or `StorageClass`.
        """
        raise NotImplementedError("Must be implemented by subclass")