Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 82%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileLikeDatastore", )
27import logging
28from abc import abstractmethod
30from sqlalchemy import BigInteger, String
32from dataclasses import dataclass
33from typing import (
34 TYPE_CHECKING,
35 Any,
36 ClassVar,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 Optional,
42 Set,
43 Tuple,
44 Type,
45 Union,
46)
48from lsst.daf.butler import (
49 ButlerURI,
50 CompositesMap,
51 Config,
52 FileDataset,
53 DatasetRef,
54 DatasetType,
55 DatasetTypeNotSupportedError,
56 Datastore,
57 DatastoreConfig,
58 DatastoreValidationError,
59 FileDescriptor,
60 FileTemplates,
61 FileTemplateValidationError,
62 Formatter,
63 FormatterFactory,
64 Location,
65 LocationFactory,
66 StorageClass,
67 StoredFileInfo,
68)
70from lsst.daf.butler import ddl
71from lsst.daf.butler.registry.interfaces import (
72 ReadOnlyDatabaseError,
73 DatastoreRegistryBridge,
74)
76from lsst.daf.butler.core.repoRelocation import replaceRoot
77from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
78from .genericDatastore import GenericBaseDatastore
80if TYPE_CHECKING: 80 ↛ 81 (condition never true)
81 from lsst.daf.butler import LookupKey
82 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
84log = logging.getLogger(__name__)
86# String to use when a Python None is encountered
87NULLSTR = "__NULL_STRING__"
90class _IngestPrepData(Datastore.IngestPrepData):
91 """Helper class for FileLikeDatastore ingest implementation.
93 Parameters
94 ----------
95 datasets : `list` of `FileDataset`
96 Files to be ingested by this datastore.
97 """
98 def __init__(self, datasets: List[FileDataset]):
99 super().__init__(ref for dataset in datasets for ref in dataset.refs)
100 self.datasets = datasets
103@dataclass(frozen=True)
104class DatastoreFileGetInformation:
105 """Collection of useful parameters needed to retrieve a file from
106 a Datastore.
107 """
109 location: Location
110 """The location from which to read the dataset."""
112 formatter: Formatter
113 """The `Formatter` to use to deserialize the dataset."""
115 info: StoredFileInfo
116 """Stored information about this file and its formatter."""
118 assemblerParams: Dict[str, Any]
119 """Parameters to use for post-processing the retrieved dataset."""
121 formatterParams: Dict[str, Any]
122 """Parameters that were understood by the associated formatter."""
124 component: Optional[str]
125 """The component to be retrieved (can be `None`)."""
127 readStorageClass: StorageClass
128 """The `StorageClass` of the dataset being read."""
131class FileLikeDatastore(GenericBaseDatastore):
132 """Generic Datastore for file-based implementations.
134 Should always be sub-classed since key abstract methods are missing.
136 Parameters
137 ----------
138 config : `DatastoreConfig` or `str`
139 Configuration as either a `Config` object or URI to file.
140 bridgeManager : `DatastoreRegistryBridgeManager`
141 Object that manages the interface between `Registry` and datastores.
142 butlerRoot : `str`, optional
143 New datastore root to use to override the configuration value.
145 Raises
146 ------
147 ValueError
148 If root location does not exist and ``create`` is `False` in the
149 configuration.
150 """
152 defaultConfigFile: ClassVar[Optional[str]] = None
153 """Path to configuration defaults. Accessed within the ``config`` resource
154 or relative to a search path. Can be None if no defaults specified.
155 """
157 root: str
158 """Root directory or URI of this `Datastore`."""
160 locationFactory: LocationFactory
161 """Factory for creating locations relative to the datastore root."""
163 formatterFactory: FormatterFactory
164 """Factory for creating instances of formatters."""
166 templates: FileTemplates
167 """File templates that can be used by this `Datastore`."""
169 composites: CompositesMap
170 """Determines whether a dataset should be disassembled on put."""
172 @classmethod
173 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
174 """Set any filesystem-dependent config options for this Datastore to
175 be appropriate for a new empty repository with the given root.
177 Parameters
178 ----------
179 root : `str`
180 URI to the root of the data repository.
181 config : `Config`
182 A `Config` to update. Only the subset understood by
183 this component will be updated. Will not expand
184 defaults.
185 full : `Config`
186 A complete config with all defaults expanded that can be
187 converted to a `DatastoreConfig`. Read-only and will not be
188 modified by this method.
189 Repository-specific options that should not be obtained
190 from defaults when Butler instances are constructed
191 should be copied from ``full`` to ``config``.
192 overwrite : `bool`, optional
193 If `False`, do not modify a value in ``config`` if the value
194 already exists. Default is always to overwrite with the provided
195 ``root``.
197 Notes
198 -----
199 If a keyword is explicitly defined in the supplied ``config`` it
200 will not be overridden by this method if ``overwrite`` is `False`.
201 This allows explicit values set in external configs to be retained.
202 """
203 Config.updateParameters(DatastoreConfig, config, full,
204 toUpdate={"root": root},
205 toCopy=("cls", ("records", "table")), overwrite=overwrite)
207 @classmethod
208 def makeTableSpec(cls) -> ddl.TableSpec:
209 return ddl.TableSpec(
210 fields=[
211 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
212 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
213 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
214 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
215 # Use empty string to indicate no component
216 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
217 # TODO: should checksum be Base64Bytes instead?
218 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
219 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
220 ],
221 unique=frozenset(),
222 )
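# A minimal sketch of a record satisfying the spec above, as assembled later by
# addStoredItemInfo(); every value here is a made-up placeholder, and component
# uses the NULLSTR sentinel when the dataset has no component.
example_record = dict(dataset_id=101,
                      path="datasetType/run/example_101.json",
                      formatter="mypkg.formatters.ExampleFormatter",
                      storage_class="StructuredDataDict",
                      component=NULLSTR,
                      checksum=None,
                      file_size=8640)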
224 def __init__(self, config: Union[DatastoreConfig, str],
225 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
226 super().__init__(config, bridgeManager)
227 if "root" not in self.config: 227 ↛ 228 (condition never true)
228 raise ValueError("No root directory specified in configuration")
230 # Name ourselves either using an explicit name or a name
231 # derived from the (unexpanded) root
232 if "name" in self.config:
233 self.name = self.config["name"]
234 else:
235 # We use the unexpanded root in the name to indicate that this
236 # datastore can be moved without having to update registry.
237 self.name = "{}@{}".format(type(self).__name__,
238 self.config["root"])
240 # Support repository relocation in config
241 # Existence of self.root is checked in subclass
242 self.root = replaceRoot(self.config["root"], butlerRoot)
244 self.locationFactory = LocationFactory(self.root)
245 self.formatterFactory = FormatterFactory()
247 # Now associate formatters with storage classes
248 self.formatterFactory.registerFormatters(self.config["formatters"],
249 universe=bridgeManager.universe)
251 # Read the file naming templates
252 self.templates = FileTemplates(self.config["templates"],
253 universe=bridgeManager.universe)
255 # See if composites should be disassembled
256 self.composites = CompositesMap(self.config["composites"],
257 universe=bridgeManager.universe)
259 tableName = self.config["records", "table"]
260 try:
261 # Storage of paths and formatters, keyed by dataset_id
262 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
263 # Interface to Registry.
264 self._bridge = bridgeManager.register(self.name)
265 except ReadOnlyDatabaseError:
266 # If the database is read only and we just tried and failed to
267 # create a table, it means someone is trying to create a read-only
268 # butler client for an empty repo. That should be okay, as long
269 # as they then try to get any datasets before some other client
270 # creates the table. Chances are they're just validating
271 # configuration.
272 pass
274 # Determine whether checksums should be used
275 self.useChecksum = self.config.get("checksum", True)
277 def __str__(self) -> str:
278 return self.root
280 @property
281 def bridge(self) -> DatastoreRegistryBridge:
282 return self._bridge
284 @abstractmethod
285 def _artifact_exists(self, location: Location) -> bool:
286 """Check that an artifact exists in this datastore at the specified
287 location.
289 Parameters
290 ----------
291 location : `Location`
292 Expected location of the artifact associated with this datastore.
294 Returns
295 -------
296 exists : `bool`
297 True if the location can be found, false otherwise.
298 """
299 raise NotImplementedError()
301 @abstractmethod
302 def _delete_artifact(self, location: Location) -> None:
303 """Delete the artifact from the datastore.
305 Parameters
306 ----------
307 location : `Location`
308 Location of the artifact associated with this datastore.
309 """
310 raise NotImplementedError()
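# A minimal sketch of how a filesystem-backed subclass might supply the two
# artifact primitives declared above, assuming Location.path resolves to a
# local absolute path; the mixin name is hypothetical.
import os

class _LocalArtifactMixin:
    def _artifact_exists(self, location: Location) -> bool:
        # An artifact exists if its backing file is present on disk.
        return os.path.exists(location.path)

    def _delete_artifact(self, location: Location) -> None:
        # Remove the backing file; a missing file surfaces as FileNotFoundError.
        os.remove(location.path)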
312 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
313 # Docstring inherited from GenericBaseDatastore
314 records = []
315 for ref, info in zip(refs, infos):
316 # Component should come from ref and fall back on info
317 component = ref.datasetType.component()
318 if component is None and info.component is not None: 318 ↛ 319 (condition never true)
319 component = info.component
320 if component is None:
321 # Use empty string since we want this to be part of the
322 # primary key.
323 component = NULLSTR
324 records.append(
325 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
326 storage_class=info.storageClass.name, component=component,
327 checksum=info.checksum, file_size=info.file_size)
328 )
329 self._table.insert(*records)
331 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
332 # Docstring inherited from GenericBaseDatastore
334 # Look for the dataset_id -- there might be multiple matches
335 # if we have disassembled the dataset.
336 records = list(self._table.fetch(dataset_id=ref.id))
338 results = []
339 for record in records:
340 # Convert name of StorageClass to instance
341 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
342 component = record["component"] if (record["component"]
343 and record["component"] != NULLSTR) else None
345 info = StoredFileInfo(formatter=record["formatter"],
346 path=record["path"],
347 storageClass=storageClass,
348 component=component,
349 checksum=record["checksum"],
350 file_size=record["file_size"])
351 results.append(info)
353 return results
355 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]:
356 """Return all dataset refs associated with the supplied path.
358 Parameters
359 ----------
360 pathInStore : `str`
361 Path of interest in the data store.
363 Returns
364 -------
365 ids : `set` of `int`
366 All `DatasetRef` IDs associated with this path.
367 """
368 records = list(self._table.fetch(path=pathInStore))
369 ids = {r["dataset_id"] for r in records}
370 return ids
372 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
373 # Docstring inherited from GenericBaseDatastore
374 self._table.delete(dataset_id=ref.id)
376 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
377 r"""Find all the `Location`\ s of the requested dataset in the
378 `Datastore` and the associated stored file information.
380 Parameters
381 ----------
382 ref : `DatasetRef`
383 Reference to the required `Dataset`.
385 Returns
386 -------
387 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
388 Location of the dataset within the datastore and
389 stored information about each file and its formatter.
390 """
391 # Get the file information (this will fail if no file)
392 records = self.getStoredItemsInfo(ref)
394 # Use the path to determine the location
395 return [(self.locationFactory.fromPath(r.path), r) for r in records]
397 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
398 """Check that there is only one dataset associated with the
399 specified artifact.
401 Parameters
402 ----------
403 ref : `DatasetRef` or `FakeDatasetRef`
404 Dataset to be removed.
405 location : `Location`
406 The location of the artifact to be removed.
408 Returns
409 -------
410 can_remove : `bool`
411 True if the artifact can be safely removed.
412 """
414 # Get all entries associated with this path
415 allRefs = self._registered_refs_per_artifact(location.pathInStore)
416 if not allRefs: 416 ↛ 417 (condition never true)
417 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
419 # Remove these refs from all the refs and if there is nothing left
420 # then we can delete
421 remainingRefs = allRefs - {ref.id}
423 if remainingRefs:
424 return False
425 return True
427 def _prepare_for_get(self, ref: DatasetRef,
428 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
429 """Check parameters for ``get`` and obtain formatter and
430 location.
432 Parameters
433 ----------
434 ref : `DatasetRef`
435 Reference to the required Dataset.
436 parameters : `dict`
437 `StorageClass`-specific parameters that specify, for example,
438 a slice of the dataset to be loaded.
440 Returns
441 -------
442 getInfo : `list` [`DatastoreFileGetInformation`]
443 Parameters needed to retrieve each file.
444 """
445 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
447 # Get file metadata and internal metadata
448 fileLocations = self._get_dataset_locations_info(ref)
449 if not fileLocations:
450 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
452 # The storage class we want to use eventually
453 refStorageClass = ref.datasetType.storageClass
455 if len(fileLocations) > 1:
456 disassembled = True
457 else:
458 disassembled = False
460 # Is this a component request?
461 refComponent = ref.datasetType.component()
463 fileGetInfo = []
464 for location, storedFileInfo in fileLocations:
466 # The storage class used to write the file
467 writeStorageClass = storedFileInfo.storageClass
469 # If this has been disassembled we need read to match the write
470 if disassembled:
471 readStorageClass = writeStorageClass
472 else:
473 readStorageClass = refStorageClass
475 formatter = getInstanceOf(storedFileInfo.formatter,
476 FileDescriptor(location, readStorageClass=readStorageClass,
477 storageClass=writeStorageClass, parameters=parameters),
478 ref.dataId)
480 formatterParams, notFormatterParams = formatter.segregateParameters()
482 # Of the remaining parameters, extract the ones supported by
483 # this StorageClass (for components not all will be handled)
484 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
486 # The ref itself could be a component if the dataset was
487 # disassembled by butler, or we disassembled in datastore and
488 # components came from the datastore records
489 component = storedFileInfo.component if storedFileInfo.component else refComponent
491 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
492 assemblerParams, formatterParams,
493 component, readStorageClass))
495 return fileGetInfo
497 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
498 """Check the arguments for ``put`` and obtain formatter and
499 location.
501 Parameters
502 ----------
503 inMemoryDataset : `object`
504 The dataset to store.
505 ref : `DatasetRef`
506 Reference to the associated Dataset.
508 Returns
509 -------
510 location : `Location`
511 The location to write the dataset.
512 formatter : `Formatter`
513 The `Formatter` to use to write the dataset.
515 Raises
516 ------
517 TypeError
518 Supplied object and storage class are inconsistent.
519 DatasetTypeNotSupportedError
520 The associated `DatasetType` is not handled by this datastore.
521 """
522 self._validate_put_parameters(inMemoryDataset, ref)
524 # Work out output file name
525 try:
526 template = self.templates.getTemplate(ref)
527 except KeyError as e:
528 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
530 # Validate the template to protect against filenames from different
531 # dataIds returning the same filename and causing overwrite confusion.
532 template.validateTemplate(ref)
534 location = self.locationFactory.fromPath(template.format(ref))
536 # Get the formatter based on the storage class
537 storageClass = ref.datasetType.storageClass
538 try:
539 formatter = self.formatterFactory.getFormatter(ref,
540 FileDescriptor(location,
541 storageClass=storageClass),
542 ref.dataId)
543 except KeyError as e:
544 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
545 f"{self.name}") from e
547 # Now that we know the formatter, update the location
548 location = formatter.makeUpdatedLocation(location)
550 return location, formatter
552 @abstractmethod
553 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
554 """Standardize the path of a to-be-ingested file.
556 Parameters
557 ----------
558 path : `str`
559 Path of a file to be ingested.
560 transfer : `str`, optional
561 How (and whether) the dataset should be added to the datastore.
562 See `ingest` for details of transfer modes.
563 This implementation is provided only so
564 `NotImplementedError` can be raised if the mode is not supported;
565 actual transfers are deferred to `_extractIngestInfo`.
567 Returns
568 -------
569 path : `str`
570 New path in what the datastore considers standard form.
572 Notes
573 -----
574 Subclasses of `FileLikeDatastore` should implement this method instead
575 of `_prepIngest`. It should not modify the data repository or given
576 file in any way.
578 Raises
579 ------
580 NotImplementedError
581 Raised if the datastore does not support the given transfer mode
582 (including the case where ingest is not supported at all).
583 FileNotFoundError
584 Raised if one of the given files does not exist.
585 """
586 raise NotImplementedError("Must be implemented by subclasses.")
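# A minimal sketch of _standardizeIngestPath for a local-filesystem subclass
# that, purely for illustration, accepts in-place ingest plus "copy" and "move"
# transfers; the supported-mode policy is an assumption, not the project's.
import os

def _standardize_ingest_path_sketch(self, path: str, *, transfer: Optional[str] = None) -> str:
    if transfer not in (None, "copy", "move"):
        raise NotImplementedError(f"Transfer mode {transfer} is not supported.")
    if not os.path.exists(path):
        raise FileNotFoundError(f"File at {path} does not exist.")
    # Return the path in the form this datastore treats as standard.
    return os.path.normpath(path)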
588 @abstractmethod
589 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
590 formatter: Union[Formatter, Type[Formatter]],
591 transfer: Optional[str] = None) -> StoredFileInfo:
592 """Relocate (if necessary) and extract `StoredFileInfo` from a
593 to-be-ingested file.
595 Parameters
596 ----------
597 path : `str` or `ButlerURI`
598 URI or path of a file to be ingested.
599 ref : `DatasetRef`
600 Reference for the dataset being ingested. Guaranteed to have
601 ``dataset_id`` not `None`.
602 formatter : `type` or `Formatter`
603 `Formatter` subclass to use for this dataset or an instance.
604 transfer : `str`, optional
605 How (and whether) the dataset should be added to the datastore.
606 See `ingest` for details of transfer modes.
608 Returns
609 -------
610 info : `StoredFileInfo`
611 Internal datastore record for this file. This will be inserted by
612 the caller; `_extractIngestInfo` is only responsible for
613 creating and populating the struct.
615 Raises
616 ------
617 FileNotFoundError
618 Raised if one of the given files does not exist.
619 FileExistsError
620 Raised if transfer is not `None` but the (internal) location the
621 file would be moved to is already occupied.
622 """
623 raise NotImplementedError("Must be implemented by subclasses.")
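# A minimal sketch of the struct _extractIngestInfo is expected to return for an
# in-place (transfer=None) ingest; relocation, checksum and size handling are
# elided, and passing the formatter class directly to StoredFileInfo is assumed
# to be acceptable (a fully-qualified name would also work).
def _extract_ingest_info_sketch(self, path: str, ref: DatasetRef, *,
                                formatter: Union[Formatter, Type[Formatter]],
                                transfer: Optional[str] = None) -> StoredFileInfo:
    location = self.locationFactory.fromPath(path)
    return StoredFileInfo(formatter=formatter,
                          path=location.pathInStore,
                          storageClass=ref.datasetType.storageClass,
                          component=ref.datasetType.component(),
                          checksum=None,
                          file_size=-1)  # unknown size placeholder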
625 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
626 # Docstring inherited from Datastore._prepIngest.
627 filtered = []
628 for dataset in datasets:
629 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
630 if not acceptable:
631 continue
632 else:
633 dataset.refs = acceptable
634 if dataset.formatter is None:
635 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
636 else:
637 assert isinstance(dataset.formatter, (type, str))
638 dataset.formatter = getClassOf(dataset.formatter)
639 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
640 filtered.append(dataset)
641 return _IngestPrepData(filtered)
643 @transactional
644 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
645 # Docstring inherited from Datastore._finishIngest.
646 refsAndInfos = []
647 for dataset in prepData.datasets:
648 # Do ingest as if the first dataset ref is associated with the file
649 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
650 transfer=transfer)
651 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
652 self._register_datasets(refsAndInfos)
654 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
655 formatter: Union[Formatter, Type[Formatter]]) -> Location:
656 """Given a source URI and a DatasetRef, determine the name the
657 dataset will have inside the datastore.
659 Parameters
660 ----------
661 srcUri : `ButlerURI`
662 URI to the source dataset file.
663 ref : `DatasetRef`
664 Ref associated with the newly-ingested dataset artifact. This
665 is used to determine the name within the datastore.
666 formatter : `Formatter` or `Formatter` class.
667 Formatter to use for validation. Can be a class or an instance.
669 Returns
670 -------
671 location : `Location`
672 Target location for the newly-ingested dataset.
673 """
674 # Ingesting a file from outside the datastore.
675 # This involves a new name.
676 template = self.templates.getTemplate(ref)
677 location = self.locationFactory.fromPath(template.format(ref))
679 # Get the extension
680 ext = srcUri.getExtension()
682 # Update the destination to include that extension
683 location.updateExtension(ext)
685 # Ask the formatter to validate this extension
686 formatter.validateExtension(location)
688 return location
690 @abstractmethod
691 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
692 """Write an in-memory dataset out to the datastore.
694 Parameters
695 ----------
696 inMemoryDataset : `object`
697 Dataset to write to datastore.
698 ref : `DatasetRef`
699 Registry information associated with this dataset.
701 Returns
702 -------
703 info : `StoredFileInfo`
704 Information describing the artifact written to the datastore.
705 """
706 raise NotImplementedError()
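# A rough outline only: a concrete datastore typically pairs _prepare_for_put()
# with its serialization machinery. Formatter.write() and Formatter.name() are
# assumed entry points, and size/checksum bookkeeping is elided.
def _write_in_memory_to_artifact_sketch(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
    location, formatter = self._prepare_for_put(inMemoryDataset, ref)
    formatter.write(inMemoryDataset)  # assumed serializer entry point
    return StoredFileInfo(formatter=formatter.name(),
                          path=location.pathInStore,
                          storageClass=ref.datasetType.storageClass,
                          component=ref.datasetType.component(),
                          checksum=None,
                          file_size=-1)  # unknown size placeholder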
708 @abstractmethod
709 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
710 ref: DatasetRef, isComponent: bool = False) -> Any:
711 """Read an artifact from the datastore into an in-memory object.
713 Parameters
714 ----------
715 getInfo : `DatastoreFileGetInformation`
716 Information about the artifact within the datastore.
717 ref : `DatasetRef`
718 The registry information associated with this artifact.
719 isComponent : `bool`
720 Flag to indicate if a component is being read from this artifact.
722 Returns
723 -------
724 inMemoryDataset : `object`
725 The artifact as a python object.
726 """
727 raise NotImplementedError()
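# A rough outline only: reads are typically delegated to the stored formatter.
# Formatter.read(component=...) and the GenericBaseDatastore helper
# _post_process_get() are assumed entry points here.
def _read_artifact_into_memory_sketch(self, getInfo: DatastoreFileGetInformation,
                                      ref: DatasetRef, isComponent: bool = False) -> Any:
    result = getInfo.formatter.read(component=getInfo.component if isComponent else None)
    return self._post_process_get(result, getInfo.readStorageClass,
                                  getInfo.assemblerParams, isComponent=isComponent)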
729 def exists(self, ref: DatasetRef) -> bool:
730 """Check if the dataset exists in the datastore.
732 Parameters
733 ----------
734 ref : `DatasetRef`
735 Reference to the required dataset.
737 Returns
738 -------
739 exists : `bool`
740 `True` if the entity exists in the `Datastore`.
741 """
742 fileLocations = self._get_dataset_locations_info(ref)
743 if not fileLocations:
744 return False
745 for location, _ in fileLocations:
746 if not self._artifact_exists(location):
747 return False
749 return True
751 def getURIs(self, ref: DatasetRef,
752 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
753 """Return URIs associated with dataset.
755 Parameters
756 ----------
757 ref : `DatasetRef`
758 Reference to the required dataset.
759 predict : `bool`, optional
760 If the datastore does not know about the dataset, should it
761 return a predicted URI or not?
763 Returns
764 -------
765 primary : `ButlerURI`
766 The URI to the primary artifact associated with this dataset.
767 If the dataset was disassembled within the datastore this
768 may be `None`.
769 components : `dict`
770 URIs to any components associated with the dataset artifact.
771 Can be empty if there are no components.
772 """
774 primary: Optional[ButlerURI] = None
775 components: Dict[str, ButlerURI] = {}
777 # if this has never been written then we have to guess
778 if not self.exists(ref):
779 if not predict:
780 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
782 def predictLocation(thisRef: DatasetRef) -> Location:
783 template = self.templates.getTemplate(thisRef)
784 location = self.locationFactory.fromPath(template.format(thisRef))
785 storageClass = ref.datasetType.storageClass
786 formatter = self.formatterFactory.getFormatter(thisRef,
787 FileDescriptor(location,
788 storageClass=storageClass))
789 # Try to use the extension attribute but ignore problems if the
790 # formatter does not define one.
791 try:
792 location = formatter.makeUpdatedLocation(location)
793 except Exception:
794 # Use the default extension
795 pass
796 return location
798 doDisassembly = self.composites.shouldBeDisassembled(ref)
800 if doDisassembly:
802 for component, componentStorage in ref.datasetType.storageClass.components.items():
803 compRef = ref.makeComponentRef(component)
804 compLocation = predictLocation(compRef)
806 # Add a URI fragment to indicate this is a guess
807 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
809 else:
811 location = predictLocation(ref)
813 # Add a URI fragment to indicate this is a guess
814 primary = ButlerURI(location.uri.geturl() + "#predicted")
816 return primary, components
818 # If this is a ref that we have written we can get the path.
819 # Get file metadata and internal metadata
820 fileLocations = self._get_dataset_locations_info(ref)
822 if not fileLocations: 822 ↛ 823 (condition never true)
823 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
825 if len(fileLocations) == 1:
826 # No disassembly so this is the primary URI
827 primary = ButlerURI(fileLocations[0][0].uri)
829 else:
830 for location, storedFileInfo in fileLocations:
831 if storedFileInfo.component is None: 831 ↛ 832 (condition never true)
832 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
833 components[storedFileInfo.component] = ButlerURI(location.uri)
835 return primary, components
837 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
838 """URI to the Dataset.
840 Parameters
841 ----------
842 ref : `DatasetRef`
843 Reference to the required Dataset.
844 predict : `bool`
845 If `True`, allow URIs to be returned of datasets that have not
846 been written.
848 Returns
849 -------
850 uri : `ButlerURI`
851 URI pointing to the dataset within the datastore. If the
852 dataset does not exist in the datastore, and if ``predict`` is
853 `True`, the URI will be a prediction and will include a URI
854 fragment "#predicted".
855 If the datastore does not have entities that relate well
856 to the concept of a URI the returned URI will be
857 descriptive. The returned URI is not guaranteed to be obtainable.
859 Raises
860 ------
861 FileNotFoundError
862 Raised if a URI has been requested for a dataset that does not
863 exist and guessing is not allowed.
864 RuntimeError
865 Raised if a request is made for a single URI but multiple URIs
866 are associated with this dataset.
868 Notes
869 -----
870 When a predicted URI is requested an attempt will be made to form
871 a reasonable URI based on file templates and the expected formatter.
872 """
873 primary, components = self.getURIs(ref, predict)
874 if primary is None or components: 874 ↛ 875 (condition never true)
875 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
876 "Use Datastore.getURIs() instead.")
877 return primary
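# A small usage sketch (the datastore and ref objects are assumed to exist):
uri = datastore.getURI(ref)                    # raises FileNotFoundError if the dataset is absent
guessed = datastore.getURI(ref, predict=True)  # carries a "#predicted" fragment when guessed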
879 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
880 """Load an InMemoryDataset from the store.
882 Parameters
883 ----------
884 ref : `DatasetRef`
885 Reference to the required Dataset.
886 parameters : `dict`
887 `StorageClass`-specific parameters that specify, for example,
888 a slice of the dataset to be loaded.
890 Returns
891 -------
892 inMemoryDataset : `object`
893 Requested dataset or slice thereof as an InMemoryDataset.
895 Raises
896 ------
897 FileNotFoundError
898 Requested dataset can not be retrieved.
899 TypeError
900 Return value from formatter has unexpected type.
901 ValueError
902 Formatter failed to process the dataset.
903 """
904 allGetInfo = self._prepare_for_get(ref, parameters)
905 refComponent = ref.datasetType.component()
907 # Supplied storage class for the component being read
908 refStorageClass = ref.datasetType.storageClass
910 # Create mapping from component name to related info
911 allComponents = {i.component: i for i in allGetInfo}
913 # By definition the dataset is disassembled if we have more
914 # than one record for it.
915 isDisassembled = len(allGetInfo) > 1
917 # Look for the special case where we are disassembled but the
918 # component is a read-only component that was not written during
919 # disassembly. For this scenario we need to check that the
920 # component requested is listed as a read-only component for the
921 # composite storage class
922 isDisassembledReadOnlyComponent = False
923 if isDisassembled and refComponent:
924 # The composite storage class should be accessible through
925 # the component dataset type
926 compositeStorageClass = ref.datasetType.parentStorageClass
928 # In the unlikely scenario where the composite storage
929 # class is not known, we can only assume that this is a
930 # normal component. If that assumption is wrong then the
931 # branch below that reads a persisted component will fail
932 # so there is no need to complain here.
933 if compositeStorageClass is not None: 933 ↛ 936 (condition never false)
934 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.readComponents
936 if isDisassembled and not refComponent:
937 # This was a disassembled dataset spread over multiple files
938 # and we need to put them all back together again.
939 # Read into memory and then assemble
941 # Check that the supplied parameters are suitable for the type read
942 refStorageClass.validateParameters(parameters)
944 # We want to keep track of all the parameters that were not used
945 # by formatters. We assume that if any of the component formatters
946 # use a parameter then we do not need to apply it again in the
947 # assembler.
948 usedParams = set()
950 components: Dict[str, Any] = {}
951 for getInfo in allGetInfo:
952 # assemblerParams are parameters not understood by the
953 # associated formatter.
954 usedParams.update(set(getInfo.formatterParams))
956 component = getInfo.component
958 if component is None: 958 ↛ 959 (condition never true)
959 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
961 # We do not want the formatter to think it's reading
962 # a component though because it is really reading a
963 # standalone dataset -- always tell reader it is not a
964 # component.
965 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
967 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
969 # Any unused parameters will have to be passed to the assembler
970 if parameters:
971 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
972 else:
973 unusedParams = {}
975 # Process parameters
976 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
977 parameters=unusedParams)
979 elif isDisassembledReadOnlyComponent:
981 compositeStorageClass = ref.datasetType.parentStorageClass
982 if compositeStorageClass is None: 982 ↛ 983 (condition never true)
983 raise RuntimeError(f"Unable to retrieve read-only component '{refComponent}' since "
984 "no composite storage class is available.")
986 if refComponent is None: 986 ↛ 988 (condition never true)
987 # Mainly for mypy
988 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
990 # Assume that every read-only component can be calculated by
991 # forwarding the request to a single read/write component.
992 # Rather than guessing which rw component is the right one by
993 # scanning each for a read-only component of the same name,
994 # we ask the composite assembler directly which one is best to
995 # use.
996 compositeAssembler = compositeStorageClass.assembler()
997 forwardedComponent = compositeAssembler.selectResponsibleComponent(refComponent,
998 set(allComponents))
1000 # Select the relevant component
1001 rwInfo = allComponents[forwardedComponent]
1003 # For now assume that read parameters are validated against
1004 # the real component and not the requested component
1005 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1006 forwardedStorageClass.validateParameters(parameters)
1008 # Unfortunately the FileDescriptor inside the formatter will have
1009 # the wrong write storage class so we need to create a new one
1010 # given the immutability constraint.
1011 writeStorageClass = rwInfo.info.storageClass
1013 # We may need to put some thought into parameters for read
1014 # components but for now forward them on as is
1015 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1016 readStorageClass=refStorageClass,
1017 storageClass=writeStorageClass,
1018 parameters=parameters),
1019 ref.dataId)
1021 # The assembler can not receive any parameter requests for a
1022 # read-only component at this time since the assembler will
1023 # see the storage class of the read-only component and those
1024 # parameters will have to be handled by the formatter on the
1025 # forwarded storage class.
1026 assemblerParams: Dict[str, Any] = {}
1028 # Need to create a new info that specifies the read-only
1029 # component and associated storage class
1030 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1031 rwInfo.info, assemblerParams, {},
1032 refComponent, refStorageClass)
1034 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1036 else:
1037 # Single file request or component from that composite file
1038 for lookup in (refComponent, None): 1038 ↛ 1043 (loop never completed)
1039 if lookup in allComponents: 1039 ↛ 1038 (condition never false)
1040 getInfo = allComponents[lookup]
1041 break
1042 else:
1043 raise FileNotFoundError(f"Component {refComponent} not found "
1044 f"for ref {ref} in datastore {self.name}")
1046 # Do not need the component itself if already disassembled
1047 if isDisassembled:
1048 isComponent = False
1049 else:
1050 isComponent = getInfo.component is not None
1052 # For a disassembled component we can validate parameters against
1053 # the component storage class directly
1054 if isDisassembled:
1055 refStorageClass.validateParameters(parameters)
1056 else:
1057 # For an assembled composite this could be a read-only
1058 # component derived from a real component. The validity
1059 # of the parameters is not clear. For now validate against
1060 # the composite storage class
1061 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1063 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
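# Illustrative call pattern (names are assumptions): requesting a read-only
# component of a disassembled composite, e.g. the WCS of an Exposure, exercises
# the forwarding branch above rather than reading a dedicated component file.
wcs = datastore.get(ref.makeComponentRef("wcs"))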
1065 @transactional
1066 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1067 """Write an InMemoryDataset with a given `DatasetRef` to the store.
1069 Parameters
1070 ----------
1071 inMemoryDataset : `object`
1072 The dataset to store.
1073 ref : `DatasetRef`
1074 Reference to the associated Dataset.
1076 Raises
1077 ------
1078 TypeError
1079 Supplied object and storage class are inconsistent.
1080 DatasetTypeNotSupportedError
1081 The associated `DatasetType` is not handled by this datastore.
1083 Notes
1084 -----
1085 If the datastore is configured to reject certain dataset types it
1086 is possible that the put will fail and raise a
1087 `DatasetTypeNotSupportedError`. The main use case for this is to
1088 allow `ChainedDatastore` to put to multiple datastores without
1089 requiring that every datastore accepts the dataset.
1090 """
1092 doDisassembly = self.composites.shouldBeDisassembled(ref)
1093 # doDisassembly = True
1095 artifacts = []
1096 if doDisassembly:
1097 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
1098 for component, componentInfo in components.items():
1099 # Don't recurse because we want to take advantage of
1100 # bulk insert -- need a new DatasetRef that refers to the
1101 # same dataset_id but has the component DatasetType
1102 # DatasetType does not refer to the types of components
1103 # So we construct one ourselves.
1104 compRef = ref.makeComponentRef(component)
1105 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1106 artifacts.append((compRef, storedInfo))
1107 else:
1108 # Write the entire thing out
1109 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1110 artifacts.append((ref, storedInfo))
1112 self._register_datasets(artifacts)
1114 @transactional
1115 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1116 """Indicate to the datastore that a dataset can be removed.
1118 Parameters
1119 ----------
1120 ref : `DatasetRef`
1121 Reference to the required Dataset.
1122 ignore_errors : `bool`
1123 If `True` return without error even if something went wrong.
1124 Problems could occur if another process is simultaneously trying
1125 to delete.
1127 Raises
1128 ------
1129 FileNotFoundError
1130 Attempt to remove a dataset that does not exist.
1131 """
1132 # Get file metadata and internal metadata
1133 log.debug("Trashing %s in datastore %s", ref, self.name)
1135 fileLocations = self._get_dataset_locations_info(ref)
1137 if not fileLocations:
1138 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1139 if ignore_errors:
1140 log.warning(err_msg)
1141 return
1142 else:
1143 raise FileNotFoundError(err_msg)
1145 for location, storedFileInfo in fileLocations:
1146 if not self._artifact_exists(location): 1146 ↛ 1147 (condition never true)
1147 err_msg = f"Dataset is known to datastore {self.name} but " \
1148 f"associated artifact ({location.uri}) is missing"
1149 if ignore_errors:
1150 log.warning(err_msg)
1151 return
1152 else:
1153 raise FileNotFoundError(err_msg)
1155 # Mark dataset as trashed
1156 try:
1157 self._move_to_trash_in_registry(ref)
1158 except Exception as e:
1159 if ignore_errors:
1160 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1161 f"but encountered an error: {e}")
1162 pass
1163 else:
1164 raise
1166 @transactional
1167 def emptyTrash(self, ignore_errors: bool = True) -> None:
1168 """Remove all datasets from the trash.
1170 Parameters
1171 ----------
1172 ignore_errors : `bool`
1173 If `True` return without error even if something went wrong.
1174 Problems could occur if another process is simultaneously trying
1175 to delete.
1176 """
1177 log.debug("Emptying trash in datastore %s", self.name)
1178 # Context manager will empty trash iff we finish it without raising.
1179 with self.bridge.emptyTrash() as trashed:
1180 for ref in trashed:
1181 fileLocations = self._get_dataset_locations_info(ref)
1183 if not fileLocations: 1183 ↛ 1184 (condition never true)
1184 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1185 if ignore_errors:
1186 log.warning(err_msg)
1187 continue
1188 else:
1189 raise FileNotFoundError(err_msg)
1191 for location, _ in fileLocations:
1193 if not self._artifact_exists(location): 1193 ↛ 1194 (condition never true)
1194 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1195 if ignore_errors:
1196 log.warning(err_msg)
1197 continue
1198 else:
1199 raise FileNotFoundError(err_msg)
1201 # Can only delete the artifact if there are no references
1202 # to the file from untrashed dataset refs.
1203 if self._can_remove_dataset_artifact(ref, location):
1204 # Point of no return for this artifact
1205 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1206 try:
1207 self._delete_artifact(location)
1208 except Exception as e:
1209 if ignore_errors:
1210 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1211 location.uri, self.name, e)
1212 else:
1213 raise
1215 # Now must remove the entry from the internal registry even if
1216 # the artifact removal failed and was ignored,
1217 # otherwise the removal check above will never be true
1218 try:
1219 # There may be multiple rows associated with this ref
1220 # depending on disassembly
1221 self.removeStoredItemInfo(ref)
1222 except Exception as e:
1223 if ignore_errors:
1224 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1225 ref.id, location.uri, self.name, e)
1226 continue
1227 else:
1228 raise
1230 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1231 logFailures: bool = False) -> None:
1232 """Validate some of the configuration for this datastore.
1234 Parameters
1235 ----------
1236 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1237 Entities to test against this configuration. Can be differing
1238 types.
1239 logFailures : `bool`, optional
1240 If `True`, output a log message for every validation error
1241 detected.
1243 Raises
1244 ------
1245 DatastoreValidationError
1246 Raised if there is a validation problem with a configuration.
1247 All the problems are reported in a single exception.
1249 Notes
1250 -----
1251 This method checks that all the supplied entities have valid file
1252 templates and also have formatters defined.
1253 """
1255 templateFailed = None
1256 try:
1257 self.templates.validateTemplates(entities, logFailures=logFailures)
1258 except FileTemplateValidationError as e:
1259 templateFailed = str(e)
1261 formatterFailed = []
1262 for entity in entities:
1263 try:
1264 self.formatterFactory.getFormatterClass(entity)
1265 except KeyError as e:
1266 formatterFailed.append(str(e))
1267 if logFailures: 1267 ↛ 1262 (condition never false)
1268 log.fatal("Formatter failure: %s", e)
1270 if templateFailed or formatterFailed:
1271 messages = []
1272 if templateFailed: 1272 ↛ 1273 (condition never true)
1273 messages.append(templateFailed)
1274 if formatterFailed: 1274 ↛ 1276 (condition never false)
1275 messages.append(",".join(formatterFailed))
1276 msg = ";\n".join(messages)
1277 raise DatastoreValidationError(msg)
1279 def getLookupKeys(self) -> Set[LookupKey]:
1280 # Docstring is inherited from base class
1281 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1282 self.constraints.getLookupKeys()
1284 def validateKey(self, lookupKey: LookupKey,
1285 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1286 # Docstring is inherited from base class
1287 # The key can be valid in either formatters or templates so we can
1288 # only check the template if it exists
1289 if lookupKey in self.templates:
1290 try:
1291 self.templates[lookupKey].validateTemplate(entity)
1292 except FileTemplateValidationError as e:
1293 raise DatastoreValidationError(e) from e
1295 def export(self, refs: Iterable[DatasetRef], *,
1296 directory: Optional[Union[ButlerURI, str]] = None,
1297 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1298 # Docstring inherited from Datastore.export.
1299 if transfer is not None and directory is None: 1299 ↛ 1300 (condition never true)
1300 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1301 "export directory given")
1303 # Force the directory to be a URI object
1304 directoryUri: Optional[ButlerURI] = None
1305 if directory is not None: 1305 ↛ 1308 (condition never false)
1306 directoryUri = ButlerURI(directory, forceDirectory=True)
1308 if transfer is not None and directoryUri is not None: 1308 ↛ 1313 (condition never false)
1309 # mypy needs the second test
1310 if not directoryUri.exists(): 1310 ↛ 1311 (condition never true)
1311 raise FileNotFoundError(f"Export location {directory} does not exist")
1313 for ref in refs:
1314 fileLocations = self._get_dataset_locations_info(ref)
1315 if not fileLocations: 1315 ↛ 1316 (condition never true)
1316 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1317 # For now we can not export disassembled datasets
1318 if len(fileLocations) > 1:
1319 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
1320 location, storedFileInfo = fileLocations[0]
1321 if transfer is None: 1321 ↛ 1324 (condition never true)
1322 # TODO: do we also need to return the readStorageClass somehow?
1323 # We will use the path in store directly
1324 pass
1325 else:
1326 # mypy needs help
1327 assert directoryUri is not None, "directoryUri must be defined to get here"
1328 storeUri = ButlerURI(location.uri)
1329 exportUri = directoryUri.join(location.pathInStore)
1330 exportUri.transfer_from(storeUri, transfer=transfer)
1332 yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)