Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 83%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileLikeDatastore", )
27import logging
28from abc import abstractmethod
30from sqlalchemy import BigInteger, String
32from dataclasses import dataclass
33from typing import (
34 TYPE_CHECKING,
35 Any,
36 ClassVar,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 Optional,
42 Set,
43 Tuple,
44 Type,
45 Union,
46)
48from lsst.daf.butler import (
49 ButlerURI,
50 CompositesMap,
51 Config,
52 FileDataset,
53 DatasetRef,
54 DatasetType,
55 DatasetTypeNotSupportedError,
56 Datastore,
57 DatastoreConfig,
58 DatastoreValidationError,
59 FileDescriptor,
60 FileTemplates,
61 FileTemplateValidationError,
62 Formatter,
63 FormatterFactory,
64 Location,
65 LocationFactory,
66 StorageClass,
67 StoredFileInfo,
68)
70from lsst.daf.butler import ddl
71from lsst.daf.butler.registry.interfaces import (
72 ReadOnlyDatabaseError,
73 DatastoreRegistryBridge,
74)
76from lsst.daf.butler.core.repoRelocation import replaceRoot
77from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
78from .genericDatastore import GenericBaseDatastore
80if TYPE_CHECKING:
81 from lsst.daf.butler import LookupKey
82 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
84log = logging.getLogger(__name__)
86# String to use when a Python None is encountered
87NULLSTR = "__NULL_STRING__"
90class _IngestPrepData(Datastore.IngestPrepData):
91 """Helper class for FileLikeDatastore ingest implementation.
93 Parameters
94 ----------
95 datasets : `list` of `FileDataset`
96 Files to be ingested by this datastore.
97 """
98 def __init__(self, datasets: List[FileDataset]):
99 super().__init__(ref for dataset in datasets for ref in dataset.refs)
100 self.datasets = datasets
103@dataclass(frozen=True)
104class DatastoreFileGetInformation:
105 """Collection of useful parameters needed to retrieve a file from
106 a Datastore.
107 """
109 location: Location
110 """The location from which to read the dataset."""
112 formatter: Formatter
113 """The `Formatter` to use to deserialize the dataset."""
115 info: StoredFileInfo
116 """Stored information about this file and its formatter."""
118 assemblerParams: dict
119 """Parameters to use for post-processing the retrieved dataset."""
121 component: Optional[str]
122 """The component to be retrieved (can be `None`)."""
124 readStorageClass: StorageClass
125 """The `StorageClass` of the dataset being read."""
128class FileLikeDatastore(GenericBaseDatastore):
129 """Generic Datastore for file-based implementations.
131 Should always be sub-classed since key abstract methods are missing.
133 Parameters
134 ----------
135 config : `DatastoreConfig` or `str`
136 Configuration as either a `Config` object or URI to file.
137 bridgeManager : `DatastoreRegistryBridgeManager`
138 Object that manages the interface between `Registry` and datastores.
139 butlerRoot : `str`, optional
140 New datastore root to use to override the configuration value.
142 Raises
143 ------
144 ValueError
145 If root location does not exist and ``create`` is `False` in the
146 configuration.
147 """
149 defaultConfigFile: ClassVar[Optional[str]] = None
150 """Path to configuration defaults. Accessed within the ``config`` resource
151 or relative to a search path. Can be None if no defaults specified.
152 """
154 root: str
155 """Root directory or URI of this `Datastore`."""
157 locationFactory: LocationFactory
158 """Factory for creating locations relative to the datastore root."""
160 formatterFactory: FormatterFactory
161 """Factory for creating instances of formatters."""
163 templates: FileTemplates
164 """File templates that can be used by this `Datastore`."""
166 composites: CompositesMap
167 """Determines whether a dataset should be disassembled on put."""
169 @classmethod
170 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
171 """Set any filesystem-dependent config options for this Datastore to
172 be appropriate for a new empty repository with the given root.
174 Parameters
175 ----------
176 root : `str`
177 URI to the root of the data repository.
178 config : `Config`
179 A `Config` to update. Only the subset understood by
180 this component will be updated. Will not expand
181 defaults.
182 full : `Config`
183 A complete config with all defaults expanded that can be
184 converted to a `DatastoreConfig`. Read-only and will not be
185 modified by this method.
186 Repository-specific options that should not be obtained
187 from defaults when Butler instances are constructed
188 should be copied from ``full`` to ``config``.
189 overwrite : `bool`, optional
190 If `False`, do not modify a value in ``config`` if the value
191 already exists. Default is always to overwrite with the provided
192 ``root``.
194 Notes
195 -----
196 If a keyword is explicitly defined in the supplied ``config`` it
197 will not be overridden by this method if ``overwrite`` is `False`.
198 This allows explicit values set in external configs to be retained.
199 """
200 Config.updateParameters(DatastoreConfig, config, full,
201 toUpdate={"root": root},
202 toCopy=("cls", ("records", "table")), overwrite=overwrite)
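For illustration only, a minimal sketch of how a repository-creation tool might drive setConfigRoot; the helper name and the pre-loaded Config objects are assumptions, not part of this module:

    def seed_datastore_config(datastore_cls, root, config, full):
        # Copy the new repository root (plus the "cls" and records-table keys)
        # into the per-repo config; overwrite=False preserves values the
        # operator set explicitly.
        datastore_cls.setConfigRoot(root, config, full, overwrite=False)
        return config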
204 @classmethod
205 def makeTableSpec(cls) -> ddl.TableSpec:
206 return ddl.TableSpec(
207 fields=[
208 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
209 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
210 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
211 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
212 # Use empty string to indicate no component
213 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
214 # TODO: should checksum be Base64Bytes instead?
215 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
216 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
217 ],
218 unique=frozenset(),
219 )
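To make the spec above concrete, a row destined for this table is a plain mapping keyed by those column names; the values below are purely illustrative placeholders mirroring what addStoredItemInfo builds later in this file:

    example_record = {
        "dataset_id": 42,                          # DatasetRef.id (primary key)
        "path": "some/path/inside/datastore.fits", # relative to the datastore root
        "formatter": "mypackage.MyFormatter",      # fully qualified class name (placeholder)
        "storage_class": "StructuredDataDict",     # StorageClass name
        "component": NULLSTR,                      # sentinel when there is no component
        "checksum": None,                          # optional
        "file_size": 1024,                         # optional, in bytes
    }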
221 def __init__(self, config: Union[DatastoreConfig, str],
222 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
223 super().__init__(config, bridgeManager)
224 if "root" not in self.config:
225 raise ValueError("No root directory specified in configuration")
227 # Name ourselves either using an explicit name or a name
228 # derived from the (unexpanded) root
229 if "name" in self.config:
230 self.name = self.config["name"]
231 else:
232 # We use the unexpanded root in the name to indicate that this
233 # datastore can be moved without having to update registry.
234 self.name = "{}@{}".format(type(self).__name__,
235 self.config["root"])
237 # Support repository relocation in config
238 # Existence of self.root is checked in subclass
239 self.root = replaceRoot(self.config["root"], butlerRoot)
241 self.locationFactory = LocationFactory(self.root)
242 self.formatterFactory = FormatterFactory()
244 # Now associate formatters with storage classes
245 self.formatterFactory.registerFormatters(self.config["formatters"],
246 universe=bridgeManager.universe)
248 # Read the file naming templates
249 self.templates = FileTemplates(self.config["templates"],
250 universe=bridgeManager.universe)
252 # See if composites should be disassembled
253 self.composites = CompositesMap(self.config["composites"],
254 universe=bridgeManager.universe)
256 tableName = self.config["records", "table"]
257 try:
258 # Storage of paths and formatters, keyed by dataset_id
259 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
260 # Interface to Registry.
261 self._bridge = bridgeManager.register(self.name)
262 except ReadOnlyDatabaseError:
263 # If the database is read only and we just tried and failed to
264 # create a table, it means someone is trying to create a read-only
265 # butler client for an empty repo. That should be okay, as long
266 # as they then try to get any datasets before some other client
267 creates the table. Chances are they're just validating
268 # configuration.
269 pass
271 # Determine whether checksums should be used
272 self.useChecksum = self.config.get("checksum", True)
274 def __str__(self) -> str:
275 return self.root
277 @property
278 def bridge(self) -> DatastoreRegistryBridge:
279 return self._bridge
281 @abstractmethod
282 def _artifact_exists(self, location: Location) -> bool:
283 """Check that an artifact exists in this datastore at the specified
284 location.
286 Parameters
287 ----------
288 location : `Location`
289 Expected location of the artifact associated with this datastore.
291 Returns
292 -------
293 exists : `bool`
294 `True` if the location can be found, `False` otherwise.
295 """
296 raise NotImplementedError()
298 @abstractmethod
299 def _delete_artifact(self, location: Location) -> None:
300 """Delete the artifact from the datastore.
302 Parameters
303 ----------
304 location : `Location`
305 Location of the artifact associated with this datastore.
306 """
307 raise NotImplementedError()
309 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
310 # Docstring inherited from GenericBaseDatastore
311 records = []
312 for ref, info in zip(refs, infos):
313 # Component should come from ref and fall back on info
314 component = ref.datasetType.component()
315 if component is None and info.component is not None:
316 component = info.component
317 if component is None:
318 # Use empty string since we want this to be part of the
319 # primary key.
320 component = NULLSTR
321 records.append(
322 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
323 storage_class=info.storageClass.name, component=component,
324 checksum=info.checksum, file_size=info.file_size)
325 )
326 self._table.insert(*records)
328 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
329 # Docstring inherited from GenericBaseDatastore
331 # Look for the dataset_id -- there might be multiple matches
332 # if we have disassembled the dataset.
333 records = list(self._table.fetch(dataset_id=ref.id))
335 results = []
336 for record in records:
337 # Convert name of StorageClass to instance
338 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
339 component = record["component"] if (record["component"]
340 and record["component"] != NULLSTR) else None
342 info = StoredFileInfo(formatter=record["formatter"],
343 path=record["path"],
344 storageClass=storageClass,
345 component=component,
346 checksum=record["checksum"],
347 file_size=record["file_size"])
348 results.append(info)
350 return results
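The component round trip used by addStoredItemInfo and getStoredItemsInfo can be summarised by this small sketch; the helper names are hypothetical and only the NULLSTR sentinel comes from this module:

    def encode_component(component):
        # None cannot live in a primary-key string column, so store the sentinel.
        return NULLSTR if component is None else component

    def decode_component(stored):
        # Both an empty string and the sentinel map back to "no component".
        return None if (not stored or stored == NULLSTR) else stored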
352 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]:
353 """Return all dataset refs associated with the supplied path.
355 Parameters
356 ----------
357 pathInStore : `str`
358 Path of interest in the data store.
360 Returns
361 -------
362 ids : `set` of `int`
363 All `DatasetRef` IDs associated with this path.
364 """
365 records = list(self._table.fetch(path=pathInStore))
366 ids = {r["dataset_id"] for r in records}
367 return ids
369 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
370 # Docstring inherited from GenericBaseDatastore
371 self._table.delete(dataset_id=ref.id)
373 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
374 r"""Find all the `Location`\ s of the requested dataset in the
375 `Datastore` and the associated stored file information.
377 Parameters
378 ----------
379 ref : `DatasetRef`
380 Reference to the required `Dataset`.
382 Returns
383 -------
384 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
385 Location of the dataset within the datastore and
386 stored information about each file and its formatter.
387 """
388 # Get the file information (this will fail if no file)
389 records = self.getStoredItemsInfo(ref)
391 # Use the path to determine the location
392 return [(self.locationFactory.fromPath(r.path), r) for r in records]
394 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
395 """Check that there is only one dataset associated with the
396 specified artifact.
398 Parameters
399 ----------
400 ref : `DatasetRef` or `FakeDatasetRef`
401 Dataset to be removed.
402 location : `Location`
403 The location of the artifact to be removed.
405 Returns
406 -------
407 can_remove : `bool`
408 `True` if the artifact can be safely removed.
409 """
411 # Get all entries associated with this path
412 allRefs = self._registered_refs_per_artifact(location.pathInStore)
413 if not allRefs:
414 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
416 # Remove these refs from all the refs and if there is nothing left
417 # then we can delete
418 remainingRefs = allRefs - {ref.id}
420 if remainingRefs:
421 return False
422 return True
424 def _prepare_for_get(self, ref: DatasetRef,
425 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
426 """Check parameters for ``get`` and obtain formatter and
427 location.
429 Parameters
430 ----------
431 ref : `DatasetRef`
432 Reference to the required Dataset.
433 parameters : `dict`
434 `StorageClass`-specific parameters that specify, for example,
435 a slice of the dataset to be loaded.
437 Returns
438 -------
439 getInfo : `list` [`DatastoreFileGetInformation`]
440 Parameters needed to retrieve each file.
441 """
442 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
444 # Get file metadata and internal metadata
445 fileLocations = self._get_dataset_locations_info(ref)
446 if not fileLocations:
447 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
449 # The storage class we want to use eventually
450 refStorageClass = ref.datasetType.storageClass
452 if len(fileLocations) > 1:
453 disassembled = True
454 else:
455 disassembled = False
457 # Is this a component request?
458 refComponent = ref.datasetType.component()
460 fileGetInfo = []
461 for location, storedFileInfo in fileLocations:
463 # The storage class used to write the file
464 writeStorageClass = storedFileInfo.storageClass
466 # If this has been disassembled we need read to match the write
467 if disassembled:
468 readStorageClass = writeStorageClass
469 else:
470 readStorageClass = refStorageClass
472 formatter = getInstanceOf(storedFileInfo.formatter,
473 FileDescriptor(location, readStorageClass=readStorageClass,
474 storageClass=writeStorageClass, parameters=parameters),
475 ref.dataId)
477 _, notFormatterParams = formatter.segregateParameters()
479 # Of the remaining parameters, extract the ones supported by
480 # this StorageClass (for components not all will be handled)
481 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
483 # The ref itself could be a component if the dataset was
484 # disassembled by butler, or we disassembled in datastore and
485 # components came from the datastore records
486 component = storedFileInfo.component if storedFileInfo.component else refComponent
488 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
489 assemblerParams, component, readStorageClass))
491 return fileGetInfo
493 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
494 """Check the arguments for ``put`` and obtain formatter and
495 location.
497 Parameters
498 ----------
499 inMemoryDataset : `object`
500 The dataset to store.
501 ref : `DatasetRef`
502 Reference to the associated Dataset.
504 Returns
505 -------
506 location : `Location`
507 The location to write the dataset.
508 formatter : `Formatter`
509 The `Formatter` to use to write the dataset.
511 Raises
512 ------
513 TypeError
514 Supplied object and storage class are inconsistent.
515 DatasetTypeNotSupportedError
516 The associated `DatasetType` is not handled by this datastore.
517 """
518 self._validate_put_parameters(inMemoryDataset, ref)
520 # Work out output file name
521 try:
522 template = self.templates.getTemplate(ref)
523 except KeyError as e:
524 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
526 location = self.locationFactory.fromPath(template.format(ref))
528 # Get the formatter based on the storage class
529 storageClass = ref.datasetType.storageClass
530 try:
531 formatter = self.formatterFactory.getFormatter(ref,
532 FileDescriptor(location,
533 storageClass=storageClass),
534 ref.dataId)
535 except KeyError as e:
536 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
537 f"{self.name}") from e
539 # Now that we know the formatter, update the location
540 location = formatter.makeUpdatedLocation(location)
542 return location, formatter
544 @abstractmethod
545 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
546 """Standardize the path of a to-be-ingested file.
548 Parameters
549 ----------
550 path : `str`
551 Path of a file to be ingested.
552 transfer : `str`, optional
553 How (and whether) the dataset should be added to the datastore.
554 See `ingest` for details of transfer modes.
555 This implementation is provided only so
556 `NotImplementedError` can be raised if the mode is not supported;
557 actual transfers are deferred to `_extractIngestInfo`.
559 Returns
560 -------
561 path : `str`
562 New path in what the datastore considers standard form.
564 Notes
565 -----
566 Subclasses of `FileLikeDatastore` should implement this method instead
567 of `_prepIngest`. It should not modify the data repository or given
568 file in any way.
570 Raises
571 ------
572 NotImplementedError
573 Raised if the datastore does not support the given transfer mode
574 (including the case where ingest is not supported at all).
575 FileNotFoundError
576 Raised if one of the given files does not exist.
577 """
578 raise NotImplementedError("Must be implemented by subclasses.")
580 @abstractmethod
581 def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
582 formatter: Union[Formatter, Type[Formatter]],
583 transfer: Optional[str] = None) -> StoredFileInfo:
584 """Relocate (if necessary) and extract `StoredFileInfo` from a
585 to-be-ingested file.
587 Parameters
588 ----------
589 path : `str`
590 Path of a file to be ingested.
591 ref : `DatasetRef`
592 Reference for the dataset being ingested. Guaranteed to have
593 ``dataset_id`` not `None`.
594 formatter : `type` or `Formatter`
595 `Formatter` subclass to use for this dataset or an instance.
596 transfer : `str`, optional
597 How (and whether) the dataset should be added to the datastore.
598 See `ingest` for details of transfer modes.
600 Returns
601 -------
602 info : `StoredFileInfo`
603 Internal datastore record for this file. This will be inserted by
604 the caller; `_extractIngestInfo` is only responsible for
605 creating and populating the struct.
607 Raises
608 ------
609 FileNotFoundError
610 Raised if one of the given files does not exist.
611 FileExistsError
612 Raised if transfer is not `None` but the (internal) location the
613 file would be moved to is already occupied.
614 """
615 raise NotImplementedError("Must be implemented by subclasses.")
617 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
618 # Docstring inherited from Datastore._prepIngest.
619 filtered = []
620 for dataset in datasets:
621 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
622 if not acceptable:
623 continue
624 else:
625 dataset.refs = acceptable
626 if dataset.formatter is None:
627 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
628 else:
629 assert isinstance(dataset.formatter, (type, str))
630 dataset.formatter = getClassOf(dataset.formatter)
631 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
632 filtered.append(dataset)
633 return _IngestPrepData(filtered)
635 @transactional
636 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
637 # Docstring inherited from Datastore._finishIngest.
638 refsAndInfos = []
639 for dataset in prepData.datasets:
640 # Do ingest as if the first dataset ref is associated with the file
641 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
642 transfer=transfer)
643 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
644 self._register_datasets(refsAndInfos)
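As a rough sketch of how these two hooks fit together (the public Datastore.ingest entry point normally drives them; the helper below and the "copy" transfer mode are assumptions about a concrete subclass):

    def ingest_files(datastore, file_datasets, transfer="copy"):
        # _prepIngest filters refs and standardizes paths without touching the
        # repository; _finishIngest relocates files and registers the records.
        prep = datastore._prepIngest(*file_datasets, transfer=transfer)
        datastore._finishIngest(prep, transfer=transfer)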
646 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
647 formatter: Union[Formatter, Type[Formatter]]) -> Location:
648 """Given a source URI and a DatasetRef, determine the name the
649 dataset will have inside datastore.
651 Parameters
652 ----------
653 srcUri : `ButlerURI`
654 URI to the source dataset file.
655 ref : `DatasetRef`
656 Ref associated with the newly-ingested dataset artifact. This
657 is used to determine the name within the datastore.
658 formatter : `Formatter` or `Formatter` class.
659 Formatter to use for validation. Can be a class or an instance.
661 Returns
662 -------
663 location : `Location`
664 Target location for the newly-ingested dataset.
665 """
666 # Ingesting a file from outside the datastore.
667 # This involves a new name.
668 template = self.templates.getTemplate(ref)
669 location = self.locationFactory.fromPath(template.format(ref))
671 # Get the extension
672 ext = srcUri.getExtension()
674 # Update the destination to include that extension
675 location.updateExtension(ext)
677 # Ask the formatter to validate this extension
678 formatter.validateExtension(location)
680 return location
682 @abstractmethod
683 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
684 """Write out in memory dataset to datastore.
686 Parameters
687 ----------
688 inMemoryDataset : `object`
689 Dataset to write to datastore.
690 ref : `DatasetRef`
691 Registry information associated with this dataset.
693 Returns
694 -------
695 info : `StoredFileInfo`
696 Information describing the artifact written to the datastore.
697 """
698 raise NotImplementedError()
700 @abstractmethod
701 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
702 ref: DatasetRef, isComponent: bool = False) -> Any:
703 """Read the artifact from datastore into in memory object.
705 Parameters
706 ----------
707 getInfo : `DatastoreFileGetInformation`
708 Information about the artifact within the datastore.
709 ref : `DatasetRef`
710 The registry information associated with this artifact.
711 isComponent : `bool`
712 Flag to indicate if a component is being read from this artifact.
714 Returns
715 -------
716 inMemoryDataset : `object`
717 The artifact as a python object.
718 """
719 raise NotImplementedError()
721 def exists(self, ref: DatasetRef) -> bool:
722 """Check if the dataset exists in the datastore.
724 Parameters
725 ----------
726 ref : `DatasetRef`
727 Reference to the required dataset.
729 Returns
730 -------
731 exists : `bool`
732 `True` if the entity exists in the `Datastore`.
733 """
734 fileLocations = self._get_dataset_locations_info(ref)
735 if not fileLocations:
736 return False
737 for location, _ in fileLocations:
738 if not self._artifact_exists(location):
739 return False
741 return True
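A minimal usage sketch, assuming an already-constructed concrete subclass instance and a resolved DatasetRef (neither is created here):

    def assert_stored(datastore, ref):
        # exists() is True only if every artifact of a (possibly disassembled)
        # dataset is present in the datastore.
        if not datastore.exists(ref):
            raise FileNotFoundError(f"No complete set of artifacts for {ref}")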
743 def getURIs(self, ref: DatasetRef,
744 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
745 """Return URIs associated with dataset.
747 Parameters
748 ----------
749 ref : `DatasetRef`
750 Reference to the required dataset.
751 predict : `bool`, optional
752 If the datastore does not know about the dataset, should it
753 return a predicted URI or not?
755 Returns
756 -------
757 primary : `ButlerURI`
758 The URI to the primary artifact associated with this dataset.
759 If the dataset was disassembled within the datastore this
760 may be `None`.
761 components : `dict`
762 URIs to any components associated with the dataset artifact.
763 Can be empty if there are no components.
764 """
766 primary: Optional[ButlerURI] = None
767 components: Dict[str, ButlerURI] = {}
769 # if this has never been written then we have to guess
770 if not self.exists(ref):
771 if not predict:
772 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
774 def predictLocation(thisRef: DatasetRef) -> Location:
775 template = self.templates.getTemplate(thisRef)
776 location = self.locationFactory.fromPath(template.format(thisRef))
777 storageClass = ref.datasetType.storageClass
778 formatter = self.formatterFactory.getFormatter(thisRef,
779 FileDescriptor(location,
780 storageClass=storageClass))
781 # Try to use the extension attribute but ignore problems if the
782 # formatter does not define one.
783 try:
784 location = formatter.makeUpdatedLocation(location)
785 except Exception:
786 # Use the default extension
787 pass
788 return location
790 doDisassembly = self.composites.shouldBeDisassembled(ref)
792 if doDisassembly:
794 for component, componentStorage in ref.datasetType.storageClass.components.items():
795 compRef = ref.makeComponentRef(component)
796 compLocation = predictLocation(compRef)
798 # Add a URI fragment to indicate this is a guess
799 components[component] = ButlerURI(compLocation.uri + "#predicted")
801 else:
803 location = predictLocation(ref)
805 # Add a URI fragment to indicate this is a guess
806 primary = ButlerURI(location.uri + "#predicted")
808 return primary, components
810 # If this is a ref that we have written we can get the path.
811 # Get file metadata and internal metadata
812 fileLocations = self._get_dataset_locations_info(ref)
814 if not fileLocations:
815 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
817 if len(fileLocations) == 1:
818 # No disassembly so this is the primary URI
819 primary = ButlerURI(fileLocations[0][0].uri)
821 else:
822 for location, storedFileInfo in fileLocations:
823 if storedFileInfo.component is None:
824 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
825 components[storedFileInfo.component] = ButlerURI(location.uri)
827 return primary, components
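A sketch of consuming the (primary, components) pair returned above; the datastore and ref are assumed to exist already:

    def describe_artifacts(datastore, ref):
        primary, components = datastore.getURIs(ref, predict=True)
        if primary is not None:
            print("single artifact:", primary)
        for name, uri in components.items():
            # Disassembled datasets report one URI per stored component.
            print(f"component {name}:", uri)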
829 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
830 """URI to the Dataset.
832 Parameters
833 ----------
834 ref : `DatasetRef`
835 Reference to the required Dataset.
836 predict : `bool`
837 If `True`, allow URIs to be returned of datasets that have not
838 been written.
840 Returns
841 -------
842 uri : `ButlerURI`
843 URI pointing to the dataset within the datastore. If the
844 dataset does not exist in the datastore, and if ``predict`` is
845 `True`, the URI will be a prediction and will include a URI
846 fragment "#predicted".
847 If the datastore does not have entities that relate well
848 to the concept of a URI the returned URI will be
849 descriptive. The returned URI is not guaranteed to be obtainable.
851 Raises
852 ------
853 FileNotFoundError
854 Raised if a URI has been requested for a dataset that does not
855 exist and guessing is not allowed.
856 RuntimeError
857 Raised if a request is made for a single URI but multiple URIs
858 are associated with this dataset.
860 Notes
861 -----
862 When a predicted URI is requested an attempt will be made to form
863 a reasonable URI based on file templates and the expected formatter.
864 """
865 primary, components = self.getURIs(ref, predict)
866 if primary is None or components:
867 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
868 "Use Datastore.getURIs() instead.")
869 return primary
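A small sketch of the predicted-URI convention described above, assuming str() of the returned ButlerURI preserves the fragment:

    def is_predicted(datastore, ref):
        # Predicted URIs carry a "#predicted" fragment (see getURIs).
        uri = datastore.getURI(ref, predict=True)
        return str(uri).endswith("#predicted")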
871 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
872 """Load an InMemoryDataset from the store.
874 Parameters
875 ----------
876 ref : `DatasetRef`
877 Reference to the required Dataset.
878 parameters : `dict`
879 `StorageClass`-specific parameters that specify, for example,
880 a slice of the dataset to be loaded.
882 Returns
883 -------
884 inMemoryDataset : `object`
885 Requested dataset or slice thereof as an InMemoryDataset.
887 Raises
888 ------
889 FileNotFoundError
890 Requested dataset can not be retrieved.
891 TypeError
892 Return value from formatter has unexpected type.
893 ValueError
894 Formatter failed to process the dataset.
895 """
896 allGetInfo = self._prepare_for_get(ref, parameters)
897 refComponent = ref.datasetType.component()
899 # Supplied storage class for the component being read
900 refStorageClass = ref.datasetType.storageClass
902 # Create mapping from component name to related info
903 allComponents = {i.component: i for i in allGetInfo}
905 # By definition the dataset is disassembled if we have more
906 # than one record for it.
907 isDisassembled = len(allGetInfo) > 1
909 # Look for the special case where we are disassembled but the
910 # component is a read-only component that was not written during
911 # disassembly. For this scenario we need to check that the
912 # component requested is listed as a read-only component for the
913 # composite storage class
914 isDisassembledReadOnlyComponent = False
915 if isDisassembled and refComponent:
916 # The composite storage class should be accessible through
917 # the component dataset type
918 compositeStorageClass = ref.datasetType.parentStorageClass
920 # In the unlikely scenario where the composite storage
921 # class is not known, we can only assume that this is a
922 # normal component. If that assumption is wrong then the
923 # branch below that reads a persisted component will fail
924 # so there is no need to complain here.
925 if compositeStorageClass is not None:
926 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.readComponents
928 if isDisassembled and not refComponent:
929 # This was a disassembled dataset spread over multiple files
930 # and we need to put them all back together again.
931 # Read into memory and then assemble
933 # Check that the supplied parameters are suitable for the type read
934 refStorageClass.validateParameters(parameters)
936 usedParams = set()
937 components: Dict[str, Any] = {}
938 for getInfo in allGetInfo:
939 # assemblerParams are parameters not understood by the
940 # associated formatter.
941 usedParams.update(set(getInfo.assemblerParams))
943 component = getInfo.component
945 if component is None:
946 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
948 # We do not want the formatter to think it's reading
949 # a component though because it is really reading a
950 # standalone dataset -- always tell reader it is not a
951 # component.
952 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
954 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
956 # Any unused parameters will have to be passed to the assembler
957 if parameters:
958 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
959 else:
960 unusedParams = {}
962 # Process parameters
963 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
964 parameters=unusedParams)
966 elif isDisassembledReadOnlyComponent:
968 compositeStorageClass = ref.datasetType.parentStorageClass
969 if compositeStorageClass is None:
970 raise RuntimeError(f"Unable to retrieve read-only component '{refComponent}' since"
971 "no composite storage class is available.")
973 if refComponent is None:
974 # Mainly for mypy
975 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
977 # Assume that every read-only component can be calculated by
978 # forwarding the request to a single read/write component.
979 # Rather than guessing which rw component is the right one by
980 # scanning each for a read-only component of the same name,
981 # we ask the composite assembler directly which one is best to
982 # use.
983 compositeAssembler = compositeStorageClass.assembler()
984 forwardedComponent = compositeAssembler.selectResponsibleComponent(refComponent,
985 set(allComponents))
987 # Select the relevant component
988 rwInfo = allComponents[forwardedComponent]
990 # For now assume that read parameters are validated against
991 # the real component and not the requested component
992 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
993 forwardedStorageClass.validateParameters(parameters)
995 # Unfortunately the FileDescriptor inside the formatter will have
996 # the wrong write storage class so we need to create a new one
997 # given the immutability constraint.
998 writeStorageClass = rwInfo.info.storageClass
1000 # We may need to put some thought into parameters for read
1001 # components but for now forward them on as is
1002 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1003 readStorageClass=refStorageClass,
1004 storageClass=writeStorageClass,
1005 parameters=parameters),
1006 ref.dataId)
1008 # The assembler can not receive any parameter requests for a
1009 # read-only component at this time since the assembler will
1010 # see the storage class of the read-only component and those
1011 # parameters will have to be handled by the formatter on the
1012 # forwarded storage class.
1013 assemblerParams: Dict[str, Any] = {}
1015 # Need to create a new info that specifies the read-only
1016 # component and associated storage class
1017 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1018 rwInfo.info, assemblerParams,
1019 refComponent, refStorageClass)
1021 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1023 else:
1024 # Single file request or component from that composite file
1025 for lookup in (refComponent, None):
1026 if lookup in allComponents:
1027 getInfo = allComponents[lookup]
1028 break
1029 else:
1030 raise FileNotFoundError(f"Component {refComponent} not found "
1031 f"for ref {ref} in datastore {self.name}")
1033 # Do not need the component itself if already disassembled
1034 if isDisassembled:
1035 isComponent = False
1036 else:
1037 isComponent = getInfo.component is not None
1039 # For a disassembled component we can validate parameters against
1040 # the component storage class directly
1041 if isDisassembled:
1042 refStorageClass.validateParameters(parameters)
1043 else:
1044 # For an assembled composite this could be a read-only
1045 # component derived from a real component. The validity
1046 # of the parameters is not clear. For now validate against
1047 # the composite storage class
1048 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1050 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
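A usage sketch of parameterised reads; the "bbox" parameter name is hypothetical and only valid for storage classes that define it:

    def read_subset(datastore, ref, bbox):
        # StorageClass-specific parameters are split inside get() between the
        # formatter and the assembler; unsupported names raise from
        # validateParameters().
        return datastore.get(ref, parameters={"bbox": bbox})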
1052 @transactional
1053 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1054 """Write an InMemoryDataset with a given `DatasetRef` to the store.
1056 Parameters
1057 ----------
1058 inMemoryDataset : `object`
1059 The dataset to store.
1060 ref : `DatasetRef`
1061 Reference to the associated Dataset.
1063 Raises
1064 ------
1065 TypeError
1066 Supplied object and storage class are inconsistent.
1067 DatasetTypeNotSupportedError
1068 The associated `DatasetType` is not handled by this datastore.
1070 Notes
1071 -----
1072 If the datastore is configured to reject certain dataset types it
1073 is possible that the put will fail and raise a
1074 `DatasetTypeNotSupportedError`. The main use case for this is to
1075 allow `ChainedDatastore` to put to multiple datastores without
1076 requiring that every datastore accepts the dataset.
1077 """
1079 doDisassembly = self.composites.shouldBeDisassembled(ref)
1080 # doDisassembly = True
1082 artifacts = []
1083 if doDisassembly:
1084 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
1085 for component, componentInfo in components.items():
1086 # Don't recurse because we want to take advantage of
1087 # bulk insert -- need a new DatasetRef that refers to the
1088 # same dataset_id but has the component DatasetType
1089 # DatasetType does not refer to the types of components
1090 # So we construct one ourselves.
1091 compRef = ref.makeComponentRef(component)
1092 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1093 artifacts.append((compRef, storedInfo))
1094 else:
1095 # Write the entire thing out
1096 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1097 artifacts.append((ref, storedInfo))
1099 self._register_datasets(artifacts)
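A put/get round-trip sketch, assuming a writeable concrete subclass and a ref whose DatasetType is accepted by the datastore constraints:

    def roundtrip(datastore, in_memory_dataset, ref):
        # put() may disassemble the object into per-component artifacts;
        # get() reassembles them transparently.
        datastore.put(in_memory_dataset, ref)
        return datastore.get(ref)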
1101 @transactional
1102 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1103 """Indicate to the datastore that a dataset can be removed.
1105 Parameters
1106 ----------
1107 ref : `DatasetRef`
1108 Reference to the required Dataset.
1109 ignore_errors : `bool`
1110 If `True` return without error even if something went wrong.
1111 Problems could occur if another process is simultaneously trying
1112 to delete.
1114 Raises
1115 ------
1116 FileNotFoundError
1117 Attempt to remove a dataset that does not exist.
1118 """
1119 # Get file metadata and internal metadata
1120 log.debug("Trashing %s in datastore %s", ref, self.name)
1122 fileLocations = self._get_dataset_locations_info(ref)
1124 if not fileLocations:
1125 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1126 if ignore_errors:
1127 log.warning(err_msg)
1128 return
1129 else:
1130 raise FileNotFoundError(err_msg)
1132 for location, storedFileInfo in fileLocations:
1133 if not self._artifact_exists(location):
1134 err_msg = f"Dataset is known to datastore {self.name} but " \
1135 f"associated artifact ({location.uri}) is missing"
1136 if ignore_errors:
1137 log.warning(err_msg)
1138 return
1139 else:
1140 raise FileNotFoundError(err_msg)
1142 # Mark dataset as trashed
1143 try:
1144 self._move_to_trash_in_registry(ref)
1145 except Exception as e:
1146 if ignore_errors:
1147 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1148 f"but encountered an error: {e}")
1149 pass
1150 else:
1151 raise
1153 @transactional
1154 def emptyTrash(self, ignore_errors: bool = True) -> None:
1155 """Remove all datasets from the trash.
1157 Parameters
1158 ----------
1159 ignore_errors : `bool`
1160 If `True` return without error even if something went wrong.
1161 Problems could occur if another process is simultaneously trying
1162 to delete.
1163 """
1164 log.debug("Emptying trash in datastore %s", self.name)
1165 # Context manager will empty trash iff we finish it without raising.
1166 with self._bridge.emptyTrash() as trashed:
1167 for ref in trashed:
1168 fileLocations = self._get_dataset_locations_info(ref)
1170 if not fileLocations:
1171 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1172 if ignore_errors:
1173 log.warning(err_msg)
1174 continue
1175 else:
1176 raise FileNotFoundError(err_msg)
1178 for location, _ in fileLocations:
1180 if not self._artifact_exists(location):
1181 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1182 if ignore_errors:
1183 log.warning(err_msg)
1184 continue
1185 else:
1186 raise FileNotFoundError(err_msg)
1188 # Can only delete the artifact if there are no references
1189 # to the file from untrashed dataset refs.
1190 if self._can_remove_dataset_artifact(ref, location):
1191 # Point of no return for this artifact
1192 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1193 try:
1194 self._delete_artifact(location)
1195 except Exception as e:
1196 if ignore_errors:
1197 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1198 location.uri, self.name, e)
1199 else:
1200 raise
1202 # Now must remove the entry from the internal registry even if
1203 # the artifact removal failed and was ignored,
1204 # otherwise the removal check above will never be true
1205 try:
1206 # There may be multiple rows associated with this ref
1207 # depending on disassembly
1208 self.removeStoredItemInfo(ref)
1209 except Exception as e:
1210 if ignore_errors:
1211 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1212 ref.id, location.uri, self.name, e)
1213 continue
1214 else:
1215 raise
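A sketch of the two-phase deletion flow implemented by trash() and emptyTrash(); the helper is hypothetical:

    def purge(datastore, refs):
        # trash() only marks datasets; emptyTrash() deletes artifacts that no
        # remaining (untrashed) ref still points at.
        for ref in refs:
            datastore.trash(ref, ignore_errors=False)
        datastore.emptyTrash(ignore_errors=False)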
1217 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1218 logFailures: bool = False) -> None:
1219 """Validate some of the configuration for this datastore.
1221 Parameters
1222 ----------
1223 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1224 Entities to test against this configuration. Can be differing
1225 types.
1226 logFailures : `bool`, optional
1227 If `True`, output a log message for every validation error
1228 detected.
1230 Raises
1231 ------
1232 DatastoreValidationError
1233 Raised if there is a validation problem with a configuration.
1234 All the problems are reported in a single exception.
1236 Notes
1237 -----
1238 This method checks that all the supplied entities have valid file
1239 templates and also have formatters defined.
1240 """
1242 templateFailed = None
1243 try:
1244 self.templates.validateTemplates(entities, logFailures=logFailures)
1245 except FileTemplateValidationError as e:
1246 templateFailed = str(e)
1248 formatterFailed = []
1249 for entity in entities:
1250 try:
1251 self.formatterFactory.getFormatterClass(entity)
1252 except KeyError as e:
1253 formatterFailed.append(str(e))
1254 if logFailures:
1255 log.fatal("Formatter failure: %s", e)
1257 if templateFailed or formatterFailed:
1258 messages = []
1259 if templateFailed:
1260 messages.append(templateFailed)
1261 if formatterFailed:
1262 messages.append(",".join(formatterFailed))
1263 msg = ";\n".join(messages)
1264 raise DatastoreValidationError(msg)
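A sketch of validating a set of dataset types against this datastore's templates and formatters; the entities are assumed to be pre-built DatasetType or DatasetRef instances:

    def check_config(datastore, entities):
        # All template and formatter problems are gathered into one exception.
        try:
            datastore.validateConfiguration(entities, logFailures=True)
        except DatastoreValidationError as err:
            print("Datastore configuration problems:", err)
            raise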
1266 def getLookupKeys(self) -> Set[LookupKey]:
1267 # Docstring is inherited from base class
1268 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1269 self.constraints.getLookupKeys()
1271 def validateKey(self, lookupKey: LookupKey,
1272 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1273 # Docstring is inherited from base class
1274 # The key can be valid in either formatters or templates so we can
1275 # only check the template if it exists
1276 if lookupKey in self.templates:
1277 try:
1278 self.templates[lookupKey].validateTemplate(entity)
1279 except FileTemplateValidationError as e:
1280 raise DatastoreValidationError(e) from e
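Finally, a sketch combining getLookupKeys and validateKey to check one entity against every key this datastore knows about; the helper is hypothetical:

    def validate_all_keys(datastore, entity):
        # Keys come from the file templates, the formatter factory, and the
        # constraints; only keys with a template are actually validated.
        for key in datastore.getLookupKeys():
            datastore.validateKey(key, entity)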