Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 82%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileLikeDatastore", )
27import logging
28from abc import abstractmethod
30from sqlalchemy import BigInteger, String
32from dataclasses import dataclass
33from typing import (
34 TYPE_CHECKING,
35 Any,
36 ClassVar,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 Optional,
42 Set,
43 Tuple,
44 Type,
45 Union,
46)
48from lsst.daf.butler import (
49 ButlerURI,
50 CompositesMap,
51 Config,
52 FileDataset,
53 DatasetRef,
54 DatasetType,
55 DatasetTypeNotSupportedError,
56 Datastore,
57 DatastoreConfig,
58 DatastoreValidationError,
59 FileDescriptor,
60 FileTemplates,
61 FileTemplateValidationError,
62 Formatter,
63 FormatterFactory,
64 Location,
65 LocationFactory,
66 StorageClass,
67 StoredFileInfo,
68)
70from lsst.daf.butler import ddl
71from lsst.daf.butler.registry.interfaces import (
72 ReadOnlyDatabaseError,
73 DatastoreRegistryBridge,
74)
76from lsst.daf.butler.core.repoRelocation import replaceRoot
77from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
78from .genericDatastore import GenericBaseDatastore
80if TYPE_CHECKING:
81 from lsst.daf.butler import LookupKey
82 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
84log = logging.getLogger(__name__)
86# String to use when a Python None is encountered
87NULLSTR = "__NULL_STRING__"
90class _IngestPrepData(Datastore.IngestPrepData):
91 """Helper class for FileLikeDatastore ingest implementation.
93 Parameters
94 ----------
95 datasets : `list` of `FileDataset`
96 Files to be ingested by this datastore.
97 """
98 def __init__(self, datasets: List[FileDataset]):
99 super().__init__(ref for dataset in datasets for ref in dataset.refs)
100 self.datasets = datasets
103@dataclass(frozen=True)
104class DatastoreFileGetInformation:
105 """Collection of useful parameters needed to retrieve a file from
106 a Datastore.
107 """
109 location: Location
110 """The location from which to read the dataset."""
112 formatter: Formatter
113 """The `Formatter` to use to deserialize the dataset."""
115 info: StoredFileInfo
116 """Stored information about this file and its formatter."""
118 assemblerParams: Dict[str, Any]
119 """Parameters to use for post-processing the retrieved dataset."""
121 formatterParams: Dict[str, Any]
122 """Parameters that were understood by the associated formatter."""
124 component: Optional[str]
125 """The component to be retrieved (can be `None`)."""
127 readStorageClass: StorageClass
128 """The `StorageClass` of the dataset being read."""
131class FileLikeDatastore(GenericBaseDatastore):
132 """Generic Datastore for file-based implementations.
134 Should always be sub-classed since key abstract methods are missing.
136 Parameters
137 ----------
138 config : `DatastoreConfig` or `str`
139 Configuration as either a `Config` object or URI to file.
140 bridgeManager : `DatastoreRegistryBridgeManager`
141 Object that manages the interface between `Registry` and datastores.
142 butlerRoot : `str`, optional
143 New datastore root to use to override the configuration value.
145 Raises
146 ------
147 ValueError
148 If root location does not exist and ``create`` is `False` in the
149 configuration.
150 """
152 defaultConfigFile: ClassVar[Optional[str]] = None
153 """Path to configuration defaults. Accessed within the ``config`` resource
154 or relative to a search path. Can be None if no defaults specified.
155 """
157 root: str
158 """Root directory or URI of this `Datastore`."""
160 locationFactory: LocationFactory
161 """Factory for creating locations relative to the datastore root."""
163 formatterFactory: FormatterFactory
164 """Factory for creating instances of formatters."""
166 templates: FileTemplates
167 """File templates that can be used by this `Datastore`."""
169 composites: CompositesMap
170 """Determines whether a dataset should be disassembled on put."""
172 @classmethod
173 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
174 """Set any filesystem-dependent config options for this Datastore to
175 be appropriate for a new empty repository with the given root.
177 Parameters
178 ----------
179 root : `str`
180 URI to the root of the data repository.
181 config : `Config`
182 A `Config` to update. Only the subset understood by
183 this component will be updated. Will not expand
184 defaults.
185 full : `Config`
186 A complete config with all defaults expanded that can be
187 converted to a `DatastoreConfig`. Read-only and will not be
188 modified by this method.
189 Repository-specific options that should not be obtained
190 from defaults when Butler instances are constructed
191 should be copied from ``full`` to ``config``.
192 overwrite : `bool`, optional
193 If `False`, do not modify a value in ``config`` if the value
194 already exists. Default is always to overwrite with the provided
195 ``root``.
197 Notes
198 -----
199 If a keyword is explicitly defined in the supplied ``config`` it
200 will not be overridden by this method if ``overwrite`` is `False`.
201 This allows explicit values set in external configs to be retained.
202 """
203 Config.updateParameters(DatastoreConfig, config, full,
204 toUpdate={"root": root},
205 toCopy=("cls", ("records", "table")), overwrite=overwrite)
207 @classmethod
208 def makeTableSpec(cls) -> ddl.TableSpec:
209 return ddl.TableSpec(
210 fields=[
211 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
212 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
213 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
214 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
215 # Use empty string to indicate no component
216 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
217 # TODO: should checksum be Base64Bytes instead?
218 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
219 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
220 ],
221 unique=frozenset(),
222 )
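# Illustrative record conforming to the spec above, as later assembled by
# ``addStoredItemInfo`` (the concrete values are assumptions for this sketch,
# not data from a real repository):
#
#     {"dataset_id": 42,
#      "path": "run1/bias/bias_1234.fits",
#      "formatter": "somepkg.formatters.FitsExposureFormatter",
#      "storage_class": "ExposureF",
#      "component": NULLSTR,   # "__NULL_STRING__" marker meaning no component
#      "checksum": None,
#      "file_size": 123456}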
224 def __init__(self, config: Union[DatastoreConfig, str],
225 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
226 super().__init__(config, bridgeManager)
227 if "root" not in self.config: 227 ↛ 228line 227 didn't jump to line 228, because the condition on line 227 was never true
228 raise ValueError("No root directory specified in configuration")
230 # Name ourselves either using an explicit name or a name
231 # derived from the (unexpanded) root
232 if "name" in self.config:
233 self.name = self.config["name"]
234 else:
235 # We use the unexpanded root in the name to indicate that this
236 # datastore can be moved without having to update registry.
237 self.name = "{}@{}".format(type(self).__name__,
238 self.config["root"])
240 # Support repository relocation in config
241 # Existence of self.root is checked in subclass
242 self.root = replaceRoot(self.config["root"], butlerRoot)
244 self.locationFactory = LocationFactory(self.root)
245 self.formatterFactory = FormatterFactory()
247 # Now associate formatters with storage classes
248 self.formatterFactory.registerFormatters(self.config["formatters"],
249 universe=bridgeManager.universe)
251 # Read the file naming templates
252 self.templates = FileTemplates(self.config["templates"],
253 universe=bridgeManager.universe)
255 # See if composites should be disassembled
256 self.composites = CompositesMap(self.config["composites"],
257 universe=bridgeManager.universe)
259 tableName = self.config["records", "table"]
260 try:
261 # Storage of paths and formatters, keyed by dataset_id
262 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
263 # Interface to Registry.
264 self._bridge = bridgeManager.register(self.name)
265 except ReadOnlyDatabaseError:
266 # If the database is read only and we just tried and failed to
267 # create a table, it means someone is trying to create a read-only
268 # butler client for an empty repo. That should be okay, as long
269 # as they then try to get any datasets before some other client
270 creates the table. Chances are they're just validating
271 # configuration.
272 pass
274 # Determine whether checksums should be used
275 self.useChecksum = self.config.get("checksum", True)
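# A minimal configuration sketch covering the keys this constructor reads
# ("root", optional "name", "formatters", "templates", "composites",
# "records.table" and "checksum"); the concrete values are illustrative
# assumptions, not shipped defaults:
#
#     root: <butlerRoot>      # repository-relocation placeholder handled by replaceRoot
#     records:
#       table: file_datastore_records
#     checksum: true
#     formatters: {...}       # dataset type / storage class -> Formatter
#     templates: {...}        # file naming templates
#     composites: {...}       # disassembly rules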
277 def __str__(self) -> str:
278 return self.root
280 @property
281 def bridge(self) -> DatastoreRegistryBridge:
282 return self._bridge
284 @abstractmethod
285 def _artifact_exists(self, location: Location) -> bool:
286 """Check that an artifact exists in this datastore at the specified
287 location.
289 Parameters
290 ----------
291 location : `Location`
292 Expected location of the artifact associated with this datastore.
294 Returns
295 -------
296 exists : `bool`
297 `True` if the location can be found, `False` otherwise.
298 """
299 raise NotImplementedError()
301 @abstractmethod
302 def _delete_artifact(self, location: Location) -> None:
303 """Delete the artifact from the datastore.
305 Parameters
306 ----------
307 location : `Location`
308 Location of the artifact associated with this datastore.
309 """
310 raise NotImplementedError()
312 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
313 # Docstring inherited from GenericBaseDatastore
314 records = []
315 for ref, info in zip(refs, infos):
316 # Component should come from ref and fall back on info
317 component = ref.datasetType.component()
318 if component is None and info.component is not None:
319 component = info.component
320 if component is None:
321 # Use empty string since we want this to be part of the
322 # primary key.
323 component = NULLSTR
324 records.append(
325 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
326 storage_class=info.storageClass.name, component=component,
327 checksum=info.checksum, file_size=info.file_size)
328 )
329 self._table.insert(*records)
331 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
332 # Docstring inherited from GenericBaseDatastore
334 # Look for the dataset_id -- there might be multiple matches
335 # if we have disassembled the dataset.
336 records = list(self._table.fetch(dataset_id=ref.id))
338 results = []
339 for record in records:
340 # Convert name of StorageClass to instance
341 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
342 component = record["component"] if (record["component"]
343 and record["component"] != NULLSTR) else None
345 info = StoredFileInfo(formatter=record["formatter"],
346 path=record["path"],
347 storageClass=storageClass,
348 component=component,
349 checksum=record["checksum"],
350 file_size=record["file_size"])
351 results.append(info)
353 return results
355 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]:
356 """Return all dataset refs associated with the supplied path.
358 Parameters
359 ----------
360 pathInStore : `str`
361 Path of interest in the data store.
363 Returns
364 -------
365 ids : `set` of `int`
366 All `DatasetRef` IDs associated with this path.
367 """
368 records = list(self._table.fetch(path=pathInStore))
369 ids = {r["dataset_id"] for r in records}
370 return ids
372 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
373 # Docstring inherited from GenericBaseDatastore
374 self._table.delete(dataset_id=ref.id)
376 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
377 r"""Find all the `Location`\ s of the requested dataset in the
378 `Datastore` and the associated stored file information.
380 Parameters
381 ----------
382 ref : `DatasetRef`
383 Reference to the required `Dataset`.
385 Returns
386 -------
387 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
388 Location of the dataset within the datastore and
389 stored information about each file and its formatter.
390 """
391 # Get the file information (this will fail if no file)
392 records = self.getStoredItemsInfo(ref)
394 # Use the path to determine the location
395 return [(self.locationFactory.fromPath(r.path), r) for r in records]
397 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
398 """Check that there is only one dataset associated with the
399 specified artifact.
401 Parameters
402 ----------
403 ref : `DatasetRef` or `FakeDatasetRef`
404 Dataset to be removed.
405 location : `Location`
406 The location of the artifact to be removed.
408 Returns
409 -------
410 can_remove : `bool`
411 True if the artifact can be safely removed.
412 """
414 # Get all entries associated with this path
415 allRefs = self._registered_refs_per_artifact(location.pathInStore)
416 if not allRefs:
417 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
419 # Remove these refs from all the refs and if there is nothing left
420 # then we can delete
421 remainingRefs = allRefs - {ref.id}
423 if remainingRefs:
424 return False
425 return True
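# Example (hypothetical): if two refs were ingested against the same external
# file, e.g.
#
#     self._registered_refs_per_artifact("shared/file.fits") == {101, 102}
#
# then removing ref 101 leaves remainingRefs == {102}, so the artifact is
# kept; only removing the last remaining ref allows the file to be deleted.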
427 def _prepare_for_get(self, ref: DatasetRef,
428 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
429 """Check parameters for ``get`` and obtain formatter and
430 location.
432 Parameters
433 ----------
434 ref : `DatasetRef`
435 Reference to the required Dataset.
436 parameters : `dict`
437 `StorageClass`-specific parameters that specify, for example,
438 a slice of the dataset to be loaded.
440 Returns
441 -------
442 getInfo : `list` [`DatastoreFileGetInformation`]
443 Parameters needed to retrieve each file.
444 """
445 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
447 # Get file metadata and internal metadata
448 fileLocations = self._get_dataset_locations_info(ref)
449 if not fileLocations:
450 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
452 # The storage class we want to use eventually
453 refStorageClass = ref.datasetType.storageClass
455 if len(fileLocations) > 1:
456 disassembled = True
457 else:
458 disassembled = False
460 # Is this a component request?
461 refComponent = ref.datasetType.component()
463 fileGetInfo = []
464 for location, storedFileInfo in fileLocations:
466 # The storage class used to write the file
467 writeStorageClass = storedFileInfo.storageClass
469 # If this has been disassembled we need read to match the write
470 if disassembled:
471 readStorageClass = writeStorageClass
472 else:
473 readStorageClass = refStorageClass
475 formatter = getInstanceOf(storedFileInfo.formatter,
476 FileDescriptor(location, readStorageClass=readStorageClass,
477 storageClass=writeStorageClass, parameters=parameters),
478 ref.dataId)
480 formatterParams, notFormatterParams = formatter.segregateParameters()
482 # Of the remaining parameters, extract the ones supported by
483 # this StorageClass (for components not all will be handled)
484 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
486 # The ref itself could be a component if the dataset was
487 # disassembled by butler, or we disassembled in datastore and
488 # components came from the datastore records
489 component = storedFileInfo.component if storedFileInfo.component else refComponent
491 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
492 assemblerParams, formatterParams,
493 component, readStorageClass))
495 return fileGetInfo
497 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
498 """Check the arguments for ``put`` and obtain formatter and
499 location.
501 Parameters
502 ----------
503 inMemoryDataset : `object`
504 The dataset to store.
505 ref : `DatasetRef`
506 Reference to the associated Dataset.
508 Returns
509 -------
510 location : `Location`
511 The location to write the dataset.
512 formatter : `Formatter`
513 The `Formatter` to use to write the dataset.
515 Raises
516 ------
517 TypeError
518 Supplied object and storage class are inconsistent.
519 DatasetTypeNotSupportedError
520 The associated `DatasetType` is not handled by this datastore.
521 """
522 self._validate_put_parameters(inMemoryDataset, ref)
524 # Work out output file name
525 try:
526 template = self.templates.getTemplate(ref)
527 except KeyError as e:
528 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
530 location = self.locationFactory.fromPath(template.format(ref))
532 # Get the formatter based on the storage class
533 storageClass = ref.datasetType.storageClass
534 try:
535 formatter = self.formatterFactory.getFormatter(ref,
536 FileDescriptor(location,
537 storageClass=storageClass),
538 ref.dataId)
539 except KeyError as e:
540 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
541 f"{self.name}") from e
543 # Now that we know the formatter, update the location
544 location = formatter.makeUpdatedLocation(location)
546 return location, formatter
548 @abstractmethod
549 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
550 """Standardize the path of a to-be-ingested file.
552 Parameters
553 ----------
554 path : `str`
555 Path of a file to be ingested.
556 transfer : `str`, optional
557 How (and whether) the dataset should be added to the datastore.
558 See `ingest` for details of transfer modes.
559 This implementation is provided only so
560 `NotImplementedError` can be raised if the mode is not supported;
561 actual transfers are deferred to `_extractIngestInfo`.
563 Returns
564 -------
565 path : `str`
566 New path in what the datastore considers standard form.
568 Notes
569 -----
570 Subclasses of `FileLikeDatastore` should implement this method instead
571 of `_prepIngest`. It should not modify the data repository or given
572 file in any way.
574 Raises
575 ------
576 NotImplementedError
577 Raised if the datastore does not support the given transfer mode
578 (including the case where ingest is not supported at all).
579 FileNotFoundError
580 Raised if one of the given files does not exist.
581 """
582 raise NotImplementedError("Must be implemented by subclasses.")
584 @abstractmethod
585 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
586 formatter: Union[Formatter, Type[Formatter]],
587 transfer: Optional[str] = None) -> StoredFileInfo:
588 """Relocate (if necessary) and extract `StoredFileInfo` from a
589 to-be-ingested file.
591 Parameters
592 ----------
593 path : `str` or `ButlerURI`
594 URI or path of a file to be ingested.
595 ref : `DatasetRef`
596 Reference for the dataset being ingested. Guaranteed to have
597 ``dataset_id`` not `None`.
598 formatter : `type` or `Formatter`
599 `Formatter` subclass to use for this dataset or an instance.
600 transfer : `str`, optional
601 How (and whether) the dataset should be added to the datastore.
602 See `ingest` for details of transfer modes.
604 Returns
605 -------
606 info : `StoredFileInfo`
607 Internal datastore record for this file. This will be inserted by
608 the caller; `_extractIngestInfo` is only responsible for
609 creating and populating the struct.
611 Raises
612 ------
613 FileNotFoundError
614 Raised if one of the given files does not exist.
615 FileExistsError
616 Raised if transfer is not `None` but the (internal) location the
617 file would be moved to is already occupied.
618 """
619 raise NotImplementedError("Must be implemented by subclasses.")
621 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
622 # Docstring inherited from Datastore._prepIngest.
623 filtered = []
624 for dataset in datasets:
625 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
626 if not acceptable:
627 continue
628 else:
629 dataset.refs = acceptable
630 if dataset.formatter is None:
631 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
632 else:
633 assert isinstance(dataset.formatter, (type, str))
634 dataset.formatter = getClassOf(dataset.formatter)
635 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
636 filtered.append(dataset)
637 return _IngestPrepData(filtered)
639 @transactional
640 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
641 # Docstring inherited from Datastore._finishIngest.
642 refsAndInfos = []
643 for dataset in prepData.datasets:
644 # Do ingest as if the first dataset ref is associated with the file
645 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
646 transfer=transfer)
647 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
648 self._register_datasets(refsAndInfos)
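# Sketch of how the two halves of ingest fit together (names and the transfer
# mode are assumptions; callers normally go through ``Datastore.ingest``
# rather than calling these methods directly):
#
#     datasets = [FileDataset(path="raw/file_001.fits", refs=[ref])]
#     prepData = datastore._prepIngest(*datasets, transfer="copy")
#     datastore._finishIngest(prepData, transfer="copy")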
650 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
651 formatter: Union[Formatter, Type[Formatter]]) -> Location:
652 """Given a source URI and a DatasetRef, determine the name the
653 dataset will have inside datastore.
655 Parameters
656 ----------
657 srcUri : `ButlerURI`
658 URI to the source dataset file.
659 ref : `DatasetRef`
660 Ref associated with the newly-ingested dataset artifact. This
661 is used to determine the name within the datastore.
662 formatter : `Formatter` or `Formatter` class.
663 Formatter to use for validation. Can be a class or an instance.
665 Returns
666 -------
667 location : `Location`
668 Target location for the newly-ingested dataset.
669 """
670 # Ingesting a file from outside the datastore.
671 # This involves a new name.
672 template = self.templates.getTemplate(ref)
673 location = self.locationFactory.fromPath(template.format(ref))
675 # Get the extension
676 ext = srcUri.getExtension()
678 # Update the destination to include that extension
679 location.updateExtension(ext)
681 # Ask the formatter to validate this extension
682 formatter.validateExtension(location)
684 return location
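# Worked example with hypothetical values: given a template such as
# "{run}/{datasetType}/{id}" and a source URI "file:///data/in/bias_03.fits",
#
#     template.format(ref)      ->  "run1/bias/1234"
#     srcUri.getExtension()     ->  ".fits"
#     final location path       ->  "run1/bias/1234.fits"
#
# after which the formatter only has to confirm it can handle ".fits".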
686 @abstractmethod
687 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
688 """Write out in memory dataset to datastore.
690 Parameters
691 ----------
692 inMemoryDataset : `object`
693 Dataset to write to datastore.
694 ref : `DatasetRef`
695 Registry information associated with this dataset.
697 Returns
698 -------
699 info : `StoredFileInfo`
700 Information describing the artifact written to the datastore.
701 """
702 raise NotImplementedError()
704 @abstractmethod
705 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
706 ref: DatasetRef, isComponent: bool = False) -> Any:
707 """Read the artifact from datastore into in memory object.
709 Parameters
710 ----------
711 getInfo : `DatastoreFileGetInformation`
712 Information about the artifact within the datastore.
713 ref : `DatasetRef`
714 The registry information associated with this artifact.
715 isComponent : `bool`
716 Flag to indicate if a component is being read from this artifact.
718 Returns
719 -------
720 inMemoryDataset : `object`
721 The artifact as a python object.
722 """
723 raise NotImplementedError()
725 def exists(self, ref: DatasetRef) -> bool:
726 """Check if the dataset exists in the datastore.
728 Parameters
729 ----------
730 ref : `DatasetRef`
731 Reference to the required dataset.
733 Returns
734 -------
735 exists : `bool`
736 `True` if the entity exists in the `Datastore`.
737 """
738 fileLocations = self._get_dataset_locations_info(ref)
739 if not fileLocations:
740 return False
741 for location, _ in fileLocations:
742 if not self._artifact_exists(location):
743 return False
745 return True
747 def getURIs(self, ref: DatasetRef,
748 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
749 """Return URIs associated with dataset.
751 Parameters
752 ----------
753 ref : `DatasetRef`
754 Reference to the required dataset.
755 predict : `bool`, optional
756 If the datastore does not know about the dataset, should it
757 return a predicted URI or not?
759 Returns
760 -------
761 primary : `ButlerURI`
762 The URI to the primary artifact associated with this dataset.
763 If the dataset was disassembled within the datastore this
764 may be `None`.
765 components : `dict`
766 URIs to any components associated with the dataset artifact.
767 Can be empty if there are no components.
768 """
770 primary: Optional[ButlerURI] = None
771 components: Dict[str, ButlerURI] = {}
773 # if this has never been written then we have to guess
774 if not self.exists(ref):
775 if not predict:
776 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
778 def predictLocation(thisRef: DatasetRef) -> Location:
779 template = self.templates.getTemplate(thisRef)
780 location = self.locationFactory.fromPath(template.format(thisRef))
781 storageClass = ref.datasetType.storageClass
782 formatter = self.formatterFactory.getFormatter(thisRef,
783 FileDescriptor(location,
784 storageClass=storageClass))
785 # Try to use the extension attribute but ignore problems if the
786 # formatter does not define one.
787 try:
788 location = formatter.makeUpdatedLocation(location)
789 except Exception:
790 # Use the default extension
791 pass
792 return location
794 doDisassembly = self.composites.shouldBeDisassembled(ref)
796 if doDisassembly:
798 for component, componentStorage in ref.datasetType.storageClass.components.items():
799 compRef = ref.makeComponentRef(component)
800 compLocation = predictLocation(compRef)
802 # Add a URI fragment to indicate this is a guess
803 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
805 else:
807 location = predictLocation(ref)
809 # Add a URI fragment to indicate this is a guess
810 primary = ButlerURI(location.uri.geturl() + "#predicted")
812 return primary, components
814 # If this is a ref that we have written we can get the path.
815 # Get file metadata and internal metadata
816 fileLocations = self._get_dataset_locations_info(ref)
818 if not fileLocations:
819 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
821 if len(fileLocations) == 1:
822 # No disassembly so this is the primary URI
823 primary = ButlerURI(fileLocations[0][0].uri)
825 else:
826 for location, storedFileInfo in fileLocations:
827 if storedFileInfo.component is None:
828 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
829 components[storedFileInfo.component] = ButlerURI(location.uri)
831 return primary, components
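# Usage sketch (return values are illustrative): for a dataset that was
# disassembled on put, the primary URI is None and each component has its own
# artifact,
#
#     primary, components = datastore.getURIs(ref)
#     # primary    -> None
#     # components -> {"image": ButlerURI("file:///repo/.../image.fits"), ...}
#
# while with predict=True for an unwritten dataset every returned URI carries
# the "#predicted" fragment.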
833 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
834 """URI to the Dataset.
836 Parameters
837 ----------
838 ref : `DatasetRef`
839 Reference to the required Dataset.
840 predict : `bool`
841 If `True`, allow URIs to be returned of datasets that have not
842 been written.
844 Returns
845 -------
846 uri : `ButlerURI`
847 URI pointing to the dataset within the datastore. If the
848 dataset does not exist in the datastore, and if ``predict`` is
849 `True`, the URI will be a prediction and will include a URI
850 fragment "#predicted".
851 If the datastore does not have entities that relate well
852 to the concept of a URI the returned URI will be
853 descriptive. The returned URI is not guaranteed to be obtainable.
855 Raises
856 ------
857 FileNotFoundError
858 Raised if a URI has been requested for a dataset that does not
859 exist and guessing is not allowed.
860 RuntimeError
861 Raised if a request is made for a single URI but multiple URIs
862 are associated with this dataset.
864 Notes
865 -----
866 When a predicted URI is requested an attempt will be made to form
867 a reasonable URI based on file templates and the expected formatter.
868 """
869 primary, components = self.getURIs(ref, predict)
870 if primary is None or components:
871 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
872 "Use Dataastore.getURIs() instead.")
873 return primary
875 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
876 """Load an InMemoryDataset from the store.
878 Parameters
879 ----------
880 ref : `DatasetRef`
881 Reference to the required Dataset.
882 parameters : `dict`
883 `StorageClass`-specific parameters that specify, for example,
884 a slice of the dataset to be loaded.
886 Returns
887 -------
888 inMemoryDataset : `object`
889 Requested dataset or slice thereof as an InMemoryDataset.
891 Raises
892 ------
893 FileNotFoundError
894 Requested dataset can not be retrieved.
895 TypeError
896 Return value from formatter has unexpected type.
897 ValueError
898 Formatter failed to process the dataset.
899 """
900 allGetInfo = self._prepare_for_get(ref, parameters)
901 refComponent = ref.datasetType.component()
903 # Supplied storage class for the component being read
904 refStorageClass = ref.datasetType.storageClass
906 # Create mapping from component name to related info
907 allComponents = {i.component: i for i in allGetInfo}
909 # By definition the dataset is disassembled if we have more
910 # than one record for it.
911 isDisassembled = len(allGetInfo) > 1
913 # Look for the special case where we are disassembled but the
914 # component is a read-only component that was not written during
915 # disassembly. For this scenario we need to check that the
916 # component requested is listed as a read-only component for the
917 # composite storage class
918 isDisassembledReadOnlyComponent = False
919 if isDisassembled and refComponent:
920 # The composite storage class should be accessible through
921 # the component dataset type
922 compositeStorageClass = ref.datasetType.parentStorageClass
924 # In the unlikely scenario where the composite storage
925 # class is not known, we can only assume that this is a
926 # normal component. If that assumption is wrong then the
927 # branch below that reads a persisted component will fail
928 # so there is no need to complain here.
929 if compositeStorageClass is not None:
930 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.readComponents
932 if isDisassembled and not refComponent:
933 # This was a disassembled dataset spread over multiple files
934 # and we need to put them all back together again.
935 # Read into memory and then assemble
937 # Check that the supplied parameters are suitable for the type read
938 refStorageClass.validateParameters(parameters)
940 # We want to keep track of all the parameters that were not used
941 # by formatters. We assume that if any of the component formatters
942 # use a parameter that we do not need to apply it again in the
943 # assembler.
944 usedParams = set()
946 components: Dict[str, Any] = {}
947 for getInfo in allGetInfo:
948 # Record the parameters handled by this component's formatter so
949 # the assembler does not need to apply them again.
950 usedParams.update(set(getInfo.formatterParams))
952 component = getInfo.component
954 if component is None:
955 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
957 # We do not want the formatter to think it's reading
958 # a component though because it is really reading a
959 # standalone dataset -- always tell reader it is not a
960 # component.
961 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
963 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
965 # Any unused parameters will have to be passed to the assembler
966 if parameters:
967 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
968 else:
969 unusedParams = {}
971 # Process parameters
972 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
973 parameters=unusedParams)
975 elif isDisassembledReadOnlyComponent:
977 compositeStorageClass = ref.datasetType.parentStorageClass
978 if compositeStorageClass is None:
979 raise RuntimeError(f"Unable to retrieve read-only component '{refComponent}' since"
980 "no composite storage class is available.")
982 if refComponent is None:
983 # Mainly for mypy
984 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
986 # Assume that every read-only component can be calculated by
987 # forwarding the request to a single read/write component.
988 # Rather than guessing which rw component is the right one by
989 # scanning each for a read-only component of the same name,
990 # we ask the composite assembler directly which one is best to
991 # use.
992 compositeAssembler = compositeStorageClass.assembler()
993 forwardedComponent = compositeAssembler.selectResponsibleComponent(refComponent,
994 set(allComponents))
996 # Select the relevant component
997 rwInfo = allComponents[forwardedComponent]
999 # For now assume that read parameters are validated against
1000 # the real component and not the requested component
1001 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1002 forwardedStorageClass.validateParameters(parameters)
1004 # Unfortunately the FileDescriptor inside the formatter will have
1005 # the wrong write storage class so we need to create a new one
1006 # given the immutability constraint.
1007 writeStorageClass = rwInfo.info.storageClass
1009 # We may need to put some thought into parameters for read
1010 # components but for now forward them on as is
1011 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1012 readStorageClass=refStorageClass,
1013 storageClass=writeStorageClass,
1014 parameters=parameters),
1015 ref.dataId)
1017 # The assembler can not receive any parameter requests for a
1018 # read-only component at this time since the assembler will
1019 # see the storage class of the read-only component and those
1020 # parameters will have to be handled by the formatter on the
1021 # forwarded storage class.
1022 assemblerParams: Dict[str, Any] = {}
1024 # Need to create a new info that specifies the read-only
1025 # component and associated storage class
1026 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1027 rwInfo.info, assemblerParams, {},
1028 refComponent, refStorageClass)
1030 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1032 else:
1033 # Single file request or component from that composite file
1034 for lookup in (refComponent, None):
1035 if lookup in allComponents:
1036 getInfo = allComponents[lookup]
1037 break
1038 else:
1039 raise FileNotFoundError(f"Component {refComponent} not found "
1040 f"for ref {ref} in datastore {self.name}")
1042 # Do not need the component itself if already disassembled
1043 if isDisassembled:
1044 isComponent = False
1045 else:
1046 isComponent = getInfo.component is not None
1048 # For a disassembled component we can validate parameters against
1049 # the component storage class directly
1050 if isDisassembled:
1051 refStorageClass.validateParameters(parameters)
1052 else:
1053 # For an assembled composite this could be a read-only
1054 # component derived from a real component. The validity
1055 # of the parameters is not clear. For now validate against
1056 # the composite storage class
1057 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1059 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
1061 @transactional
1062 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1063 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1065 Parameters
1066 ----------
1067 inMemoryDataset : `object`
1068 The dataset to store.
1069 ref : `DatasetRef`
1070 Reference to the associated Dataset.
1072 Raises
1073 ------
1074 TypeError
1075 Supplied object and storage class are inconsistent.
1076 DatasetTypeNotSupportedError
1077 The associated `DatasetType` is not handled by this datastore.
1079 Notes
1080 -----
1081 If the datastore is configured to reject certain dataset types it
1082 is possible that the put will fail and raise a
1083 `DatasetTypeNotSupportedError`. The main use case for this is to
1084 allow `ChainedDatastore` to put to multiple datastores without
1085 requiring that every datastore accepts the dataset.
1086 """
1088 doDisassembly = self.composites.shouldBeDisassembled(ref)
1089 # doDisassembly = True
1091 artifacts = []
1092 if doDisassembly:
1093 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
1094 for component, componentInfo in components.items():
1095 # Don't recurse because we want to take advantage of
1096 # bulk insert -- need a new DatasetRef that refers to the
1097 # same dataset_id but has the component DatasetType
1098 # DatasetType does not refer to the types of components
1099 # So we construct one ourselves.
1100 compRef = ref.makeComponentRef(component)
1101 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1102 artifacts.append((compRef, storedInfo))
1103 else:
1104 # Write the entire thing out
1105 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1106 artifacts.append((ref, storedInfo))
1108 self._register_datasets(artifacts)
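# Illustrative outcome (component names are assumptions): putting a composite
# whose storage class defines components "image" and "mask" with disassembly
# enabled produces one artifact and one record per component,
#
#     artifacts == [(ref.makeComponentRef("image"), <StoredFileInfo>),
#                   (ref.makeComponentRef("mask"),  <StoredFileInfo>)]
#
# all sharing the parent dataset_id, whereas a non-disassembled put registers
# the single entry [(ref, <StoredFileInfo>)].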
1110 @transactional
1111 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1112 """Indicate to the datastore that a dataset can be removed.
1114 Parameters
1115 ----------
1116 ref : `DatasetRef`
1117 Reference to the required Dataset.
1118 ignore_errors : `bool`
1119 If `True` return without error even if something went wrong.
1120 Problems could occur if another process is simultaneously trying
1121 to delete.
1123 Raises
1124 ------
1125 FileNotFoundError
1126 Attempt to remove a dataset that does not exist.
1127 """
1128 # Get file metadata and internal metadata
1129 log.debug("Trashing %s in datastore %s", ref, self.name)
1131 fileLocations = self._get_dataset_locations_info(ref)
1133 if not fileLocations:
1134 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1135 if ignore_errors:
1136 log.warning(err_msg)
1137 return
1138 else:
1139 raise FileNotFoundError(err_msg)
1141 for location, storedFileInfo in fileLocations:
1142 if not self._artifact_exists(location):
1143 err_msg = f"Dataset is known to datastore {self.name} but " \
1144 f"associated artifact ({location.uri}) is missing"
1145 if ignore_errors:
1146 log.warning(err_msg)
1147 return
1148 else:
1149 raise FileNotFoundError(err_msg)
1151 # Mark dataset as trashed
1152 try:
1153 self._move_to_trash_in_registry(ref)
1154 except Exception as e:
1155 if ignore_errors:
1156 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1157 f"but encountered an error: {e}")
1158 pass
1159 else:
1160 raise
1162 @transactional
1163 def emptyTrash(self, ignore_errors: bool = True) -> None:
1164 """Remove all datasets from the trash.
1166 Parameters
1167 ----------
1168 ignore_errors : `bool`
1169 If `True` return without error even if something went wrong.
1170 Problems could occur if another process is simultaneously trying
1171 to delete.
1172 """
1173 log.debug("Emptying trash in datastore %s", self.name)
1174 # Context manager will empty trash iff we finish it without raising.
1175 with self._bridge.emptyTrash() as trashed:
1176 for ref in trashed:
1177 fileLocations = self._get_dataset_locations_info(ref)
1179 if not fileLocations:
1180 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1181 if ignore_errors:
1182 log.warning(err_msg)
1183 continue
1184 else:
1185 raise FileNotFoundError(err_msg)
1187 for location, _ in fileLocations:
1189 if not self._artifact_exists(location):
1190 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1191 if ignore_errors:
1192 log.warning(err_msg)
1193 continue
1194 else:
1195 raise FileNotFoundError(err_msg)
1197 # Can only delete the artifact if there are no references
1198 # to the file from untrashed dataset refs.
1199 if self._can_remove_dataset_artifact(ref, location):
1200 # Point of no return for this artifact
1201 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1202 try:
1203 self._delete_artifact(location)
1204 except Exception as e:
1205 if ignore_errors:
1206 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1207 location.uri, self.name, e)
1208 else:
1209 raise
1211 # Now must remove the entry from the internal registry even if
1212 # the artifact removal failed and was ignored,
1213 # otherwise the removal check above will never be true
1214 try:
1215 # There may be multiple rows associated with this ref
1216 # depending on disassembly
1217 self.removeStoredItemInfo(ref)
1218 except Exception as e:
1219 if ignore_errors:
1220 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1221 ref.id, location.uri, self.name, e)
1222 continue
1223 else:
1224 raise
1226 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1227 logFailures: bool = False) -> None:
1228 """Validate some of the configuration for this datastore.
1230 Parameters
1231 ----------
1232 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1233 Entities to test against this configuration. Can be differing
1234 types.
1235 logFailures : `bool`, optional
1236 If `True`, output a log message for every validation error
1237 detected.
1239 Raises
1240 ------
1241 DatastoreValidationError
1242 Raised if there is a validation problem with a configuration.
1243 All the problems are reported in a single exception.
1245 Notes
1246 -----
1247 This method checks that all the supplied entities have valid file
1248 templates and also have formatters defined.
1249 """
1251 templateFailed = None
1252 try:
1253 self.templates.validateTemplates(entities, logFailures=logFailures)
1254 except FileTemplateValidationError as e:
1255 templateFailed = str(e)
1257 formatterFailed = []
1258 for entity in entities:
1259 try:
1260 self.formatterFactory.getFormatterClass(entity)
1261 except KeyError as e:
1262 formatterFailed.append(str(e))
1263 if logFailures:
1264 log.fatal("Formatter failure: %s", e)
1266 if templateFailed or formatterFailed:
1267 messages = []
1268 if templateFailed:
1269 messages.append(templateFailed)
1270 if formatterFailed:
1271 messages.append(",".join(formatterFailed))
1272 msg = ";\n".join(messages)
1273 raise DatastoreValidationError(msg)
1275 def getLookupKeys(self) -> Set[LookupKey]:
1276 # Docstring is inherited from base class
1277 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1278 self.constraints.getLookupKeys()
1280 def validateKey(self, lookupKey: LookupKey,
1281 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1282 # Docstring is inherited from base class
1283 # The key can be valid in either formatters or templates so we can
1284 # only check the template if it exists
1285 if lookupKey in self.templates:
1286 try:
1287 self.templates[lookupKey].validateTemplate(entity)
1288 except FileTemplateValidationError as e:
1289 raise DatastoreValidationError(e) from e
1291 def export(self, refs: Iterable[DatasetRef], *,
1292 directory: Optional[Union[ButlerURI, str]] = None,
1293 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1294 # Docstring inherited from Datastore.export.
1295 if transfer is not None and directory is None:
1296 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1297 "export directory given")
1299 # Force the directory to be a URI object
1300 directoryUri: Optional[ButlerURI] = None
1301 if directory is not None:
1302 directoryUri = ButlerURI(directory, forceDirectory=True)
1304 if transfer is not None and directoryUri is not None:
1305 # mypy needs the second test
1306 if not directoryUri.exists():
1307 raise FileNotFoundError(f"Export location {directory} does not exist")
1309 for ref in refs:
1310 fileLocations = self._get_dataset_locations_info(ref)
1311 if not fileLocations:
1312 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1313 # For now we can not export disassembled datasets
1314 if len(fileLocations) > 1:
1315 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
1316 location, storedFileInfo = fileLocations[0]
1317 if transfer is None:
1318 # TODO: do we also need to return the readStorageClass somehow?
1319 # We will use the path in store directly
1320 pass
1321 else:
1322 # mypy needs help
1323 assert directoryUri is not None, "directoryUri must be defined to get here"
1324 storeUri = ButlerURI(location.uri)
1325 exportUri = directoryUri.join(location.pathInStore)
1326 exportUri.transfer_from(storeUri, transfer=transfer)
1328 yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)