Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 80%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileLikeDatastore", )
27import logging
28from abc import abstractmethod
30from sqlalchemy import Integer, String
32from dataclasses import dataclass
33from typing import (
34 TYPE_CHECKING,
35 Any,
36 ClassVar,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 Optional,
42 Set,
43 Tuple,
44 Type,
45 Union,
46)
48from lsst.daf.butler import (
49 ButlerURI,
50 CompositesMap,
51 Config,
52 FileDataset,
53 DatasetRef,
54 DatasetType,
55 DatasetTypeNotSupportedError,
56 Datastore,
57 DatastoreConfig,
58 DatastoreValidationError,
59 FileDescriptor,
60 FileTemplates,
61 FileTemplateValidationError,
62 Formatter,
63 FormatterFactory,
64 Location,
65 LocationFactory,
66 StorageClass,
67 StoredFileInfo,
68)
70from lsst.daf.butler import ddl
71from lsst.daf.butler.registry.interfaces import (
72 ReadOnlyDatabaseError,
73 DatastoreRegistryBridge,
74 FakeDatasetRef,
75)
77from lsst.daf.butler.core.repoRelocation import replaceRoot
78from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
79from .genericDatastore import GenericBaseDatastore
81if TYPE_CHECKING: 81 ↛ 82   (line 81 didn't jump to line 82, because the condition on line 81 was never true)
82 from lsst.daf.butler import LookupKey
83 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
85log = logging.getLogger(__name__)
87# String to use when a Python None is encountered
88NULLSTR = "__NULL_STRING__"
91class _IngestPrepData(Datastore.IngestPrepData):
92 """Helper class for FileLikeDatastore ingest implementation.
94 Parameters
95 ----------
96 datasets : `list` of `FileDataset`
97 Files to be ingested by this datastore.
98 """
99 def __init__(self, datasets: List[FileDataset]):
100 super().__init__(ref for dataset in datasets for ref in dataset.refs)
101 self.datasets = datasets
104@dataclass(frozen=True)
105class DatastoreFileGetInformation:
106 """Collection of useful parameters needed to retrieve a file from
107 a Datastore.
108 """
110 location: Location
111 """The location from which to read the dataset."""
113 formatter: Formatter
114 """The `Formatter` to use to deserialize the dataset."""
116 info: StoredFileInfo
117 """Stored information about this file and its formatter."""
119 assemblerParams: dict
120 """Parameters to use for post-processing the retrieved dataset."""
122 component: Optional[str]
123 """The component to be retrieved (can be `None`)."""
125 readStorageClass: StorageClass
126 """The `StorageClass` of the dataset being read."""
129class FileLikeDatastore(GenericBaseDatastore):
130 """Generic Datastore for file-based implementations.
132 Should always be sub-classed since key abstract methods are missing.
134 Parameters
135 ----------
136 config : `DatastoreConfig` or `str`
137 Configuration as either a `Config` object or URI to file.
138 bridgeManager : `DatastoreRegistryBridgeManager`
139 Object that manages the interface between `Registry` and datastores.
140 butlerRoot : `str`, optional
141 New datastore root to use to override the configuration value.
143 Raises
144 ------
145 ValueError
146 If root location does not exist and ``create`` is `False` in the
147 configuration.
148 """
150 defaultConfigFile: ClassVar[Optional[str]] = None
151 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
152 absolute path. Can be None if no defaults specified.
153 """
155 root: str
156 """Root directory or URI of this `Datastore`."""
158 locationFactory: LocationFactory
159 """Factory for creating locations relative to the datastore root."""
161 formatterFactory: FormatterFactory
162 """Factory for creating instances of formatters."""
164 templates: FileTemplates
165 """File templates that can be used by this `Datastore`."""
167 composites: CompositesMap
168 """Determines whether a dataset should be disassembled on put."""
170 @classmethod
171 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
172 """Set any filesystem-dependent config options for this Datastore to
173 be appropriate for a new empty repository with the given root.
175 Parameters
176 ----------
177 root : `str`
178 URI to the root of the data repository.
179 config : `Config`
180 A `Config` to update. Only the subset understood by
181 this component will be updated. Will not expand
182 defaults.
183 full : `Config`
184 A complete config with all defaults expanded that can be
185 converted to a `DatastoreConfig`. Read-only and will not be
186 modified by this method.
187 Repository-specific options that should not be obtained
188 from defaults when Butler instances are constructed
189 should be copied from ``full`` to ``config``.
190 overwrite : `bool`, optional
191 If `False`, do not modify a value in ``config`` if the value
192 already exists. Default is always to overwrite with the provided
193 ``root``.
195 Notes
196 -----
197 If a keyword is explicitly defined in the supplied ``config`` it
198 will not be overridden by this method if ``overwrite`` is `False`.
199 This allows explicit values set in external configs to be retained.
200 """
201 Config.updateParameters(DatastoreConfig, config, full,
202 toUpdate={"root": root},
203 toCopy=("cls", ("records", "table")), overwrite=overwrite)
205 @classmethod
206 def makeTableSpec(cls) -> ddl.TableSpec:
207 return ddl.TableSpec(
208 fields=[
209 ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
210 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
211 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
212 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
213 # Use empty string to indicate no component
214 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
215 # TODO: should checksum be Base64Bytes instead?
216 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
217 ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
218 ],
219 unique=frozenset(),
220 )
222 def __init__(self, config: Union[DatastoreConfig, str],
223 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
224 super().__init__(config, bridgeManager)
225 if "root" not in self.config: 225 ↛ 226   (line 225 didn't jump to line 226, because the condition on line 225 was never true)
226 raise ValueError("No root directory specified in configuration")
228 # Name ourselves either using an explicit name or a name
229 # derived from the (unexpanded) root
230 if "name" in self.config:
231 self.name = self.config["name"]
232 else:
233 # We use the unexpanded root in the name to indicate that this
234 # datastore can be moved without having to update registry.
235 self.name = "{}@{}".format(type(self).__name__,
236 self.config["root"])
238 # Support repository relocation in config
239 # Existence of self.root is checked in subclass
240 self.root = replaceRoot(self.config["root"], butlerRoot)
242 self.locationFactory = LocationFactory(self.root)
243 self.formatterFactory = FormatterFactory()
245 # Now associate formatters with storage classes
246 self.formatterFactory.registerFormatters(self.config["formatters"],
247 universe=bridgeManager.universe)
249 # Read the file naming templates
250 self.templates = FileTemplates(self.config["templates"],
251 universe=bridgeManager.universe)
253 # See if composites should be disassembled
254 self.composites = CompositesMap(self.config["composites"],
255 universe=bridgeManager.universe)
257 tableName = self.config["records", "table"]
258 try:
259 # Storage of paths and formatters, keyed by dataset_id
260 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
261 # Interface to Registry.
262 self._bridge = bridgeManager.register(self.name)
263 except ReadOnlyDatabaseError:
264 # If the database is read only and we just tried and failed to
265 # create a table, it means someone is trying to create a read-only
266 # butler client for an empty repo. That should be okay, as long
267 # as they don't then try to get any datasets before some other client
268 # creates the table. Chances are they're just validating
269 # configuration.
270 pass
272 # Determine whether checksums should be used
273 self.useChecksum = self.config.get("checksum", True)
275 def __str__(self) -> str:
276 return self.root
278 @property
279 def bridge(self) -> DatastoreRegistryBridge:
280 return self._bridge
282 @abstractmethod
283 def _artifact_exists(self, location: Location) -> bool:
284 """Check that an artifact exists in this datastore at the specified
285 location.
287 Parameters
288 ----------
289 location : `Location`
290 Expected location of the artifact associated with this datastore.
292 Returns
293 -------
294 exists : `bool`
295 `True` if the location can be found, `False` otherwise.
296 """
297 raise NotImplementedError()
299 @abstractmethod
300 def _delete_artifact(self, location: Location) -> None:
301 """Delete the artifact from the datastore.
303 Parameters
304 ----------
305 location : `Location`
306 Location of the artifact associated with this datastore.
307 """
308 raise NotImplementedError()
310 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
311 # Docstring inherited from GenericBaseDatastore
312 records = []
313 for ref, info in zip(refs, infos):
314 # Component should come from ref and fall back on info
315 component = ref.datasetType.component()
316 if component is None and info.component is not None: 316 ↛ 317   (line 316 didn't jump to line 317, because the condition on line 316 was never true)
317 component = info.component
318 if component is None:
319 # Use empty string since we want this to be part of the
320 # primary key.
321 component = NULLSTR
322 records.append(
323 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
324 storage_class=info.storageClass.name, component=component,
325 checksum=info.checksum, file_size=info.file_size)
326 )
327 self._table.insert(*records)
329 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredFileInfo:
330 # Docstring inherited from GenericBaseDatastore
332 if ref.id is None: 332 ↛ 333   (line 332 didn't jump to line 333, because the condition on line 332 was never true)
333 raise RuntimeError("Unable to retrieve information for unresolved DatasetRef")
335 where: Dict[str, Union[int, str]] = {"dataset_id": ref.id}
337 # If we have no component we want the row from this table without
338 # a component. If we do have a component we either need the row
339 # with no component or the row with the component, depending on how
340 this dataset was disassembled.
342 # if we are emptying trash we won't have real refs so can't constrain
343 # by component. Will need to fix this to return multiple matches
344 # in future.
345 component = None
346 try:
347 component = ref.datasetType.component()
348 except AttributeError:
349 pass
350 else:
351 if component is None: 351 ↛ 356   (line 351 didn't jump to line 356, because the condition on line 351 was never false)
352 where["component"] = NULLSTR
354 # Look for the dataset_id -- there might be multiple matches
355 # if we have disassembled the dataset.
356 records = list(self._table.fetch(**where))
357 if len(records) == 0: 357 ↛ 358   (line 357 didn't jump to line 358, because the condition on line 357 was never true)
358 raise KeyError(f"Unable to retrieve location associated with dataset {ref}.")
360 # if we are not asking for a component
361 if not component and len(records) != 1: 361 ↛ 362   (line 361 didn't jump to line 362, because the condition on line 361 was never true)
362 raise RuntimeError(f"Got {len(records)} from location query of dataset {ref}")
364 # if we had a FakeDatasetRef we pick the first record regardless
365 if isinstance(ref, FakeDatasetRef): 365 ↛ 366   (line 365 didn't jump to line 366, because the condition on line 365 was never true)
366 record = records[0]
367 else:
368 records_by_component = {}
369 for r in records:
370 this_component = r["component"] if r["component"] and r["component"] != NULLSTR else None
371 records_by_component[this_component] = r
373 # Look for component by name else fall back to the parent
374 for lookup in (component, None): 374 ↛ 379   (line 374 didn't jump to line 379, because the loop on line 374 didn't complete)
375 if lookup in records_by_component: 375 ↛ 374   (line 375 didn't jump to line 374, because the condition on line 375 was never false)
376 record = records_by_component[lookup]
377 break
378 else:
379 raise KeyError(f"Unable to retrieve location for component {component} associated with "
380 f"dataset {ref}.")
382 # Convert name of StorageClass to instance
383 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
385 return StoredFileInfo(formatter=record["formatter"],
386 path=record["path"],
387 storageClass=storageClass,
388 component=component,
389 checksum=record["checksum"],
390 file_size=record["file_size"])
392 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
393 # Docstring inherited from GenericBaseDatastore
395 # Look for the dataset_id -- there might be multiple matches
396 # if we have disassembled the dataset.
397 records = list(self._table.fetch(dataset_id=ref.id))
399 results = []
400 for record in records:
401 # Convert name of StorageClass to instance
402 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
403 component = record["component"] if (record["component"]
404 and record["component"] != NULLSTR) else None
406 info = StoredFileInfo(formatter=record["formatter"],
407 path=record["path"],
408 storageClass=storageClass,
409 component=component,
410 checksum=record["checksum"],
411 file_size=record["file_size"])
412 results.append(info)
414 return results
416 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]:
417 """Return all dataset refs associated with the supplied path.
419 Parameters
420 ----------
421 pathInStore : `str`
422 Path of interest in the data store.
424 Returns
425 -------
426 ids : `set` of `int`
427 All `DatasetRef` IDs associated with this path.
428 """
429 records = list(self._table.fetch(path=pathInStore))
430 ids = {r["dataset_id"] for r in records}
431 return ids
433 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
434 # Docstring inherited from GenericBaseDatastore
435 self._table.delete(dataset_id=ref.id)
437 def _get_dataset_location_info(self,
438 ref: DatasetRef) -> Tuple[Optional[Location], Optional[StoredFileInfo]]:
439 """Find the `Location` of the requested dataset in the
440 `Datastore` and the associated stored file information.
442 Parameters
443 ----------
444 ref : `DatasetRef`
445 Reference to the required `Dataset`.
447 Returns
448 -------
449 location : `Location`
450 Location of the dataset within the datastore.
451 Returns `None` if the dataset can not be located.
452 info : `StoredFileInfo`
453 Stored information about this file and its formatter.
454 """
455 # Get the file information (this will fail if no file)
456 try:
457 storedFileInfo = self.getStoredItemInfo(ref)
458 except KeyError:
459 return None, None
461 # Use the path to determine the location
462 location = self.locationFactory.fromPath(storedFileInfo.path)
464 return location, storedFileInfo
466 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
467 r"""Find all the `Location`\ s of the requested dataset in the
468 `Datastore` and the associated stored file information.
470 Parameters
471 ----------
472 ref : `DatasetRef`
473 Reference to the required `Dataset`.
475 Returns
476 -------
477 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
478 Location of the dataset within the datastore and
479 stored information about each file and its formatter.
480 """
481 # Get the file information (this will fail if no file)
482 records = self.getStoredItemsInfo(ref)
484 # Use the path to determine the location
485 return [(self.locationFactory.fromPath(r.path), r) for r in records]
487 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
488 """Check that there is only one dataset associated with the
489 specified artifact.
491 Parameters
492 ----------
493 ref : `DatasetRef` or `FakeDatasetRef`
494 Dataset to be removed.
495 location : `Location`
496 The location of the artifact to be removed.
498 Returns
499 -------
500 can_remove : `bool`
501 `True` if the artifact can be safely removed.
502 """
504 # Get all entries associated with this path
505 allRefs = self._registered_refs_per_artifact(location.pathInStore)
506 if not allRefs: 506 ↛ 507   (line 506 didn't jump to line 507, because the condition on line 506 was never true)
507 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
509 # Remove these refs from all the refs and if there is nothing left
510 # then we can delete
511 remainingRefs = allRefs - {ref.id}
513 if remainingRefs:
514 return False
515 return True
517 def _prepare_for_get(self, ref: DatasetRef,
518 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
519 """Check parameters for ``get`` and obtain formatter and
520 location.
522 Parameters
523 ----------
524 ref : `DatasetRef`
525 Reference to the required Dataset.
526 parameters : `dict`
527 `StorageClass`-specific parameters that specify, for example,
528 a slice of the dataset to be loaded.
530 Returns
531 -------
532 getInfo : `list` [`DatastoreFileGetInformation`]
533 Parameters needed to retrieve each file.
534 """
535 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
537 # Get file metadata and internal metadata
538 fileLocations = self._get_dataset_locations_info(ref)
539 if not fileLocations:
540 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
542 # The storage class we want to use eventually
543 refStorageClass = ref.datasetType.storageClass
545 # Check that the supplied parameters are suitable for the type read
546 refStorageClass.validateParameters(parameters)
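        # More than one stored artifact for a single ref can only mean the
        # composite was disassembled on put, one file per component.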
548 if len(fileLocations) > 1:
549 disassembled = True
550 else:
551 disassembled = False
553 # Is this a component request?
554 refComponent = ref.datasetType.component()
556 fileGetInfo = []
557 for location, storedFileInfo in fileLocations:
559 # The storage class used to write the file
560 writeStorageClass = storedFileInfo.storageClass
562 # If this has been disassembled we need read to match the write
563 if disassembled:
564 readStorageClass = writeStorageClass
565 else:
566 readStorageClass = refStorageClass
568 formatter = getInstanceOf(storedFileInfo.formatter,
569 FileDescriptor(location, readStorageClass=readStorageClass,
570 storageClass=writeStorageClass, parameters=parameters),
571 ref.dataId)
573 _, notFormatterParams = formatter.segregateParameters()
575 # Of the remaining parameters, extract the ones supported by
576 # this StorageClass (for components not all will be handled)
577 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
579 # The ref itself could be a component if the dataset was
580 # disassembled by butler, or we disassembled in datastore and
581 # components came from the datastore records
582 component = storedFileInfo.component if storedFileInfo.component else refComponent
584 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
585 assemblerParams, component, readStorageClass))
587 return fileGetInfo
589 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
590 """Check the arguments for ``put`` and obtain formatter and
591 location.
593 Parameters
594 ----------
595 inMemoryDataset : `object`
596 The dataset to store.
597 ref : `DatasetRef`
598 Reference to the associated Dataset.
600 Returns
601 -------
602 location : `Location`
603 The location to write the dataset.
604 formatter : `Formatter`
605 The `Formatter` to use to write the dataset.
607 Raises
608 ------
609 TypeError
610 Supplied object and storage class are inconsistent.
611 DatasetTypeNotSupportedError
612 The associated `DatasetType` is not handled by this datastore.
613 """
614 self._validate_put_parameters(inMemoryDataset, ref)
616 # Work out output file name
617 try:
618 template = self.templates.getTemplate(ref)
619 except KeyError as e:
620 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
622 location = self.locationFactory.fromPath(template.format(ref))
624 # Get the formatter based on the storage class
625 storageClass = ref.datasetType.storageClass
626 try:
627 formatter = self.formatterFactory.getFormatter(ref,
628 FileDescriptor(location,
629 storageClass=storageClass),
630 ref.dataId)
631 except KeyError as e:
632 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e
634 return location, formatter
636 @abstractmethod
637 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
638 """Standardize the path of a to-be-ingested file.
640 Parameters
641 ----------
642 path : `str`
643 Path of a file to be ingested.
644 transfer : `str`, optional
645 How (and whether) the dataset should be added to the datastore.
646 See `ingest` for details of transfer modes.
647 This implementation is provided only so
648 `NotImplementedError` can be raised if the mode is not supported;
649 actual transfers are deferred to `_extractIngestInfo`.
651 Returns
652 -------
653 path : `str`
654 New path in what the datastore considers standard form.
656 Notes
657 -----
658 Subclasses of `FileLikeDatastore` should implement this method instead
659 of `_prepIngest`. It should not modify the data repository or given
660 file in any way.
662 Raises
663 ------
664 NotImplementedError
665 Raised if the datastore does not support the given transfer mode
666 (including the case where ingest is not supported at all).
667 FileNotFoundError
668 Raised if one of the given files does not exist.
669 """
670 raise NotImplementedError("Must be implemented by subclasses.")
672 @abstractmethod
673 def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
674 formatter: Union[Formatter, Type[Formatter]],
675 transfer: Optional[str] = None) -> StoredFileInfo:
676 """Relocate (if necessary) and extract `StoredFileInfo` from a
677 to-be-ingested file.
679 Parameters
680 ----------
681 path : `str`
682 Path of a file to be ingested.
683 ref : `DatasetRef`
684 Reference for the dataset being ingested. Guaranteed to have
685 ``dataset_id is not None``.
686 formatter : `type` or `Formatter`
687 `Formatter` subclass to use for this dataset or an instance.
688 transfer : `str`, optional
689 How (and whether) the dataset should be added to the datastore.
690 See `ingest` for details of transfer modes.
692 Returns
693 -------
694 info : `StoredFileInfo`
695 Internal datastore record for this file. This will be inserted by
696 the caller; `_extractIngestInfo` is only responsible for
697 creating and populating the struct.
699 Raises
700 ------
701 FileNotFoundError
702 Raised if one of the given files does not exist.
703 FileExistsError
704 Raised if transfer is not `None` but the (internal) location the
705 file would be moved to is already occupied.
706 """
707 raise NotImplementedError("Must be implemented by subclasses.")
709 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
710 # Docstring inherited from Datastore._prepIngest.
711 filtered = []
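        # Keep only datasets with at least one ref acceptable to this
        # datastore's constraints; resolve the formatter to a class and
        # standardize the path here, deferring any actual transfer to
        # _extractIngestInfo.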
712 for dataset in datasets:
713 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
714 if not acceptable:
715 continue
716 else:
717 dataset.refs = acceptable
718 if dataset.formatter is None:
719 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
720 else:
721 assert isinstance(dataset.formatter, (type, str))
722 dataset.formatter = getClassOf(dataset.formatter)
723 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
724 filtered.append(dataset)
725 return _IngestPrepData(filtered)
727 @transactional
728 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
729 # Docstring inherited from Datastore._finishIngest.
730 refsAndInfos = []
731 for dataset in prepData.datasets:
732 # Do ingest as if the first dataset ref is associated with the file
733 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
734 transfer=transfer)
735 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
736 self._register_datasets(refsAndInfos)
738 @abstractmethod
739 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
740 """Write out in memory dataset to datastore.
742 Parameters
743 ----------
744 inMemoryDataset : `object`
745 Dataset to write to datastore.
746 ref : `DatasetRef`
747 Registry information associated with this dataset.
749 Returns
750 -------
751 info : `StoredFileInfo`
752 Information describing the artifact written to the datastore.
753 """
754 raise NotImplementedError()
756 @abstractmethod
757 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
758 ref: DatasetRef, isComponent: bool = False) -> Any:
759 """Read the artifact from datastore into in memory object.
761 Parameters
762 ----------
763 getInfo : `DatastoreFileGetInformation`
764 Information about the artifact within the datastore.
765 ref : `DatasetRef`
766 The registry information associated with this artifact.
767 isComponent : `bool`
768 Flag to indicate if a component is being read from this artifact.
770 Returns
771 -------
772 inMemoryDataset : `object`
773 The artifact as a python object.
774 """
775 raise NotImplementedError()
777 def exists(self, ref: DatasetRef) -> bool:
778 """Check if the dataset exists in the datastore.
780 Parameters
781 ----------
782 ref : `DatasetRef`
783 Reference to the required dataset.
785 Returns
786 -------
787 exists : `bool`
788 `True` if the entity exists in the `Datastore`.
789 """
790 fileLocations = self._get_dataset_locations_info(ref)
791 if not fileLocations:
792 return False
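        # A disassembled composite is stored as several artifacts; only
        # report existence if every one of them is present.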
793 for location, _ in fileLocations:
794 if not self._artifact_exists(location):
795 return False
797 return True
799 def getURIs(self, ref: DatasetRef,
800 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
801 """Return URIs associated with dataset.
803 Parameters
804 ----------
805 ref : `DatasetRef`
806 Reference to the required dataset.
807 predict : `bool`, optional
808 If the datastore does not know about the dataset, should it
809 return a predicted URI or not?
811 Returns
812 -------
813 primary : `ButlerURI`
814 The URI to the primary artifact associated with this dataset.
815 If the dataset was disassembled within the datastore this
816 may be `None`.
817 components : `dict`
818 URIs to any components associated with the dataset artifact.
819 Can be empty if there are no components.
820 """
822 primary: Optional[ButlerURI] = None
823 components: Dict[str, ButlerURI] = {}
825 # if this has never been written then we have to guess
826 if not self.exists(ref):
827 if not predict:
828 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
830 def predictLocation(thisRef: DatasetRef) -> Location:
831 template = self.templates.getTemplate(thisRef)
832 location = self.locationFactory.fromPath(template.format(thisRef))
833 storageClass = ref.datasetType.storageClass
834 formatter = self.formatterFactory.getFormatter(thisRef,
835 FileDescriptor(location,
836 storageClass=storageClass))
837 # Try to use the extension attribute but ignore problems if the
838 # formatter does not define one.
839 try:
840 location = formatter.makeUpdatedLocation(location)
841 except Exception:
842 # Use the default extension
843 pass
844 return location
846 doDisassembly = self.composites.shouldBeDisassembled(ref)
848 if doDisassembly:
850 for component, componentStorage in ref.datasetType.storageClass.components.items():
851 compTypeName = ref.datasetType.componentTypeName(component)
852 compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions,
853 storageClass=componentStorage)
854 compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False)
856 compLocation = predictLocation(compRef)
858 # Add a URI fragment to indicate this is a guess
859 components[component] = ButlerURI(compLocation.uri + "#predicted")
861 else:
863 location = predictLocation(ref)
865 # Add a URI fragment to indicate this is a guess
866 primary = ButlerURI(location.uri + "#predicted")
868 return primary, components
870 # If this is a ref that we have written we can get the path.
871 # Get file metadata and internal metadata
872 fileLocations = self._get_dataset_locations_info(ref)
874 if not fileLocations: 874 ↛ 875   (line 874 didn't jump to line 875, because the condition on line 874 was never true)
875 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
877 if len(fileLocations) == 1:
878 # No disassembly so this is the primary URI
879 primary = ButlerURI(fileLocations[0][0].uri)
881 else:
882 for location, storedFileInfo in fileLocations:
883 if storedFileInfo.component is None: 883 ↛ 884   (line 883 didn't jump to line 884, because the condition on line 883 was never true)
884 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
885 components[storedFileInfo.component] = ButlerURI(location.uri)
887 return primary, components
889 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
890 """URI to the Dataset.
892 Parameters
893 ----------
894 ref : `DatasetRef`
895 Reference to the required Dataset.
896 predict : `bool`
897 If `True`, allow URIs to be returned of datasets that have not
898 been written.
900 Returns
901 -------
902 uri : `ButlerURI`
903 URI pointing to the dataset within the datastore. If the
904 dataset does not exist in the datastore, and if ``predict`` is
905 `True`, the URI will be a prediction and will include a URI
906 fragment "#predicted".
907 If the datastore does not have entities that relate well
908 to the concept of a URI the returned URI will be
909 descriptive. The returned URI is not guaranteed to be obtainable.
911 Raises
912 ------
913 FileNotFoundError
914 Raised if a URI has been requested for a dataset that does not
915 exist and guessing is not allowed.
916 RuntimeError
917 Raised if a request is made for a single URI but multiple URIs
918 are associated with this dataset.
920 Notes
921 -----
922 When a predicted URI is requested an attempt will be made to form
923 a reasonable URI based on file templates and the expected formatter.
924 """
925 primary, components = self.getURIs(ref, predict)
926 if primary is None or components: 926 ↛ 927   (line 926 didn't jump to line 927, because the condition on line 926 was never true)
927 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
928 "Use Dataastore.getURIs() instead.")
929 return primary
931 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
932 """Load an InMemoryDataset from the store.
934 Parameters
935 ----------
936 ref : `DatasetRef`
937 Reference to the required Dataset.
938 parameters : `dict`
939 `StorageClass`-specific parameters that specify, for example,
940 a slice of the dataset to be loaded.
942 Returns
943 -------
944 inMemoryDataset : `object`
945 Requested dataset or slice thereof as an InMemoryDataset.
947 Raises
948 ------
949 FileNotFoundError
950 Requested dataset can not be retrieved.
951 TypeError
952 Return value from formatter has unexpected type.
953 ValueError
954 Formatter failed to process the dataset.
955 """
956 allGetInfo = self._prepare_for_get(ref, parameters)
957 refComponent = ref.datasetType.component()
959 if len(allGetInfo) > 1 and not refComponent:
960 # This was a disassembled dataset spread over multiple files
961 # and we need to put them all back together again.
962 # Read into memory and then assemble
963 usedParams = set()
964 components = {}
965 for getInfo in allGetInfo:
966 # assemblerParams are parameters not understood by the
967 # associated formatter.
968 usedParams.update(set(getInfo.assemblerParams))
970 component = getInfo.component
971 # We do not want the formatter to think it's reading
972 # a component though because it is really reading a
973 # standalone dataset -- always tell reader it is not a
974 # component.
975 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
977 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
979 # Any unused parameters will have to be passed to the assembler
980 if parameters:
981 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
982 else:
983 unusedParams = {}
985 # Process parameters
986 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
987 parameters=unusedParams)
989 else:
990 # Single file request or component from that composite file
991 allComponents = {i.component: i for i in allGetInfo}
992 for lookup in (refComponent, None): 992 ↛ 997   (line 992 didn't jump to line 997, because the loop on line 992 didn't complete)
993 if lookup in allComponents: 993 ↛ 992   (line 993 didn't jump to line 992, because the condition on line 993 was never false)
994 getInfo = allComponents[lookup]
995 break
996 else:
997 raise FileNotFoundError(f"Component {refComponent} not found "
998 f"for ref {ref} in datastore {self.name}")
1000 return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None)
1002 @transactional
1003 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1004 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1006 Parameters
1007 ----------
1008 inMemoryDataset : `object`
1009 The dataset to store.
1010 ref : `DatasetRef`
1011 Reference to the associated Dataset.
1013 Raises
1014 ------
1015 TypeError
1016 Supplied object and storage class are inconsistent.
1017 DatasetTypeNotSupportedError
1018 The associated `DatasetType` is not handled by this datastore.
1020 Notes
1021 -----
1022 If the datastore is configured to reject certain dataset types it
1023 is possible that the put will fail and raise a
1024 `DatasetTypeNotSupportedError`. The main use case for this is to
1025 allow `ChainedDatastore` to put to multiple datastores without
1026 requiring that every datastore accepts the dataset.
1027 """
1029 doDisassembly = self.composites.shouldBeDisassembled(ref)
1030 # doDisassembly = True
1032 artifacts = []
1033 if doDisassembly:
1034 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
1035 for component, componentInfo in components.items():
1036 compTypeName = ref.datasetType.componentTypeName(component)
1037 # Don't recurse because we want to take advantage of
1038 # bulk insert -- need a new DatasetRef that refers to the
1039 # same dataset_id but has the component DatasetType
1040 # DatasetType does not refer to the types of components
1041 # So we construct one ourselves.
1042 compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions,
1043 storageClass=componentInfo.storageClass)
1044 compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False)
1045 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1046 artifacts.append((compRef, storedInfo))
1047 else:
1048 # Write the entire thing out
1049 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1050 artifacts.append((ref, storedInfo))
1052 self._register_datasets(artifacts)
1054 @transactional
1055 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1056 """Indicate to the datastore that a dataset can be removed.
1058 Parameters
1059 ----------
1060 ref : `DatasetRef`
1061 Reference to the required Dataset.
1062 ignore_errors : `bool`
1063 If `True` return without error even if something went wrong.
1064 Problems could occur if another process is simultaneously trying
1065 to delete.
1067 Raises
1068 ------
1069 FileNotFoundError
1070 Attempt to remove a dataset that does not exist.
1071 """
1072 # Get file metadata and internal metadata
1073 log.debug("Trashing %s in datastore %s", ref, self.name)
1075 fileLocations = self._get_dataset_locations_info(ref)
1077 if not fileLocations:
1078 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1079 if ignore_errors:
1080 log.warning(err_msg)
1081 return
1082 else:
1083 raise FileNotFoundError(err_msg)
1085 for location, storedFileInfo in fileLocations:
1086 if not self._artifact_exists(location): 1086 ↛ 1087   (line 1086 didn't jump to line 1087, because the condition on line 1086 was never true)
1087 err_msg = f"Dataset is known to datastore {self.name} but " \
1088 f"associated artifact ({location.uri}) is missing"
1089 if ignore_errors:
1090 log.warning(err_msg)
1091 return
1092 else:
1093 raise FileNotFoundError(err_msg)
1095 # Mark dataset as trashed
1096 try:
1097 self._move_to_trash_in_registry(ref)
1098 except Exception as e:
1099 if ignore_errors:
1100 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1101 f"but encountered an error: {e}")
1102 pass
1103 else:
1104 raise
1106 @transactional
1107 def emptyTrash(self, ignore_errors: bool = True) -> None:
1108 """Remove all datasets from the trash.
1110 Parameters
1111 ----------
1112 ignore_errors : `bool`
1113 If `True` return without error even if something went wrong.
1114 Problems could occur if another process is simultaneously trying
1115 to delete.
1116 """
1117 log.debug("Emptying trash in datastore %s", self.name)
1118 # Context manager will empty trash iff we finish it without raising.
1119 with self._bridge.emptyTrash() as trashed:
1120 for ref in trashed:
1121 fileLocations = self._get_dataset_locations_info(ref)
1123 if not fileLocations: 1123 ↛ 1124   (line 1123 didn't jump to line 1124, because the condition on line 1123 was never true)
1124 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1125 if ignore_errors:
1126 log.warning(err_msg)
1127 continue
1128 else:
1129 raise FileNotFoundError(err_msg)
1131 for location, _ in fileLocations:
1133 if not self._artifact_exists(location): 1133 ↛ 1134   (line 1133 didn't jump to line 1134, because the condition on line 1133 was never true)
1134 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1135 if ignore_errors:
1136 log.warning(err_msg)
1137 continue
1138 else:
1139 raise FileNotFoundError(err_msg)
1141 # Can only delete the artifact if there are no references
1142 # to the file from untrashed dataset refs.
1143 if self._can_remove_dataset_artifact(ref, location):
1144 # Point of no return for this artifact
1145 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1146 try:
1147 self._delete_artifact(location)
1148 except Exception as e:
1149 if ignore_errors:
1150 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1151 location.uri, self.name, e)
1152 else:
1153 raise
1155 # Now must remove the entry from the internal registry even if
1156 # the artifact removal failed and was ignored,
1157 # otherwise the removal check above will never be true
1158 try:
1159 # There may be multiple rows associated with this ref
1160 # depending on disassembly
1161 self.removeStoredItemInfo(ref)
1162 except Exception as e:
1163 if ignore_errors:
1164 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1165 ref.id, location.uri, self.name, e)
1166 continue
1167 else:
1168 raise
1170 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1171 logFailures: bool = False) -> None:
1172 """Validate some of the configuration for this datastore.
1174 Parameters
1175 ----------
1176 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1177 Entities to test against this configuration. Can be differing
1178 types.
1179 logFailures : `bool`, optional
1180 If `True`, output a log message for every validation error
1181 detected.
1183 Raises
1184 ------
1185 DatastoreValidationError
1186 Raised if there is a validation problem with a configuration.
1187 All the problems are reported in a single exception.
1189 Notes
1190 -----
1191 This method checks that all the supplied entities have valid file
1192 templates and also have formatters defined.
1193 """
1195 templateFailed = None
1196 try:
1197 self.templates.validateTemplates(entities, logFailures=logFailures)
1198 except FileTemplateValidationError as e:
1199 templateFailed = str(e)
1201 formatterFailed = []
1202 for entity in entities:
1203 try:
1204 self.formatterFactory.getFormatterClass(entity)
1205 except KeyError as e:
1206 formatterFailed.append(str(e))
1207 if logFailures: 1207 ↛ 1202   (line 1207 didn't jump to line 1202, because the condition on line 1207 was never false)
1208 log.fatal("Formatter failure: %s", e)
1210 if templateFailed or formatterFailed:
1211 messages = []
1212 if templateFailed: 1212 ↛ 1213   (line 1212 didn't jump to line 1213, because the condition on line 1212 was never true)
1213 messages.append(templateFailed)
1214 if formatterFailed: 1214 ↛ 1216   (line 1214 didn't jump to line 1216, because the condition on line 1214 was never false)
1215 messages.append(",".join(formatterFailed))
1216 msg = ";\n".join(messages)
1217 raise DatastoreValidationError(msg)
1219 def getLookupKeys(self) -> Set[LookupKey]:
1220 # Docstring is inherited from base class
1221 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1222 self.constraints.getLookupKeys()
1224 def validateKey(self, lookupKey: LookupKey,
1225 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1226 # Docstring is inherited from base class
1227 # The key can be valid in either formatters or templates so we can
1228 # only check the template if it exists
1229 if lookupKey in self.templates:
1230 try:
1231 self.templates[lookupKey].validateTemplate(entity)
1232 except FileTemplateValidationError as e:
1233 raise DatastoreValidationError(e) from e