Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 80%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileLikeDatastore", )
27import logging
28from abc import abstractmethod
30from sqlalchemy import Integer, String
32from dataclasses import dataclass
33from typing import (
34 TYPE_CHECKING,
35 Any,
36 ClassVar,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 Optional,
42 Set,
43 Tuple,
44 Type,
45 Union,
46)
48from lsst.daf.butler import (
49 CompositesMap,
50 Config,
51 FileDataset,
52 DatasetRef,
53 DatasetType,
54 DatasetTypeNotSupportedError,
55 Datastore,
56 DatastoreConfig,
57 DatastoreValidationError,
58 FileDescriptor,
59 FileTemplates,
60 FileTemplateValidationError,
61 Formatter,
62 FormatterFactory,
63 Location,
64 LocationFactory,
65 StorageClass,
66 StoredFileInfo,
67)
69from lsst.daf.butler import ddl
70from lsst.daf.butler.registry.interfaces import (
71 ReadOnlyDatabaseError,
72 DatastoreRegistryBridge,
73 FakeDatasetRef,
74)
76from lsst.daf.butler.core.repoRelocation import replaceRoot
77from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional
78from .genericDatastore import GenericBaseDatastore
80if TYPE_CHECKING: 80 ↛ 81  (line 80 didn't jump to line 81, because the condition on line 80 was never true)
81 from lsst.daf.butler import LookupKey
82 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
84log = logging.getLogger(__name__)
86# String to use when a Python None is encountered
87NULLSTR = "__NULL_STRING__"
90class _IngestPrepData(Datastore.IngestPrepData):
91 """Helper class for FileLikeDatastore ingest implementation.
93 Parameters
94 ----------
95 datasets : `list` of `FileDataset`
96 Files to be ingested by this datastore.
97 """
98 def __init__(self, datasets: List[FileDataset]):
99 super().__init__(ref for dataset in datasets for ref in dataset.refs)
100 self.datasets = datasets
103@dataclass(frozen=True)
104class DatastoreFileGetInformation:
105 """Collection of useful parameters needed to retrieve a file from
106 a Datastore.
107 """
109 location: Location
110 """The location from which to read the dataset."""
112 formatter: Formatter
113 """The `Formatter` to use to deserialize the dataset."""
115 info: StoredFileInfo
116 """Stored information about this file and its formatter."""
118 assemblerParams: dict
119 """Parameters to use for post-processing the retrieved dataset."""
121 component: Optional[str]
122 """The component to be retrieved (can be `None`)."""
124 readStorageClass: StorageClass
125 """The `StorageClass` of the dataset being read."""
128class FileLikeDatastore(GenericBaseDatastore):
129 """Generic Datastore for file-based implementations.
131 Should always be sub-classed since key abstract methods are missing.
133 Parameters
134 ----------
135 config : `DatastoreConfig` or `str`
136 Configuration as either a `Config` object or URI to file.
137 bridgeManager : `DatastoreRegistryBridgeManager`
138 Object that manages the interface between `Registry` and datastores.
139 butlerRoot : `str`, optional
140 New datastore root to use to override the configuration value.
142 Raises
143 ------
144 ValueError
145 If root location does not exist and ``create`` is `False` in the
146 configuration.
147 """
149 defaultConfigFile: ClassVar[Optional[str]] = None
150 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
151 absolute path. Can be None if no defaults specified.
152 """
154 root: str
155 """Root directory or URI of this `Datastore`."""
157 locationFactory: LocationFactory
158 """Factory for creating locations relative to the datastore root."""
160 formatterFactory: FormatterFactory
161 """Factory for creating instances of formatters."""
163 templates: FileTemplates
164 """File templates that can be used by this `Datastore`."""
166 composites: CompositesMap
167 """Determines whether a dataset should be disassembled on put."""
169 @classmethod
170 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
171 """Set any filesystem-dependent config options for this Datastore to
172 be appropriate for a new empty repository with the given root.
174 Parameters
175 ----------
176 root : `str`
177 URI to the root of the data repository.
178 config : `Config`
179 A `Config` to update. Only the subset understood by
180 this component will be updated. Will not expand
181 defaults.
182 full : `Config`
183 A complete config with all defaults expanded that can be
184 converted to a `DatastoreConfig`. Read-only and will not be
185 modified by this method.
186 Repository-specific options that should not be obtained
187 from defaults when Butler instances are constructed
188 should be copied from ``full`` to ``config``.
189 overwrite : `bool`, optional
190 If `False`, do not modify a value in ``config`` if the value
191 already exists. Default is always to overwrite with the provided
192 ``root``.
194 Notes
195 -----
196 If a keyword is explicitly defined in the supplied ``config`` it
197 will not be overridden by this method if ``overwrite`` is `False`.
198 This allows explicit values set in external configs to be retained.
199 """
200 Config.updateParameters(DatastoreConfig, config, full,
201 toUpdate={"root": root},
202 toCopy=("cls", ("records", "table")), overwrite=overwrite)
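A minimal stand-alone sketch of the overwrite behaviour described above, using a plain dict in place of the real `Config`/`DatastoreConfig` classes; the helper name `update_root` is hypothetical.

def update_root(config: dict, root: str, overwrite: bool = True) -> None:
    """Set config["root"] unless it is already set and overwrite is False."""
    if overwrite or "root" not in config:
        config["root"] = root

cfg = {"root": "/old/repo"}
update_root(cfg, "/new/repo", overwrite=False)
assert cfg["root"] == "/old/repo"   # existing value retained
update_root(cfg, "/new/repo")       # default is to overwrite
assert cfg["root"] == "/new/repo"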
204 @classmethod
205 def makeTableSpec(cls) -> ddl.TableSpec:
206 return ddl.TableSpec(
207 fields=NamedValueSet([
208 ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
209 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
210 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
211 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
212 # Use empty string to indicate no component
213 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
214 # TODO: should checksum be Base64Bytes instead?
215 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
216 ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
217 ]),
218 unique=frozenset(), # type: ignore
219 )
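For reference, one row of the opaque records table defined above might look like the following; all values are illustrative, not taken from a real repository.

example_record = {
    "dataset_id": 42,                         # primary key, shared by all components
    "path": "run/calexp/calexp_000042.fits",  # path relative to the datastore root
    "formatter": "mypackage.formatters.FitsFormatter",  # hypothetical formatter class
    "storage_class": "Exposure",
    "component": "__NULL_STRING__",           # NULLSTR sentinel: no component
    "checksum": None,
    "file_size": 1048576,
}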
221 def __init__(self, config: Union[DatastoreConfig, str],
222 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
223 super().__init__(config, bridgeManager)
224 if "root" not in self.config: 224 ↛ 225  (line 224 didn't jump to line 225, because the condition on line 224 was never true)
225 raise ValueError("No root directory specified in configuration")
227 # Name ourselves either using an explicit name or a name
228 # derived from the (unexpanded) root
229 if "name" in self.config:
230 self.name = self.config["name"]
231 else:
232 # We use the unexpanded root in the name to indicate that this
233 # datastore can be moved without having to update registry.
234 self.name = "{}@{}".format(type(self).__name__,
235 self.config["root"])
237 # Support repository relocation in config
238 # Existence of self.root is checked in subclass
239 self.root = replaceRoot(self.config["root"], butlerRoot)
241 self.locationFactory = LocationFactory(self.root)
242 self.formatterFactory = FormatterFactory()
244 # Now associate formatters with storage classes
245 self.formatterFactory.registerFormatters(self.config["formatters"],
246 universe=bridgeManager.universe)
248 # Read the file naming templates
249 self.templates = FileTemplates(self.config["templates"],
250 universe=bridgeManager.universe)
252 # See if composites should be disassembled
253 self.composites = CompositesMap(self.config["composites"],
254 universe=bridgeManager.universe)
256 tableName = self.config["records", "table"]
257 try:
258 # Storage of paths and formatters, keyed by dataset_id
259 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
260 # Interface to Registry.
261 self._bridge = bridgeManager.register(self.name)
262 except ReadOnlyDatabaseError:
263 # If the database is read only and we just tried and failed to
264 # create a table, it means someone is trying to create a read-only
265 # butler client for an empty repo. That should be okay, as long
266 # as they then try to get any datasets before some other client
267 # creates the table. Chances are they're just validating
268 # configuration.
269 pass
271 # Determine whether checksums should be used
272 self.useChecksum = self.config.get("checksum", True)
274 def __str__(self) -> str:
275 return self.root
277 @property
278 def bridge(self) -> DatastoreRegistryBridge:
279 return self._bridge
281 @abstractmethod
282 def _artifact_exists(self, location: Location) -> bool:
283 """Check that an artifact exists in this datastore at the specified
284 location.
286 Parameters
287 ----------
288 location : `Location`
289 Expected location of the artifact associated with this datastore.
291 Returns
292 -------
293 exists : `bool`
294 `True` if the location can be found, `False` otherwise.
295 """
296 raise NotImplementedError()
298 @abstractmethod
299 def _delete_artifact(self, location: Location) -> None:
300 """Delete the artifact from the datastore.
302 Parameters
303 ----------
304 location : `Location`
305 Location of the artifact associated with this datastore.
306 """
307 raise NotImplementedError()
309 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
310 # Docstring inherited from GenericBaseDatastore
311 records = []
312 for ref, info in zip(refs, infos):
313 # Component should come from ref and fall back on info
314 component = ref.datasetType.component()
315 if component is None and info.component is not None: 315 ↛ 316  (line 315 didn't jump to line 316, because the condition on line 315 was never true)
316 component = info.component
317 if component is None:
318 # Use empty string since we want this to be part of the
319 # primary key.
320 component = NULLSTR
321 records.append(
322 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
323 storage_class=info.storageClass.name, component=component,
324 checksum=info.checksum, file_size=info.file_size)
325 )
326 self._table.insert(*records)
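The component column is part of the primary key, so `None` cannot be stored directly; a plain-Python sketch of the sentinel mapping used above (no registry involved, function names are illustrative).

NULLSTR = "__NULL_STRING__"

def encode_component(component):
    # None cannot live in a primary-key column, so store the sentinel instead
    return NULLSTR if component is None else component

def decode_component(value):
    # Convert the sentinel back to None when reading a record
    return None if value == NULLSTR else value

assert encode_component(None) == NULLSTR
assert decode_component(encode_component("image")) == "image"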
328 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredFileInfo:
329 # Docstring inherited from GenericBaseDatastore
331 if ref.id is None: 331 ↛ 332  (line 331 didn't jump to line 332, because the condition on line 331 was never true)
332 raise RuntimeError("Unable to retrieve information for unresolved DatasetRef")
334 where: Dict[str, Union[int, str]] = {"dataset_id": ref.id}
336 # If we have no component we want the row from this table without
337 # a component. If we do have a component we either need the row
338 # with no component or the row with the component, depending on how
339 # this dataset was disassembled.
341 # if we are emptying trash we won't have real refs so can't constrain
342 # by component. Will need to fix this to return multiple matches
343 # in future.
344 component = None
345 try:
346 component = ref.datasetType.component()
347 except AttributeError:
348 pass
349 else:
350 if component is None:
351 where["component"] = NULLSTR
353 # Look for the dataset_id -- there might be multiple matches
354 # if we have disassembled the dataset.
355 records = list(self._table.fetch(**where))
356 if len(records) == 0: 356 ↛ 357  (line 356 didn't jump to line 357, because the condition on line 356 was never true)
357 raise KeyError(f"Unable to retrieve location associated with dataset {ref}.")
359 # if we are not asking for a component
360 if not component and len(records) != 1: 360 ↛ 361  (line 360 didn't jump to line 361, because the condition on line 360 was never true)
361 raise RuntimeError(f"Got {len(records)} from location query of dataset {ref}")
363 # if we had a FakeDatasetRef we pick the first record regardless
364 if isinstance(ref, FakeDatasetRef): 364 ↛ 365  (line 364 didn't jump to line 365, because the condition on line 364 was never true)
365 record = records[0]
366 else:
367 records_by_component = {}
368 for r in records:
369 this_component = r["component"] if r["component"] and r["component"] != NULLSTR else None
370 records_by_component[this_component] = r
372 # Look for component by name else fall back to the parent
373 for lookup in (component, None): 373 ↛ 378  (line 373 didn't jump to line 378, because the loop on line 373 didn't complete)
374 if lookup in records_by_component: 374 ↛ 373  (line 374 didn't jump to line 373, because the condition on line 374 was never false)
375 record = records_by_component[lookup]
376 break
377 else:
378 raise KeyError(f"Unable to retrieve location for component {component} associated with "
379 f"dataset {ref}.")
381 # Convert name of StorageClass to instance
382 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
384 return StoredFileInfo(formatter=record["formatter"],
385 path=record["path"],
386 storageClass=storageClass,
387 component=component,
388 checksum=record["checksum"],
389 file_size=record["file_size"])
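A self-contained sketch of the lookup above: prefer the record matching the requested component and fall back to the parent record (component None). The records and paths here are invented for illustration.

records_by_component = {
    None: {"path": "parent.fits"},
    "wcs": {"path": "parent.wcs.yaml"},
}

def find_record(component):
    for lookup in (component, None):
        if lookup in records_by_component:
            return records_by_component[lookup]
    raise KeyError(f"No record for component {component!r}")

assert find_record("wcs")["path"] == "parent.wcs.yaml"   # exact component match
assert find_record("mask")["path"] == "parent.fits"      # falls back to the parent row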
391 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
392 # Docstring inherited from GenericBaseDatastore
394 # Look for the dataset_id -- there might be multiple matches
395 # if we have disassembled the dataset.
396 records = list(self._table.fetch(dataset_id=ref.id))
398 results = []
399 for record in records:
400 # Convert name of StorageClass to instance
401 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
402 component = record["component"] if (record["component"]
403 and record["component"] != NULLSTR) else None
405 info = StoredFileInfo(formatter=record["formatter"],
406 path=record["path"],
407 storageClass=storageClass,
408 component=component,
409 checksum=record["checksum"],
410 file_size=record["file_size"])
411 results.append(info)
413 return results
415 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]:
416 """Return all dataset refs associated with the supplied path.
418 Parameters
419 ----------
420 pathInStore : `str`
421 Path of interest in the data store.
423 Returns
424 -------
425 ids : `set` of `int`
426 All `DatasetRef` IDs associated with this path.
427 """
428 records = list(self._table.fetch(path=pathInStore))
429 ids = {r["dataset_id"] for r in records}
430 return ids
432 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
433 # Docstring inherited from GenericBaseDatastore
434 self._table.delete(dataset_id=ref.id)
436 def _get_dataset_location_info(self,
437 ref: DatasetRef) -> Tuple[Optional[Location], Optional[StoredFileInfo]]:
438 """Find the `Location` of the requested dataset in the
439 `Datastore` and the associated stored file information.
441 Parameters
442 ----------
443 ref : `DatasetRef`
444 Reference to the required `Dataset`.
446 Returns
447 -------
448 location : `Location`
449 Location of the dataset within the datastore.
450 Returns `None` if the dataset can not be located.
451 info : `StoredFileInfo`
452 Stored information about this file and its formatter.
453 """
454 # Get the file information (this will fail if no file)
455 try:
456 storedFileInfo = self.getStoredItemInfo(ref)
457 except KeyError:
458 return None, None
460 # Use the path to determine the location
461 location = self.locationFactory.fromPath(storedFileInfo.path)
463 return location, storedFileInfo
465 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
466 r"""Find all the `Location`\ s of the requested dataset in the
467 `Datastore` and the associated stored file information.
469 Parameters
470 ----------
471 ref : `DatasetRef`
472 Reference to the required `Dataset`.
474 Returns
475 -------
476 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
477 Location of the dataset within the datastore and
478 stored information about each file and its formatter.
479 """
480 # Get the file information (this will fail if no file)
481 records = self.getStoredItemsInfo(ref)
483 # Use the path to determine the location
484 return [(self.locationFactory.fromPath(r.path), r) for r in records]
486 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
487 """Check that there is only one dataset associated with the
488 specified artifact.
490 Parameters
491 ----------
492 ref : `DatasetRef` or `FakeDatasetRef`
493 Dataset to be removed.
494 location : `Location`
495 The location of the artifact to be removed.
497 Returns
498 -------
499 can_remove : `bool`
500 `True` if the artifact can be safely removed.
501 """
503 # Get all entries associated with this path
504 allRefs = self._registered_refs_per_artifact(location.pathInStore)
505 if not allRefs: 505 ↛ 506  (line 505 didn't jump to line 506, because the condition on line 505 was never true)
506 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
508 # Get all the refs associated with this dataset if it is a composite
509 theseRefs = {r.id for r in ref.allRefs()}
511 # Remove these refs from all the refs and if there is nothing left
512 # then we can delete
513 remainingRefs = allRefs - theseRefs
515 if remainingRefs:
516 return False
517 return True
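The check above is effectively reference counting on the artifact path; a minimal sketch with plain sets (the ids are invented).

def can_remove(all_ids_for_path: set, ids_being_removed: set) -> bool:
    # The artifact may be deleted only if no other dataset ids still use it
    return not (all_ids_for_path - ids_being_removed)

assert can_remove({1, 2}, {1, 2}) is True       # nothing else references the file
assert can_remove({1, 2, 3}, {1, 2}) is False   # dataset 3 still needs it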
519 def _prepare_for_get(self, ref: DatasetRef,
520 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
521 """Check parameters for ``get`` and obtain formatter and
522 location.
524 Parameters
525 ----------
526 ref : `DatasetRef`
527 Reference to the required Dataset.
528 parameters : `dict`
529 `StorageClass`-specific parameters that specify, for example,
530 a slice of the dataset to be loaded.
532 Returns
533 -------
534 getInfo : `list` [`DatastoreFileGetInformation`]
535 Parameters needed to retrieve each file.
536 """
537 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
539 # Get file metadata and internal metadata
540 fileLocations = self._get_dataset_locations_info(ref)
541 if not fileLocations:
542 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
544 # The storage class we want to use eventually
545 refStorageClass = ref.datasetType.storageClass
547 # Check that the supplied parameters are suitable for the type read
548 refStorageClass.validateParameters(parameters)
550 if len(fileLocations) > 1:
551 disassembled = True
552 else:
553 disassembled = False
555 # Is this a component request?
556 refComponent = ref.datasetType.component()
558 fileGetInfo = []
559 for location, storedFileInfo in fileLocations:
561 # The storage class used to write the file
562 writeStorageClass = storedFileInfo.storageClass
564 # If this has been disassembled we need read to match the write
565 if disassembled:
566 readStorageClass = writeStorageClass
567 else:
568 readStorageClass = refStorageClass
570 formatter = getInstanceOf(storedFileInfo.formatter,
571 FileDescriptor(location, readStorageClass=readStorageClass,
572 storageClass=writeStorageClass, parameters=parameters),
573 ref.dataId)
575 _, notFormatterParams = formatter.segregateParameters()
577 # Of the remaining parameters, extract the ones supported by
578 # this StorageClass (for components not all will be handled)
579 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
581 # The ref itself could be a component if the dataset was
582 # disassembled by butler, or we disassembled in datastore and
583 # components came from the datastore records
584 component = storedFileInfo.component if storedFileInfo.component else refComponent
586 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
587 assemblerParams, component, readStorageClass))
589 return fileGetInfo
591 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
592 """Check the arguments for ``put`` and obtain formatter and
593 location.
595 Parameters
596 ----------
597 inMemoryDataset : `object`
598 The dataset to store.
599 ref : `DatasetRef`
600 Reference to the associated Dataset.
602 Returns
603 -------
604 location : `Location`
605 The location to write the dataset.
606 formatter : `Formatter`
607 The `Formatter` to use to write the dataset.
609 Raises
610 ------
611 TypeError
612 Supplied object and storage class are inconsistent.
613 DatasetTypeNotSupportedError
614 The associated `DatasetType` is not handled by this datastore.
615 """
616 self._validate_put_parameters(inMemoryDataset, ref)
618 # Work out output file name
619 try:
620 template = self.templates.getTemplate(ref)
621 except KeyError as e:
622 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
624 location = self.locationFactory.fromPath(template.format(ref))
626 # Get the formatter based on the storage class
627 storageClass = ref.datasetType.storageClass
628 try:
629 formatter = self.formatterFactory.getFormatter(ref,
630 FileDescriptor(location,
631 storageClass=storageClass),
632 ref.dataId)
633 except KeyError as e:
634 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e
636 return location, formatter
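A rough analogue of the template step above, using str.format instead of the real `FileTemplates` machinery; the template string and field names are hypothetical. In the datastore, failure to find a template for a ref is reported as `DatasetTypeNotSupportedError`.

template = "{run}/{datasetType}/{datasetType}_{visit}_{detector}"
path = template.format(run="ingest/run", datasetType="calexp", visit=903334, detector=22)
print(path)   # ingest/run/calexp/calexp_903334_22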
638 @abstractmethod
639 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
640 """Standardize the path of a to-be-ingested file.
642 Parameters
643 ----------
644 path : `str`
645 Path of a file to be ingested.
646 transfer : `str`, optional
647 How (and whether) the dataset should be added to the datastore.
648 See `ingest` for details of transfer modes.
649 This implementation is provided only so
650 `NotImplementedError` can be raised if the mode is not supported;
651 actual transfers are deferred to `_extractIngestInfo`.
653 Returns
654 -------
655 path : `str`
656 New path in what the datastore considers standard form.
658 Notes
659 -----
660 Subclasses of `FileLikeDatastore` should implement this method instead
661 of `_prepIngest`. It should not modify the data repository or given
662 file in any way.
664 Raises
665 ------
666 NotImplementedError
667 Raised if the datastore does not support the given transfer mode
668 (including the case where ingest is not supported at all).
669 FileNotFoundError
670 Raised if one of the given files does not exist.
671 """
672 raise NotImplementedError("Must be implemented by subclasses.")
674 @abstractmethod
675 def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
676 formatter: Union[Formatter, Type[Formatter]],
677 transfer: Optional[str] = None) -> StoredFileInfo:
678 """Relocate (if necessary) and extract `StoredFileInfo` from a
679 to-be-ingested file.
681 Parameters
682 ----------
683 path : `str`
684 Path of a file to be ingested.
685 ref : `DatasetRef`
686 Reference for the dataset being ingested. Guaranteed to have
687 ``dataset_id is not None``.
688 formatter : `type` or `Formatter`
689 `Formatter` subclass to use for this dataset or an instance.
690 transfer : `str`, optional
691 How (and whether) the dataset should be added to the datastore.
692 See `ingest` for details of transfer modes.
694 Returns
695 -------
696 info : `StoredFileInfo`
697 Internal datastore record for this file. This will be inserted by
698 the caller; `_extractIngestInfo` is only responsible for
699 creating and populating the struct.
701 Raises
702 ------
703 FileNotFoundError
704 Raised if one of the given files does not exist.
705 FileExistsError
706 Raised if transfer is not `None` but the (internal) location the
707 file would be moved to is already occupied.
708 """
709 raise NotImplementedError("Must be implemented by subclasses.")
711 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
712 # Docstring inherited from Datastore._prepIngest.
713 filtered = []
714 for dataset in datasets:
715 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
716 if not acceptable:
717 continue
718 else:
719 dataset.refs = acceptable
720 if dataset.formatter is None:
721 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
722 else:
723 dataset.formatter = getClassOf(dataset.formatter)
724 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
725 filtered.append(dataset)
726 return _IngestPrepData(filtered)
728 @transactional
729 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
730 # Docstring inherited from Datastore._finishIngest.
731 refsAndInfos = []
732 for dataset in prepData.datasets:
733 # Do ingest as if the first dataset ref is associated with the file
734 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
735 transfer=transfer)
736 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
737 self._register_datasets(refsAndInfos)
739 @abstractmethod
740 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
741 """Write out an in-memory dataset to the datastore.
743 Parameters
744 ----------
745 inMemoryDataset : `object`
746 Dataset to write to datastore.
747 ref : `DatasetRef`
748 Registry information associated with this dataset.
750 Returns
751 -------
752 info : `StoredFileInfo`
753 Information describing the artifact written to the datastore.
754 """
755 raise NotImplementedError()
757 @abstractmethod
758 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
759 ref: DatasetRef, isComponent: bool = False) -> Any:
760 """Read the artifact from the datastore into an in-memory object.
762 Parameters
763 ----------
764 getInfo : `DatastoreFileGetInformation`
765 Information about the artifact within the datastore.
766 ref : `DatasetRef`
767 The registry information associated with this artifact.
768 isComponent : `bool`
769 Flag to indicate if a component is being read from this artifact.
771 Returns
772 -------
773 inMemoryDataset : `object`
774 The artifact as a python object.
775 """
776 raise NotImplementedError()
778 def exists(self, ref: DatasetRef) -> bool:
779 """Check if the dataset exists in the datastore.
781 Parameters
782 ----------
783 ref : `DatasetRef`
784 Reference to the required dataset.
786 Returns
787 -------
788 exists : `bool`
789 `True` if the entity exists in the `Datastore`.
790 """
791 fileLocations = self._get_dataset_locations_info(ref)
792 if not fileLocations:
793 return False
794 for location, _ in fileLocations:
795 if not self._artifact_exists(location):
796 return False
798 return True
800 def getUri(self, ref: DatasetRef, predict: bool = False) -> str:
801 """URI to the Dataset.
803 Parameters
804 ----------
805 ref : `DatasetRef`
806 Reference to the required Dataset.
807 predict : `bool`
808 If `True`, allow URIs to be returned of datasets that have not
809 been written.
811 Returns
812 -------
813 uri : `str`
814 URI string pointing to the dataset within the datastore. If the
815 dataset does not exist in the datastore, and if ``predict`` is
816 `True`, the URI will be a prediction and will include a URI
817 fragment "#predicted".
818 If the datastore does not have entities that relate well
819 to the concept of a URI the returned URI string will be
820 descriptive. The returned URI is not guaranteed to be obtainable.
822 Raises
823 ------
824 FileNotFoundError
825 A URI has been requested for a dataset that does not exist and
826 guessing is not allowed.
828 Notes
829 -----
830 When a predicted URI is requested an attempt will be made to form
831 a reasonable URI based on file templates and the expected formatter.
832 """
833 # if this has never been written then we have to guess
834 if not self.exists(ref):
835 if not predict:
836 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
838 template = self.templates.getTemplate(ref)
839 location = self.locationFactory.fromPath(template.format(ref))
840 storageClass = ref.datasetType.storageClass
841 formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location,
842 storageClass=storageClass))
843 # Try to use the extension attribute but ignore problems if the
844 # formatter does not define one.
845 try:
846 location = formatter.makeUpdatedLocation(location)
847 except Exception:
848 # Use the default extension
849 pass
851 # Add a URI fragment to indicate this is a guess
852 return location.uri + "#predicted"
854 # If this is a ref that we have written we can get the path.
855 # Get file metadata and internal metadata
856 storedFileInfo = self.getStoredItemInfo(ref)
858 # Use the path to determine the location
859 location = self.locationFactory.fromPath(storedFileInfo.path)
861 return location.uri
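A small sketch of the "#predicted" fragment convention described in `getUri`; the URI value is made up.

def predicted_uri(base_uri: str) -> str:
    # A guessed location is flagged with a URI fragment so callers can tell
    # it apart from the URI of an artifact that actually exists.
    return base_uri + "#predicted"

uri = predicted_uri("file:///repo/calexp/calexp_903334_22.fits")
assert uri.endswith("#predicted")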
863 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
864 """Load an InMemoryDataset from the store.
866 Parameters
867 ----------
868 ref : `DatasetRef`
869 Reference to the required Dataset.
870 parameters : `dict`
871 `StorageClass`-specific parameters that specify, for example,
872 a slice of the dataset to be loaded.
874 Returns
875 -------
876 inMemoryDataset : `object`
877 Requested dataset or slice thereof as an InMemoryDataset.
879 Raises
880 ------
881 FileNotFoundError
882 Requested dataset can not be retrieved.
883 TypeError
884 Return value from formatter has unexpected type.
885 ValueError
886 Formatter failed to process the dataset.
887 """
888 allGetInfo = self._prepare_for_get(ref, parameters)
889 refComponent = ref.datasetType.component()
891 if len(allGetInfo) > 1 and not refComponent:
892 # This was a disassembled dataset spread over multiple files
893 # and we need to put them all back together again.
894 # Read into memory and then assemble
895 usedParams = set()
896 components = {}
897 for getInfo in allGetInfo:
898 # assemblerParams are parameters not understood by the
899 # associated formatter.
900 usedParams.update(set(getInfo.assemblerParams))
902 component = getInfo.component
903 # We do not want the formatter to think it's reading
904 # a component though because it is really reading a
905 # standalone dataset -- always tell reader it is not a
906 # component.
907 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
909 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
911 # Any unused parameters will have to be passed to the assembler
912 if parameters:
913 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
914 else:
915 unusedParams = {}
917 # Process parameters
918 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
919 parameters=unusedParams)
921 else:
922 # Single file request or component from that composite file
923 allComponents = {i.component: i for i in allGetInfo}
924 for lookup in (refComponent, None): 924 ↛ 929  (line 924 didn't jump to line 929, because the loop on line 924 didn't complete)
925 if lookup in allComponents: 925 ↛ 924  (line 925 didn't jump to line 924, because the condition on line 925 was never false)
926 getInfo = allComponents[lookup]
927 break
928 else:
929 raise FileNotFoundError(f"Component {refComponent} not found "
930 f"for ref {ref} in datastore {self.name}")
932 return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None)
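When a disassembled dataset is reassembled in `get`, parameters already consumed while reading individual components are not passed to the assembler again; a plain-dict sketch of that split (the parameter names are invented).

parameters = {"bbox": (0, 0, 10, 10), "origin": "LOCAL"}
used_params = {"bbox"}   # handled per component while reading the artifacts

unused_params = {k: v for k, v in parameters.items() if k not in used_params}
assert unused_params == {"origin": "LOCAL"}   # only these go to handleParameters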
934 @transactional
935 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
936 """Write an InMemoryDataset with a given `DatasetRef` to the store.
938 Parameters
939 ----------
940 inMemoryDataset : `object`
941 The dataset to store.
942 ref : `DatasetRef`
943 Reference to the associated Dataset.
945 Raises
946 ------
947 TypeError
948 Supplied object and storage class are inconsistent.
949 DatasetTypeNotSupportedError
950 The associated `DatasetType` is not handled by this datastore.
952 Notes
953 -----
954 If the datastore is configured to reject certain dataset types it
955 is possible that the put will fail and raise a
956 `DatasetTypeNotSupportedError`. The main use case for this is to
957 allow `ChainedDatastore` to put to multiple datastores without
958 requiring that every datastore accepts the dataset.
959 """
961 doDisassembly = self.composites.shouldBeDisassembled(ref)
962 # doDisassembly = True
964 artifacts = []
965 if doDisassembly:
966 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
967 for component, componentInfo in components.items():
968 compTypeName = ref.datasetType.componentTypeName(component)
969 # Don't recurse because we want to take advantage of
970 # bulk insert -- need a new DatasetRef that refers to the
971 # same dataset_id but has the component DatasetType.
972 # DatasetType does not refer to the types of its components,
973 # so we construct one ourselves.
974 compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions,
975 storageClass=componentInfo.storageClass)
976 compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False)
977 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
978 artifacts.append((compRef, storedInfo))
979 else:
980 # Write the entire thing out
981 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
982 artifacts.append((ref, storedInfo))
984 self._register_datasets(artifacts)
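A schematic view of the disassembly branch of `put`: each component becomes its own artifact record sharing the parent dataset_id. All names, paths, and values here are illustrative only.

components = {"image": "<image bytes>", "mask": "<mask bytes>", "variance": "<variance bytes>"}
dataset_id = 42

artifacts = []
for name, payload in components.items():
    # One file per component; every row reuses the parent dataset_id so the
    # records table groups the pieces back together on read.
    artifacts.append({"dataset_id": dataset_id, "component": name,
                      "path": f"calexp_{dataset_id}.{name}.fits"})

assert len(artifacts) == len(components)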
986 @transactional
987 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
988 """Indicate to the datastore that a dataset can be removed.
990 Parameters
991 ----------
992 ref : `DatasetRef`
993 Reference to the required Dataset.
994 ignore_errors : `bool`
995 If `True` return without error even if something went wrong.
996 Problems could occur if another process is simultaneously trying
997 to delete.
999 Raises
1000 ------
1001 FileNotFoundError
1002 Attempt to remove a dataset that does not exist.
1003 """
1004 # Get file metadata and internal metadata
1005 log.debug("Trashing %s in datastore %s", ref, self.name)
1007 fileLocations = self._get_dataset_locations_info(ref)
1009 if not fileLocations:
1010 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1011 if ignore_errors:
1012 log.warning(err_msg)
1013 return
1014 else:
1015 raise FileNotFoundError(err_msg)
1017 for location, storedFileInfo in fileLocations:
1018 if not self._artifact_exists(location): 1018 ↛ 1019  (line 1018 didn't jump to line 1019, because the condition on line 1018 was never true)
1019 err_msg = f"Dataset is known to datastore {self.name} but " \
1020 f"associated artifact ({location.uri}) is missing"
1021 if ignore_errors:
1022 log.warning(err_msg)
1023 return
1024 else:
1025 raise FileNotFoundError(err_msg)
1027 # Mark dataset as trashed
1028 try:
1029 self._move_to_trash_in_registry(ref)
1030 except Exception as e:
1031 if ignore_errors:
1032 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1033 f"but encountered an error: {e}")
1034 pass
1035 else:
1036 raise
1038 @transactional
1039 def emptyTrash(self, ignore_errors: bool = True) -> None:
1040 """Remove all datasets from the trash.
1042 Parameters
1043 ----------
1044 ignore_errors : `bool`
1045 If `True` return without error even if something went wrong.
1046 Problems could occur if another process is simultaneously trying
1047 to delete.
1048 """
1049 log.debug("Emptying trash in datastore %s", self.name)
1050 # Context manager will empty trash iff we finish it without raising.
1051 with self._bridge.emptyTrash() as trashed:
1052 for ref in trashed:
1053 fileLocations = self._get_dataset_locations_info(ref)
1055 if not fileLocations: 1055 ↛ 1056  (line 1055 didn't jump to line 1056, because the condition on line 1055 was never true)
1056 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1057 if ignore_errors:
1058 log.warning(err_msg)
1059 continue
1060 else:
1061 raise FileNotFoundError(err_msg)
1063 for location, _ in fileLocations:
1065 if not self._artifact_exists(location): 1065 ↛ 1066  (line 1065 didn't jump to line 1066, because the condition on line 1065 was never true)
1066 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1067 if ignore_errors:
1068 log.warning(err_msg)
1069 continue
1070 else:
1071 raise FileNotFoundError(err_msg)
1073 # Can only delete the artifact if there are no references
1074 # to the file from untrashed dataset refs.
1075 if self._can_remove_dataset_artifact(ref, location):
1076 # Point of no return for this artifact
1077 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1078 try:
1079 self._delete_artifact(location)
1080 except Exception as e:
1081 if ignore_errors:
1082 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1083 location.uri, self.name, e)
1084 else:
1085 raise
1087 # Now must remove the entry from the internal registry even if
1088 # the artifact removal failed and was ignored,
1089 # otherwise the removal check above will never be true
1090 try:
1091 # There may be multiple rows associated with this ref
1092 # depending on disassembly
1093 self.removeStoredItemInfo(ref)
1094 except Exception as e:
1095 if ignore_errors:
1096 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1097 ref.id, location.uri, self.name, e)
1098 continue
1099 else:
1100 raise
1102 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1103 logFailures: bool = False) -> None:
1104 """Validate some of the configuration for this datastore.
1106 Parameters
1107 ----------
1108 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1109 Entities to test against this configuration. Can be differing
1110 types.
1111 logFailures : `bool`, optional
1112 If `True`, output a log message for every validation error
1113 detected.
1115 Raises
1116 ------
1117 DatastoreValidationError
1118 Raised if there is a validation problem with a configuration.
1119 All the problems are reported in a single exception.
1121 Notes
1122 -----
1123 This method checks that all the supplied entities have valid file
1124 templates and also have formatters defined.
1125 """
1127 templateFailed = None
1128 try:
1129 self.templates.validateTemplates(entities, logFailures=logFailures)
1130 except FileTemplateValidationError as e:
1131 templateFailed = str(e)
1133 formatterFailed = []
1134 for entity in entities:
1135 try:
1136 self.formatterFactory.getFormatterClass(entity)
1137 except KeyError as e:
1138 formatterFailed.append(str(e))
1139 if logFailures: 1139 ↛ 1134  (line 1139 didn't jump to line 1134, because the condition on line 1139 was never false)
1140 log.fatal("Formatter failure: %s", e)
1142 if templateFailed or formatterFailed:
1143 messages = []
1144 if templateFailed: 1144 ↛ 1145  (line 1144 didn't jump to line 1145, because the condition on line 1144 was never true)
1145 messages.append(templateFailed)
1146 if formatterFailed: 1146 ↛ 1148  (line 1146 didn't jump to line 1148, because the condition on line 1146 was never false)
1147 messages.append(",".join(formatterFailed))
1148 msg = ";\n".join(messages)
1149 raise DatastoreValidationError(msg)
1151 def getLookupKeys(self) -> Set[LookupKey]:
1152 # Docstring is inherited from base class
1153 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1154 self.constraints.getLookupKeys()
1156 def validateKey(self, lookupKey: LookupKey,
1157 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1158 # Docstring is inherited from base class
1159 # The key can be valid in either formatters or templates so we can
1160 # only check the template if it exists
1161 if lookupKey in self.templates:
1162 try:
1163 self.templates[lookupKey].validateTemplate(entity)
1164 except FileTemplateValidationError as e:
1165 raise DatastoreValidationError(e) from e