Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 81%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileLikeDatastore", )
27import logging
28from abc import abstractmethod
30from sqlalchemy import Integer, String
32from dataclasses import dataclass
33from typing import Optional, List, Type
35from lsst.daf.butler import (
36 CompositesMap,
37 Config,
38 FileDataset,
39 DatasetRef,
40 DatasetType,
41 DatasetTypeNotSupportedError,
42 Datastore,
43 DatastoreConfig,
44 DatastoreValidationError,
45 FileDescriptor,
46 FileTemplates,
47 FileTemplateValidationError,
48 Formatter,
49 FormatterFactory,
50 Location,
51 LocationFactory,
52 StorageClass,
53 StoredFileInfo,
54)
56from lsst.daf.butler import ddl
57from lsst.daf.butler.registry.interfaces import (
58 ReadOnlyDatabaseError,
59 DatastoreRegistryBridge,
60 FakeDatasetRef,
61)
63from lsst.daf.butler.core.repoRelocation import replaceRoot
64from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional
65from .genericDatastore import GenericBaseDatastore
67log = logging.getLogger(__name__)
69# String to use when a Python None is encountered
70NULLSTR = "__NULL_STRING__"
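# A minimal sketch (illustrative only, not a helper defined in this module) of
# the mapping applied when records are written and read back below: the
# ``component`` column is part of the records-table primary key and cannot be
# NULL, so a missing component is round-tripped through the sentinel:
#
#     stored = component if component is not None else NULLSTR
#     component = stored if stored != NULLSTR else None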
73class _IngestPrepData(Datastore.IngestPrepData):
74 """Helper class for FileLikeDatastore ingest implementation.
76 Parameters
77 ----------
78 datasets : `list` of `FileDataset`
79 Files to be ingested by this datastore.
80 """
81 def __init__(self, datasets: List[FileDataset]):
82 super().__init__(ref for dataset in datasets for ref in dataset.refs)
83 self.datasets = datasets
86@dataclass(frozen=True)
87class DatastoreFileGetInformation:
88 """Collection of useful parameters needed to retrieve a file from
89 a Datastore.
90 """
92 location: Location
93 """The location from which to read the dataset."""
95 formatter: Formatter
96 """The `Formatter` to use to deserialize the dataset."""
98 info: StoredFileInfo
99 """Stored information about this file and its formatter."""
101 assemblerParams: dict
102 """Parameters to use for post-processing the retrieved dataset."""
104 component: Optional[str]
105 """The component to be retrieved (can be `None`)."""
107 readStorageClass: StorageClass
108 """The `StorageClass` of the dataset being read."""
111class FileLikeDatastore(GenericBaseDatastore):
112 """Generic Datastore for file-based implementations.
114 Should always be sub-classed since key abstract methods are missing.
116 Parameters
117 ----------
118 config : `DatastoreConfig` or `str`
119 Configuration as either a `Config` object or URI to file.
120 bridgeManager : `DatastoreRegistryBridgeManager`
121 Object that manages the interface between `Registry` and datastores.
122 butlerRoot : `str`, optional
123 New datastore root to use to override the configuration value.
125 Raises
126 ------
127 ValueError
128 If root location does not exist and ``create`` is `False` in the
129 configuration.
130 """
132 defaultConfigFile = None
133 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
134 absolute path. Can be None if no defaults specified.
135 """
137 root: str
138 """Root directory or URI of this `Datastore`."""
140 locationFactory: LocationFactory
141 """Factory for creating locations relative to the datastore root."""
143 formatterFactory: FormatterFactory
144 """Factory for creating instances of formatters."""
146 templates: FileTemplates
147 """File templates that can be used by this `Datastore`."""
149 composites: CompositesMap
150 """Determines whether a dataset should be disassembled on put."""
152 @classmethod
153 def setConfigRoot(cls, root, config, full, overwrite=True):
154 """Set any filesystem-dependent config options for this Datastore to
155 be appropriate for a new empty repository with the given root.
157 Parameters
158 ----------
159 root : `str`
160 URI to the root of the data repository.
161 config : `Config`
162 A `Config` to update. Only the subset understood by
163 this component will be updated. Will not expand
164 defaults.
165 full : `Config`
166 A complete config with all defaults expanded that can be
167 converted to a `DatastoreConfig`. Read-only and will not be
168 modified by this method.
169 Repository-specific options that should not be obtained
170 from defaults when Butler instances are constructed
171 should be copied from ``full`` to ``config``.
172 overwrite : `bool`, optional
173 If `False`, do not modify a value in ``config`` if the value
174 already exists. Default is always to overwrite with the provided
175 ``root``.
177 Notes
178 -----
179 If a keyword is explicitly defined in the supplied ``config`` it
180 will not be overridden by this method if ``overwrite`` is `False`.
181 This allows explicit values set in external configs to be retained.
182 """
183 Config.updateParameters(DatastoreConfig, config, full,
184 toUpdate={"root": root},
185 toCopy=("cls", ("records", "table")), overwrite=overwrite)
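        # Rough sketch of the intended effect (assuming the keys are absent
        # from ``config``; the real Config.updateParameters also honours
        # ``overwrite`` and nested keys):
        #
        #     config["root"] = root
        #     config["cls"] = full["cls"]
        #     config["records", "table"] = full["records", "table"]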
187 @classmethod
188 def makeTableSpec(cls):
189 return ddl.TableSpec(
190 fields=NamedValueSet([
191 ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
192 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
193 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
194 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
195 # Use the NULLSTR sentinel to indicate no component
196 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
197 # TODO: should checksum be Base64Bytes instead?
198 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
199 ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
200 ]),
201 unique=frozenset(),
202 )
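        # An illustrative record for the table defined above (all values are
        # hypothetical placeholders; the columns mirror the dict assembled in
        # ``addStoredItemInfo`` below):
        #
        #     {"dataset_id": 42, "path": "run/dataset_42.ext",
        #      "formatter": "some.module.SomeFormatter",
        #      "storage_class": "SomeStorageClass", "component": NULLSTR,
        #      "checksum": None, "file_size": 1024}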
204 def __init__(self, config, bridgeManager, butlerRoot=None):
205 super().__init__(config, bridgeManager)
206 if "root" not in self.config: 206 ↛ 207line 206 didn't jump to line 207, because the condition on line 206 was never true
207 raise ValueError("No root directory specified in configuration")
209 # Name ourselves either using an explicit name or a name
210 # derived from the (unexpanded) root
211 if "name" in self.config:
212 self.name = self.config["name"]
213 else:
214 # We use the unexpanded root in the name to indicate that this
215 # datastore can be moved without having to update registry.
216 self.name = "{}@{}".format(type(self).__name__,
217 self.config["root"])
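            # Illustrative example (hypothetical values): a subclass called
            # PosixDatastore configured with an unexpanded root of
            # "<butlerRoot>" would be named "PosixDatastore@<butlerRoot>".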
219 # Support repository relocation in config
220 # Existence of self.root is checked in subclass
221 self.root = replaceRoot(self.config["root"], butlerRoot)
223 self.locationFactory = LocationFactory(self.root)
224 self.formatterFactory = FormatterFactory()
226 # Now associate formatters with storage classes
227 self.formatterFactory.registerFormatters(self.config["formatters"],
228 universe=bridgeManager.universe)
230 # Read the file naming templates
231 self.templates = FileTemplates(self.config["templates"],
232 universe=bridgeManager.universe)
234 # See if composites should be disassembled
235 self.composites = CompositesMap(self.config["composites"],
236 universe=bridgeManager.universe)
238 tableName = self.config["records", "table"]
239 try:
240 # Storage of paths and formatters, keyed by dataset_id
241 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
242 # Interface to Registry.
243 self._bridge = bridgeManager.register(self.name)
244 except ReadOnlyDatabaseError:
245 # If the database is read only and we just tried and failed to
246 # create a table, it means someone is trying to create a read-only
247 # butler client for an empty repo. That should be okay, as long
248 # as they then try to get any datasets before some other client
249 # creates the table. Chances are they're just validating
250 # configuration.
251 pass
253 # Determine whether checksums should be used
254 self.useChecksum = self.config.get("checksum", True)
256 def __str__(self):
257 return self.root
259 @property
260 def bridge(self) -> DatastoreRegistryBridge:
261 return self._bridge
263 @abstractmethod
264 def _artifact_exists(self, location):
265 """Check that an artifact exists in this datastore at the specified
266 location.
268 Parameters
269 ----------
270 location : `Location`
271 Expected location of the artifact associated with this datastore.
273 Returns
274 -------
275 exists : `bool`
276 `True` if the location can be found, `False` otherwise.
277 """
278 raise NotImplementedError()
280 @abstractmethod
281 def _delete_artifact(self, location):
282 """Delete the artifact from the datastore.
284 Parameters
285 ----------
286 location : `Location`
287 Location of the artifact associated with this datastore.
288 """
289 raise NotImplementedError()
291 def addStoredItemInfo(self, refs, infos):
292 # Docstring inherited from GenericBaseDatastore
293 records = []
294 for ref, info in zip(refs, infos):
295 # Component should come from ref and fall back on info
296 component = ref.datasetType.component()
297 if component is None and info.component is not None: 297 ↛ 298; line 297 didn't jump to line 298, because the condition on line 297 was never true
298 component = info.component
299 if component is None:
300 # Use the NULLSTR sentinel since we want this to be part of the
301 # primary key and it cannot be NULL.
302 component = NULLSTR
303 records.append(
304 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
305 storage_class=info.storageClass.name, component=component,
306 checksum=info.checksum, file_size=info.file_size)
307 )
308 self._table.insert(*records)
310 def getStoredItemInfo(self, ref):
311 # Docstring inherited from GenericBaseDatastore
313 where = {"dataset_id": ref.id}
315 # If we have no component we want the row from this table without
316 # a component. If we do have a component we either need the row
317 # with no component or the row with the component, depending on how
318 # this dataset was disassembled.
320 # If we are emptying trash we won't have real refs so we can't constrain
321 # by component. Will need to fix this to return multiple matches
322 # in future.
323 try:
324 component = ref.datasetType.component()
325 except AttributeError:
326 component = None
327 else:
328 if component is None:
329 where["component"] = NULLSTR
331 # Look for the dataset_id -- there might be multiple matches
332 # if we have disassembled the dataset.
333 records = list(self._table.fetch(**where))
334 if len(records) == 0: 334 ↛ 335; line 334 didn't jump to line 335, because the condition on line 334 was never true
335 raise KeyError(f"Unable to retrieve location associated with dataset {ref}.")
337 # If we are not asking for a component there should only be one record
338 if not component and len(records) != 1: 338 ↛ 339; line 338 didn't jump to line 339, because the condition on line 338 was never true
339 raise RuntimeError(f"Got {len(records)} records from location query of dataset {ref}")
341 # If we had a FakeDatasetRef we pick the first record regardless
342 if isinstance(ref, FakeDatasetRef): 342 ↛ 343; line 342 didn't jump to line 343, because the condition on line 342 was never true
343 record = records[0]
344 else:
345 records_by_component = {}
346 for r in records:
347 this_component = r["component"] if r["component"] and r["component"] != NULLSTR else None
348 records_by_component[this_component] = r
350 # Look for component by name else fall back to the parent
351 for lookup in (component, None): 351 ↛ 356; line 351 didn't jump to line 356, because the loop on line 351 didn't complete
352 if lookup in records_by_component: 352 ↛ 351; line 352 didn't jump to line 351, because the condition on line 352 was never false
353 record = records_by_component[lookup]
354 break
355 else:
356 raise KeyError(f"Unable to retrieve location for component {component} associated with "
357 f"dataset {ref}.")
359 # Convert name of StorageClass to instance
360 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
362 return StoredFileInfo(formatter=record["formatter"],
363 path=record["path"],
364 storageClass=storageClass,
365 component=component,
366 checksum=record["checksum"],
367 file_size=record["file_size"])
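        # Worked example of the lookup above (hypothetical rows): if the fetch
        # yields ``records_by_component == {"wcs": row_a, None: row_b}``, a
        # request for component "wcs" returns ``row_a``, any other request
        # falls back to ``row_b``, and if neither key were present a
        # `KeyError` would be raised.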
369 def getStoredItemsInfo(self, ref):
370 # Docstring inherited from GenericBaseDatastore
372 # Look for the dataset_id -- there might be multiple matches
373 # if we have disassembled the dataset.
374 records = list(self._table.fetch(dataset_id=ref.id))
376 results = []
377 for record in records:
378 # Convert name of StorageClass to instance
379 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
380 component = record["component"] if (record["component"]
381 and record["component"] != NULLSTR) else None
383 info = StoredFileInfo(formatter=record["formatter"],
384 path=record["path"],
385 storageClass=storageClass,
386 component=component,
387 checksum=record["checksum"],
388 file_size=record["file_size"])
389 results.append(info)
391 return results
393 def _registered_refs_per_artifact(self, pathInStore):
394 """Return all dataset refs associated with the supplied path.
396 Parameters
397 ----------
398 pathInStore : `str`
399 Path of interest in the data store.
401 Returns
402 -------
403 ids : `set` of `int`
404 All `DatasetRef` IDs associated with this path.
405 """
406 records = list(self._table.fetch(path=pathInStore))
407 ids = {r["dataset_id"] for r in records}
408 return ids
410 def removeStoredItemInfo(self, ref):
411 # Docstring inherited from GenericBaseDatastore
412 self._table.delete(dataset_id=ref.id)
414 def _get_dataset_location_info(self, ref):
415 """Find the `Location` of the requested dataset in the
416 `Datastore` and the associated stored file information.
418 Parameters
419 ----------
420 ref : `DatasetRef`
421 Reference to the required `Dataset`.
423 Returns
424 -------
425 location : `Location`
426 Location of the dataset within the datastore.
427 Returns `None` if the dataset can not be located.
428 info : `StoredFileInfo`
429 Stored information about this file and its formatter.
430 """
431 # Get the file information (this will fail if no file)
432 try:
433 storedFileInfo = self.getStoredItemInfo(ref)
434 except KeyError:
435 return None, None
437 # Use the path to determine the location
438 location = self.locationFactory.fromPath(storedFileInfo.path)
440 return location, storedFileInfo
442 def _get_dataset_locations_info(self, ref):
443 r"""Find all the `Location`\ s of the requested dataset in the
444 `Datastore` and the associated stored file information.
446 Parameters
447 ----------
448 ref : `DatasetRef`
449 Reference to the required `Dataset`.
451 Returns
452 -------
453 results : `list` [`tuple` [`Location`, `StoredFileInfo`]]
454 Location of the dataset within the datastore and
455 stored information about each file and its formatter.
456 """
457 # Get the file information (this will fail if no file)
458 records = self.getStoredItemsInfo(ref)
460 # Use the path to determine the location
461 return [(self.locationFactory.fromPath(r.path), r) for r in records]
463 def _can_remove_dataset_artifact(self, ref, location):
464 """Check that there is only one dataset associated with the
465 specified artifact.
467 Parameters
468 ----------
469 ref : `DatasetRef`
470 Dataset to be removed.
471 location : `Location`
472 The location of the artifact to be removed.
474 Returns
475 -------
476 can_remove : `bool`
477 `True` if the artifact can be safely removed.
478 """
480 # Get all entries associated with this path
481 allRefs = self._registered_refs_per_artifact(location.pathInStore)
482 if not allRefs: 482 ↛ 483; line 482 didn't jump to line 483, because the condition on line 482 was never true
483 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
485 # Get all the refs associated with this dataset if it is a composite
486 theseRefs = {r.id for r in ref.flatten([ref])}
488 # Remove these refs from all the refs and if there is nothing left
489 # then we can delete
490 remainingRefs = allRefs - theseRefs
492 if remainingRefs:
493 return False
494 return True
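        # Worked example (hypothetical IDs): if the artifact at ``location``
        # is referenced by dataset_ids {1, 2} and ``ref.flatten([ref])``
        # yields {1}, then ``remainingRefs == {2}`` and the artifact must be
        # kept; only when the subtraction leaves an empty set can it be
        # deleted.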
496 def _prepare_for_get(self, ref, parameters=None):
497 """Check parameters for ``get`` and obtain formatter and
498 location.
500 Parameters
501 ----------
502 ref : `DatasetRef`
503 Reference to the required Dataset.
504 parameters : `dict`
505 `StorageClass`-specific parameters that specify, for example,
506 a slice of the dataset to be loaded.
508 Returns
509 -------
510 getInfo : `list` [`DatastoreFileGetInformation`]
511 Parameters needed to retrieve each file.
512 """
513 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
515 # Get file metadata and internal metadata
516 fileLocations = self._get_dataset_locations_info(ref)
517 if not fileLocations:
518 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
520 # The storage class we want to use eventually
521 refStorageClass = ref.datasetType.storageClass
523 # Check that the supplied parameters are suitable for the type read
524 refStorageClass.validateParameters(parameters)
526 if len(fileLocations) > 1:
527 disassembled = True
528 else:
529 disassembled = False
531 # Is this a component request?
532 refComponent = ref.datasetType.component()
534 fileGetInfo = []
535 for location, storedFileInfo in fileLocations:
537 # The storage class used to write the file
538 writeStorageClass = storedFileInfo.storageClass
540 # If this has been disassembled the read storage class must match the write storage class
541 if disassembled:
542 readStorageClass = writeStorageClass
543 else:
544 readStorageClass = refStorageClass
546 formatter = getInstanceOf(storedFileInfo.formatter,
547 FileDescriptor(location, readStorageClass=readStorageClass,
548 storageClass=writeStorageClass, parameters=parameters),
549 ref.dataId)
551 _, notFormatterParams = formatter.segregateParameters()
553 # Of the remaining parameters, extract the ones supported by
554 # this StorageClass (for components not all will be handled)
555 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
557 # The ref itself could be a component if the dataset was
558 # disassembled by butler, or we disassembled it in the datastore and
559 # the components came from the datastore records
560 if storedFileInfo.component:
561 component = storedFileInfo.component
562 else:
563 component = refComponent
565 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
566 assemblerParams, component, readStorageClass))
568 return fileGetInfo
570 def _prepare_for_put(self, inMemoryDataset, ref):
571 """Check the arguments for ``put`` and obtain formatter and
572 location.
574 Parameters
575 ----------
576 inMemoryDataset : `object`
577 The dataset to store.
578 ref : `DatasetRef`
579 Reference to the associated Dataset.
581 Returns
582 -------
583 location : `Location`
584 The location to write the dataset.
585 formatter : `Formatter`
586 The `Formatter` to use to write the dataset.
588 Raises
589 ------
590 TypeError
591 Supplied object and storage class are inconsistent.
592 DatasetTypeNotSupportedError
593 The associated `DatasetType` is not handled by this datastore.
594 """
595 self._validate_put_parameters(inMemoryDataset, ref)
597 # Work out output file name
598 try:
599 template = self.templates.getTemplate(ref)
600 except KeyError as e:
601 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
603 location = self.locationFactory.fromPath(template.format(ref))
605 # Get the formatter based on the storage class
606 storageClass = ref.datasetType.storageClass
607 try:
608 formatter = self.formatterFactory.getFormatter(ref,
609 FileDescriptor(location,
610 storageClass=storageClass),
611 ref.dataId)
612 except KeyError as e:
613 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e
615 return location, formatter
617 @abstractmethod
618 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
619 """Standardize the path of a to-be-ingested file.
621 Parameters
622 ----------
623 path : `str`
624 Path of a file to be ingested.
625 transfer : `str`, optional
626 How (and whether) the dataset should be added to the datastore.
627 See `ingest` for details of transfer modes.
628 This implementation is provided only so
629 `NotImplementedError` can be raised if the mode is not supported;
630 actual transfers are deferred to `_extractIngestInfo`.
632 Returns
633 -------
634 path : `str`
635 New path in what the datastore considers standard form.
637 Notes
638 -----
639 Subclasses of `FileLikeDatastore` should implement this method instead
640 of `_prepIngest`. It should not modify the data repository or given
641 file in any way.
643 Raises
644 ------
645 NotImplementedError
646 Raised if the datastore does not support the given transfer mode
647 (including the case where ingest is not supported at all).
648 FileNotFoundError
649 Raised if one of the given files does not exist.
650 """
651 raise NotImplementedError("Must be implemented by subclasses.")
653 @abstractmethod
654 def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
655 transfer: Optional[str] = None) -> StoredFileInfo:
656 """Relocate (if necessary) and extract `StoredFileInfo` from a
657 to-be-ingested file.
659 Parameters
660 ----------
661 path : `str`
662 Path of a file to be ingested.
663 ref : `DatasetRef`
664 Reference for the dataset being ingested. Guaranteed to have
665 ``dataset_id is not None``.
666 formatter : `type`
667 `Formatter` subclass to use for this dataset.
668 transfer : `str`, optional
669 How (and whether) the dataset should be added to the datastore.
670 See `ingest` for details of transfer modes.
672 Returns
673 -------
674 info : `StoredFileInfo`
675 Internal datastore record for this file. This will be inserted by
676 the caller; `_extractIngestInfo` is only responsible for
677 creating and populating the struct.
679 Raises
680 ------
681 FileNotFoundError
682 Raised if one of the given files does not exist.
683 FileExistsError
684 Raised if transfer is not `None` but the (internal) location the
685 file would be moved to is already occupied.
686 """
687 raise NotImplementedError("Must be implemented by subclasses.")
689 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
690 # Docstring inherited from Datastore._prepIngest.
691 filtered = []
692 for dataset in datasets:
693 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
694 if not acceptable:
695 continue
696 else:
697 dataset.refs = acceptable
698 if dataset.formatter is None:
699 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
700 else:
701 dataset.formatter = getClassOf(dataset.formatter)
702 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
703 filtered.append(dataset)
704 return _IngestPrepData(filtered)
706 @transactional
707 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None):
708 # Docstring inherited from Datastore._finishIngest.
709 refsAndInfos = []
710 for dataset in prepData.datasets:
711 # Do ingest as if the first dataset ref is associated with the file
712 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
713 transfer=transfer)
714 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
715 self._register_datasets(refsAndInfos)
717 def exists(self, ref):
718 """Check if the dataset exists in the datastore.
720 Parameters
721 ----------
722 ref : `DatasetRef`
723 Reference to the required dataset.
725 Returns
726 -------
727 exists : `bool`
728 `True` if the entity exists in the `Datastore`.
729 """
730 fileLocations = self._get_dataset_locations_info(ref)
731 if not fileLocations:
732 return False
733 for location, _ in fileLocations:
734 if not self._artifact_exists(location):
735 return False
737 return True
739 def getUri(self, ref, predict=False):
740 """URI to the Dataset.
742 Parameters
743 ----------
744 ref : `DatasetRef`
745 Reference to the required Dataset.
746 predict : `bool`
747 If `True`, allow URIs to be returned of datasets that have not
748 been written.
750 Returns
751 -------
752 uri : `str`
753 URI string pointing to the dataset within the datastore. If the
754 dataset does not exist in the datastore, and if ``predict`` is
755 `True`, the URI will be a prediction and will include a URI
756 fragment "#predicted".
757 If the datastore does not have entities that relate well
758 to the concept of a URI the returned URI string will be
759 descriptive. The returned URI is not guaranteed to be obtainable.
761 Raises
762 ------
763 FileNotFoundError
764 A URI has been requested for a dataset that does not exist and
765 guessing is not allowed.
767 Notes
768 -----
769 When a predicted URI is requested an attempt will be made to form
770 a reasonable URI based on file templates and the expected formatter.
771 """
772 # If this has never been written then we have to guess
773 if not self.exists(ref):
774 if not predict:
775 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
777 template = self.templates.getTemplate(ref)
778 location = self.locationFactory.fromPath(template.format(ref))
779 storageClass = ref.datasetType.storageClass
780 formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location,
781 storageClass=storageClass))
782 # Try to use the extension attribute but ignore problems if the
783 # formatter does not define one.
784 try:
785 location = formatter.makeUpdatedLocation(location)
786 except Exception:
787 # Use the default extension
788 pass
790 # Add a URI fragment to indicate this is a guess
791 return location.uri + "#predicted"
793 # If this is a ref that we have written we can get the path.
794 # Get file metadata and internal metadata
795 storedFileInfo = self.getStoredItemInfo(ref)
797 # Use the path to determine the location
798 location = self.locationFactory.fromPath(storedFileInfo.path)
800 return location.uri
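        # Illustrative return values (hypothetical paths): an existing dataset
        # might yield "file:///repo/run/dataset_42.ext", while a predicted one
        # returns the templated guess with the fragment appended, e.g.
        # "file:///repo/run/dataset_42.ext#predicted".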
802 def get(self, ref, parameters=None):
803 """Load an InMemoryDataset from the store.
805 Parameters
806 ----------
807 ref : `DatasetRef`
808 Reference to the required Dataset.
809 parameters : `dict`
810 `StorageClass`-specific parameters that specify, for example,
811 a slice of the dataset to be loaded.
813 Returns
814 -------
815 inMemoryDataset : `object`
816 Requested dataset or slice thereof as an InMemoryDataset.
818 Raises
819 ------
820 FileNotFoundError
821 Requested dataset can not be retrieved.
822 TypeError
823 Return value from formatter has unexpected type.
824 ValueError
825 Formatter failed to process the dataset.
826 """
827 allGetInfo = self._prepare_for_get(ref, parameters)
828 refComponent = ref.datasetType.component()
830 if len(allGetInfo) > 1 and not refComponent:
831 # This was a disassembled dataset spread over multiple files
832 # and we need to put them all back together again.
833 # Read into memory and then assemble
834 usedParams = set()
835 components = {}
836 for getInfo in allGetInfo:
837 # assemblerParams are parameters not understood by the
838 # associated formatter.
839 usedParams.update(set(getInfo.assemblerParams))
841 component = getInfo.component
842 # We do not want the formatter to think it's reading
843 # a component though because it is really reading a
844 # standalone dataset -- always tell reader it is not a
845 # component.
846 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
848 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
850 # Any unused parameters will have to be passed to the assembler
851 if parameters:
852 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
853 else:
854 unusedParams = {}
856 # Process parameters
857 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
858 parameters=unusedParams)
860 else:
861 # Single file request or component from that composite file
862 allComponents = {i.component: i for i in allGetInfo}
863 for lookup in (refComponent, None): 863 ↛ 868; line 863 didn't jump to line 868, because the loop on line 863 didn't complete
864 if lookup in allComponents: 864 ↛ 863; line 864 didn't jump to line 863, because the condition on line 864 was never false
865 getInfo = allComponents[lookup]
866 break
867 else:
868 raise FileNotFoundError(f"Component {refComponent} not found "
869 f"for ref {ref} in datastore {self.name}")
871 return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None)
873 @transactional
874 def put(self, inMemoryDataset, ref):
875 """Write a InMemoryDataset with a given `DatasetRef` to the store.
877 Parameters
878 ----------
879 inMemoryDataset : `object`
880 The dataset to store.
881 ref : `DatasetRef`
882 Reference to the associated Dataset.
884 Raises
885 ------
886 TypeError
887 Supplied object and storage class are inconsistent.
888 DatasetTypeNotSupportedError
889 The associated `DatasetType` is not handled by this datastore.
891 Notes
892 -----
893 If the datastore is configured to reject certain dataset types it
894 is possible that the put will fail and raise a
895 `DatasetTypeNotSupportedError`. The main use case for this is to
896 allow `ChainedDatastore` to put to multiple datastores without
897 requiring that every datastore accepts the dataset.
898 """
900 doDisassembly = self.composites.shouldBeDisassembled(ref)
901 # doDisassembly = True
903 artifacts = []
904 if doDisassembly:
905 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
906 for component, componentInfo in components.items():
907 compTypeName = ref.datasetType.componentTypeName(component)
908 # Don't recurse because we want to take advantage of
909 # bulk insert -- need a new DatasetRef that refers to the
910 # same dataset_id but has the component DatasetType
911 # DatasetType does not refer to the types of components,
912 # so we construct one ourselves.
913 compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions,
914 storageClass=componentInfo.storageClass)
915 compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False)
916 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
917 artifacts.append((compRef, storedInfo))
918 else:
919 # Write the entire thing out
920 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
921 artifacts.append((ref, storedInfo))
923 self._register_datasets(artifacts)
925 @transactional
926 def trash(self, ref, ignore_errors=True):
927 """Indicate to the datastore that a dataset can be removed.
929 Parameters
930 ----------
931 ref : `DatasetRef`
932 Reference to the required Dataset.
933 ignore_errors : `bool`
934 If `True` return without error even if something went wrong.
935 Problems could occur if another process is simultaneously trying
936 to delete.
938 Raises
939 ------
940 FileNotFoundError
941 Attempt to remove a dataset that does not exist.
942 """
943 # Get file metadata and internal metadata
944 log.debug("Trashing %s in datastore %s", ref, self.name)
946 fileLocations = self._get_dataset_locations_info(ref)
948 if not fileLocations:
949 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
950 if ignore_errors:
951 log.warning(err_msg)
952 return
953 else:
954 raise FileNotFoundError(err_msg)
956 for location, storedFileInfo in fileLocations:
957 if not self._artifact_exists(location): 957 ↛ 958; line 957 didn't jump to line 958, because the condition on line 957 was never true
958 err_msg = f"Dataset is known to datastore {self.name} but " \
959 f"associated artifact ({location.uri}) is missing"
960 if ignore_errors:
961 log.warning(err_msg)
962 return
963 else:
964 raise FileNotFoundError(err_msg)
966 # Mark dataset as trashed
967 try:
968 self._move_to_trash_in_registry(ref)
969 except Exception as e:
970 if ignore_errors:
971 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
972 f"but encountered an error: {e}")
973 pass
974 else:
975 raise
977 @transactional
978 def emptyTrash(self, ignore_errors=True):
979 """Remove all datasets from the trash.
981 Parameters
982 ----------
983 ignore_errors : `bool`
984 If `True` return without error even if something went wrong.
985 Problems could occur if another process is simultaneously trying
986 to delete.
987 """
988 log.debug("Emptying trash in datastore %s", self.name)
989 # Context manager will empty trash iff we finish it without raising.
990 with self._bridge.emptyTrash() as trashed:
991 for ref in trashed:
992 fileLocations = self._get_dataset_locations_info(ref)
994 for location, _ in fileLocations:
996 if location is None: 996 ↛ 997; line 996 didn't jump to line 997, because the condition on line 996 was never true
997 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
998 if ignore_errors:
999 log.warning(err_msg)
1000 continue
1001 else:
1002 raise FileNotFoundError(err_msg)
1004 if not self._artifact_exists(location): 1004 ↛ 1005; line 1004 didn't jump to line 1005, because the condition on line 1004 was never true
1005 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1006 if ignore_errors:
1007 log.warning(err_msg)
1008 continue
1009 else:
1010 raise FileNotFoundError(err_msg)
1012 # Can only delete the artifact if there are no references
1013 # to the file from untrashed dataset refs.
1014 if self._can_remove_dataset_artifact(ref, location):
1015 # Point of no return for this artifact
1016 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1017 try:
1018 self._delete_artifact(location)
1019 except Exception as e:
1020 if ignore_errors:
1021 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1022 location.uri, self.name, e)
1023 else:
1024 raise
1026 # Now must remove the entry from the internal registry even if
1027 # the artifact removal failed and was ignored,
1028 # otherwise the removal check above will never be true
1029 try:
1030 # There may be multiple rows associated with this ref
1031 # depending on disassembly
1032 self.removeStoredItemInfo(ref)
1033 except Exception as e:
1034 if ignore_errors:
1035 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1036 ref.id, location.uri, self.name, e)
1037 continue
1038 else:
1039 raise
1041 def validateConfiguration(self, entities, logFailures=False):
1042 """Validate some of the configuration for this datastore.
1044 Parameters
1045 ----------
1046 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1047 Entities to test against this configuration. Can be differing
1048 types.
1049 logFailures : `bool`, optional
1050 If `True`, output a log message for every validation error
1051 detected.
1053 Raises
1054 ------
1055 DatastoreValidationError
1056 Raised if there is a validation problem with a configuration.
1057 All the problems are reported in a single exception.
1059 Notes
1060 -----
1061 This method checks that all the supplied entities have valid file
1062 templates and also have formatters defined.
1063 """
1065 templateFailed = None
1066 try:
1067 self.templates.validateTemplates(entities, logFailures=logFailures)
1068 except FileTemplateValidationError as e:
1069 templateFailed = str(e)
1071 formatterFailed = []
1072 for entity in entities:
1073 try:
1074 self.formatterFactory.getFormatterClass(entity)
1075 except KeyError as e:
1076 formatterFailed.append(str(e))
1077 if logFailures: 1077 ↛ 1072; line 1077 didn't jump to line 1072, because the condition on line 1077 was never false
1078 log.fatal("Formatter failure: %s", e)
1080 if templateFailed or formatterFailed:
1081 messages = []
1082 if templateFailed: 1082 ↛ 1083; line 1082 didn't jump to line 1083, because the condition on line 1082 was never true
1083 messages.append(templateFailed)
1084 if formatterFailed: 1084 ↛ 1086; line 1084 didn't jump to line 1086, because the condition on line 1084 was never false
1085 messages.append(",".join(formatterFailed))
1086 msg = ";\n".join(messages)
1087 raise DatastoreValidationError(msg)
1089 def getLookupKeys(self):
1090 # Docstring is inherited from base class
1091 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1092 self.constraints.getLookupKeys()
1094 def validateKey(self, lookupKey, entity):
1095 # Docstring is inherited from base class
1096 # The key can be valid in either formatters or templates so we can
1097 # only check the template if it exists
1098 if lookupKey in self.templates:
1099 try:
1100 self.templates[lookupKey].validateTemplate(entity)
1101 except FileTemplateValidationError as e:
1102 raise DatastoreValidationError(e) from e