Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 81%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Generic file-based datastore code."""

__all__ = ("FileLikeDatastore", )

import logging
from abc import abstractmethod

from sqlalchemy import Integer, String

from dataclasses import dataclass
from typing import Optional, List, Type

from lsst.daf.butler import (
    CompositesMap,
    Config,
    FileDataset,
    DatasetRef,
    DatasetType,
    DatasetTypeNotSupportedError,
    Datastore,
    DatastoreConfig,
    DatastoreValidationError,
    FakeDatasetRef,
    FileDescriptor,
    FileTemplates,
    FileTemplateValidationError,
    Formatter,
    FormatterFactory,
    Location,
    LocationFactory,
    StorageClass,
    StoredFileInfo,
)

from lsst.daf.butler import ddl
from lsst.daf.butler.registry.interfaces import ReadOnlyDatabaseError

from lsst.daf.butler.core.repoRelocation import replaceRoot
from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional
from .genericDatastore import GenericBaseDatastore

log = logging.getLogger(__name__)

# String to use when a Python None is encountered
NULLSTR = "__NULL_STRING__"


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for FileLikeDatastore ingest implementation.

    Parameters
    ----------
    datasets : `list` of `FileDataset`
        Files to be ingested by this datastore.
    """
    def __init__(self, datasets: List[FileDataset]):
        super().__init__(ref for dataset in datasets for ref in dataset.refs)
        self.datasets = datasets


@dataclass(frozen=True)
class DatastoreFileGetInformation:
    """Collection of useful parameters needed to retrieve a file from
    a Datastore.
    """

    location: Location
    """The location from which to read the dataset."""

    formatter: Formatter
    """The `Formatter` to use to deserialize the dataset."""

    info: StoredFileInfo
    """Stored information about this file and its formatter."""

    assemblerParams: dict
    """Parameters to use for post-processing the retrieved dataset."""

    component: Optional[str]
    """The component to be retrieved (can be `None`)."""

    readStorageClass: StorageClass
    """The `StorageClass` of the dataset being read."""


class FileLikeDatastore(GenericBaseDatastore):
    """Generic Datastore for file-based implementations.

    Should always be sub-classed since key abstract methods are missing.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration as either a `Config` object or URI to file.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.
    """

    defaultConfigFile = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    root: str
    """Root directory or URI of this `Datastore`."""

    locationFactory: LocationFactory
    """Factory for creating locations relative to the datastore root."""

    formatterFactory: FormatterFactory
    """Factory for creating instances of formatters."""

    templates: FileTemplates
    """File templates that can be used by this `Datastore`."""

    composites: CompositesMap
    """Determines whether a dataset should be disassembled on put."""

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            URI to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        Config.updateParameters(DatastoreConfig, config, full,
                                toUpdate={"root": root},
                                toCopy=("cls", ("records", "table")), overwrite=overwrite)

    @classmethod
    def makeTableSpec(cls):
        return ddl.TableSpec(
            fields=NamedValueSet([
                ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
                ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
                ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
                ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
                # Use empty string to indicate no component
                ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
                # TODO: should checksum be Base64Bytes instead?
                ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
                ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
            ]),
            unique=frozenset(),
        )
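
    # Illustrative sketch (not part of the original module): a record matching
    # the spec above, shaped as it would be passed to ``insertOpaqueData``
    # later in this file. All values shown are hypothetical.
    #
    #   {"dataset_id": 42, "path": "calexp/r/calexp_42.fits",
    #    "formatter": "lsst.some.package.SomeFormatter",
    #    "storage_class": "ExposureF", "component": "__NULL_STRING__",
    #    "checksum": None, "file_size": 123456}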

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry)
        if "root" not in self.config:
            raise ValueError("No root directory specified in configuration")

        # Name ourselves either using an explicit name or a name
        # derived from the (unexpanded) root
        if "name" in self.config:
            self.name = self.config["name"]
        else:
            # We use the unexpanded root in the name to indicate that this
            # datastore can be moved without having to update registry.
            self.name = "{}@{}".format(type(self).__name__,
                                       self.config["root"])

        # Support repository relocation in config
        # Existence of self.root is checked in subclass
        self.root = replaceRoot(self.config["root"], butlerRoot)

        self.locationFactory = LocationFactory(self.root)
        self.formatterFactory = FormatterFactory()

        # Now associate formatters with storage classes
        self.formatterFactory.registerFormatters(self.config["formatters"],
                                                 universe=self.registry.dimensions)

        # Read the file naming templates
        self.templates = FileTemplates(self.config["templates"],
                                       universe=self.registry.dimensions)

        # See if composites should be disassembled
        self.composites = CompositesMap(self.config["composites"],
                                        universe=self.registry.dimensions)

        # Storage of paths and formatters, keyed by dataset_id
        self._tableName = self.config["records", "table"]
        try:
            registry.registerOpaqueTable(self._tableName, self.makeTableSpec())
        except ReadOnlyDatabaseError:
            # If the database is read only and we just tried and failed to
            # create a table, it means someone is trying to create a read-only
            # butler client for an empty repo. That should be okay, as long
            # as they then try to get any datasets before some other client
            # creates the table. Chances are they're just validating
            # configuration.
            pass

        # Determine whether checksums should be used
        self.useChecksum = self.config.get("checksum", True)

    def __str__(self):
        return self.root
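
    # Illustrative sketch (not part of the original module): the configuration
    # keys consumed by ``__init__`` above, written out as YAML. Only the key
    # names come from the code; the values are hypothetical.
    #
    #   datastore:
    #     cls: <concrete FileLikeDatastore subclass>
    #     root: <butlerRoot>
    #     name: "PosixDatastore@<butlerRoot>"   # optional; derived if absent
    #     records:
    #       table: file_datastore_records
    #     checksum: true
    #     formatters: {...}
    #     templates: {...}
    #     composites: {...}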

    @abstractmethod
    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            True if the location can be found, false otherwise.
        """
        raise NotImplementedError()

    @abstractmethod
    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        raise NotImplementedError()

    def addStoredItemInfo(self, refs, infos):
        # Docstring inherited from GenericBaseDatastore
        records = []
        for ref, info in zip(refs, infos):
            # Component should come from ref and fall back on info
            component = ref.datasetType.component()
            if component is None and info.component is not None:
                component = info.component
            if component is None:
                # Use empty string since we want this to be part of the
                # primary key.
                component = NULLSTR
            records.append(
                dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
                     storage_class=info.storageClass.name, component=component,
                     checksum=info.checksum, file_size=info.file_size)
            )
        self.registry.insertOpaqueData(self._tableName, *records)

    def getStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore

        where = {"dataset_id": ref.id}

        # If we have no component we want the row from this table without
        # a component. If we do have a component we either need the row
        # with no component or the row with the component, depending on how
        # this dataset was disassembled.

        # if we are emptying trash we won't have real refs so can't constrain
        # by component. Will need to fix this to return multiple matches
        # in future.
        try:
            component = ref.datasetType.component()
        except AttributeError:
            component = None
        else:
            if component is None:
                where["component"] = NULLSTR

        # Look for the dataset_id -- there might be multiple matches
        # if we have disassembled the dataset.
        records = list(self.registry.fetchOpaqueData(self._tableName, **where))
        if len(records) == 0:
            raise KeyError(f"Unable to retrieve location associated with dataset {ref}.")

        # if we are not asking for a component
        if not component and len(records) != 1:
            raise RuntimeError(f"Got {len(records)} from location query of dataset {ref}")

        # if we had a FakeDatasetRef we pick the first record regardless
        if isinstance(ref, FakeDatasetRef):
            record = records[0]
        else:
            records_by_component = {}
            for r in records:
                this_component = r["component"] if r["component"] and r["component"] != NULLSTR else None
                records_by_component[this_component] = r

            # Look for component by name else fall back to the parent
            for lookup in (component, None):
                if lookup in records_by_component:
                    record = records_by_component[lookup]
                    break
            else:
                raise KeyError(f"Unable to retrieve location for component {component} associated with "
                               f"dataset {ref}.")

        # Convert name of StorageClass to instance
        storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])

        return StoredFileInfo(formatter=record["formatter"],
                              path=record["path"],
                              storageClass=storageClass,
                              component=component,
                              checksum=record["checksum"],
                              file_size=record["file_size"])

    def getStoredItemsInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore

        # Look for the dataset_id -- there might be multiple matches
        # if we have disassembled the dataset.
        records = list(self.registry.fetchOpaqueData(self._tableName, dataset_id=ref.id))

        results = []
        for record in records:
            # Convert name of StorageClass to instance
            storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
            component = record["component"] if (record["component"]
                                                and record["component"] != NULLSTR) else None

            info = StoredFileInfo(formatter=record["formatter"],
                                  path=record["path"],
                                  storageClass=storageClass,
                                  component=component,
                                  checksum=record["checksum"],
                                  file_size=record["file_size"])
            results.append(info)

        return results

    def _registered_refs_per_artifact(self, pathInStore):
        """Return all dataset refs associated with the supplied path.

        Parameters
        ----------
        pathInStore : `str`
            Path of interest in the data store.

        Returns
        -------
        ids : `set` of `int`
            All `DatasetRef` IDs associated with this path.
        """
        records = list(self.registry.fetchOpaqueData(self._tableName, path=pathInStore))
        ids = {r["dataset_id"] for r in records}
        return ids

    def removeStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore
        self.registry.deleteOpaqueData(self._tableName, dataset_id=ref.id)

    def _get_dataset_location_info(self, ref):
        """Find the `Location` of the requested dataset in the
        `Datastore` and the associated stored file information.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required `Dataset`.

        Returns
        -------
        location : `Location`
            Location of the dataset within the datastore.
            Returns `None` if the dataset can not be located.
        info : `StoredFileInfo`
            Stored information about this file and its formatter.
        """
        # Get the file information (this will fail if no file)
        try:
            storedFileInfo = self.getStoredItemInfo(ref)
        except KeyError:
            return None, None

        # Use the path to determine the location
        location = self.locationFactory.fromPath(storedFileInfo.path)

        return location, storedFileInfo

    def _get_dataset_locations_info(self, ref):
        r"""Find all the `Location`\ s of the requested dataset in the
        `Datastore` and the associated stored file information.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required `Dataset`.

        Returns
        -------
        results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
            Location of the dataset within the datastore and
            stored information about each file and its formatter.
        """
        # Get the file information (this will fail if no file)
        records = self.getStoredItemsInfo(ref)

        # Use the path to determine the location
        return [(self.locationFactory.fromPath(r.path), r) for r in records]

    def _can_remove_dataset_artifact(self, ref, location):
        """Check that there is only one dataset associated with the
        specified artifact.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to be removed.
        location : `Location`
            The location of the artifact to be removed.

        Returns
        -------
        can_remove : `bool`
            True if the artifact can be safely removed.
        """

        # Get all entries associated with this path
        allRefs = self._registered_refs_per_artifact(location.pathInStore)
        if not allRefs:
            raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")

        # Get all the refs associated with this dataset if it is a composite
        theseRefs = {r.id for r in ref.flatten([ref])}

        # Remove these refs from all the refs and if there is nothing left
        # then we can delete
        remainingRefs = allRefs - theseRefs

        if remainingRefs:
            return False
        return True
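
    # Illustrative worked example (not part of the original module) of the set
    # arithmetic above. Suppose the artifact at ``pathInStore`` is referenced
    # by dataset_ids {7, 8} and the dataset being removed flattens to {7};
    # ``remainingRefs`` is then {8}, so the artifact must be kept. Only when
    # the difference is empty can the file itself be deleted.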

    def _prepare_for_get(self, ref, parameters=None):
        """Check parameters for ``get`` and obtain formatter and
        location.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        getInfo : `list` [`DatastoreFileGetInformation`]
            Parameters needed to retrieve each file.
        """
        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        # Get file metadata and internal metadata
        fileLocations = self._get_dataset_locations_info(ref)
        if not fileLocations:
            raise FileNotFoundError(f"Could not retrieve dataset {ref}.")

        # The storage class we want to use eventually
        refStorageClass = ref.datasetType.storageClass

        # Check that the supplied parameters are suitable for the type read
        refStorageClass.validateParameters(parameters)

        if len(fileLocations) > 1:
            disassembled = True
        else:
            disassembled = False

        # Is this a component request?
        refComponent = ref.datasetType.component()

        fileGetInfo = []
        for location, storedFileInfo in fileLocations:

            # The storage class used to write the file
            writeStorageClass = storedFileInfo.storageClass

            # If this has been disassembled we need read to match the write
            if disassembled:
                readStorageClass = writeStorageClass
            else:
                readStorageClass = refStorageClass

            formatter = getInstanceOf(storedFileInfo.formatter,
                                      FileDescriptor(location, readStorageClass=readStorageClass,
                                                     storageClass=writeStorageClass, parameters=parameters),
                                      ref.dataId)

            _, notFormatterParams = formatter.segregateParameters()

            # Of the remaining parameters, extract the ones supported by
            # this StorageClass (for components not all will be handled)
            assemblerParams = readStorageClass.filterParameters(notFormatterParams)

            # The ref itself could be a component if the dataset was
            # disassembled by butler, or we disassembled in datastore and
            # components came from the datastore records
            if storedFileInfo.component:
                component = storedFileInfo.component
            else:
                component = refComponent

            fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
                                                           assemblerParams, component, readStorageClass))

        return fileGetInfo

    def _prepare_for_put(self, inMemoryDataset, ref):
        """Check the arguments for ``put`` and obtain formatter and
        location.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Returns
        -------
        location : `Location`
            The location to write the dataset.
        formatter : `Formatter`
            The `Formatter` to use to write the dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.
        """
        self._validate_put_parameters(inMemoryDataset, ref)

        # Work out output file name
        try:
            template = self.templates.getTemplate(ref)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e

        location = self.locationFactory.fromPath(template.format(ref))

        # Get the formatter based on the storage class
        storageClass = ref.datasetType.storageClass
        try:
            formatter = self.formatterFactory.getFormatter(ref,
                                                           FileDescriptor(location,
                                                                          storageClass=storageClass),
                                                           ref.dataId)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e

        return location, formatter

    @abstractmethod
    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        """Standardize the path of a to-be-ingested file.

        Parameters
        ----------
        path : `str`
            Path of a file to be ingested.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.
            This implementation is provided only so
            `NotImplementedError` can be raised if the mode is not supported;
            actual transfers are deferred to `_extractIngestInfo`.

        Returns
        -------
        path : `str`
            New path in what the datastore considers standard form.

        Notes
        -----
        Subclasses of `FileLikeDatastore` should implement this method instead
        of `_prepIngest`. It should not modify the data repository or given
        file in any way.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    @abstractmethod
    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        """Relocate (if necessary) and extract `StoredFileInfo` from a
        to-be-ingested file.

        Parameters
        ----------
        path : `str`
            Path of a file to be ingested.
        ref : `DatasetRef`
            Reference for the dataset being ingested. Guaranteed to have
            ``dataset_id is not None``.
        formatter : `type`
            `Formatter` subclass to use for this dataset.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        info : `StoredFileInfo`
            Internal datastore record for this file. This will be inserted by
            the caller; `_extractIngestInfo` is only responsible for
            creating and populating the struct.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        filtered = []
        for dataset in datasets:
            acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
            if not acceptable:
                continue
            else:
                dataset.refs = acceptable
            if dataset.formatter is None:
                dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
            else:
                dataset.formatter = getClassOf(dataset.formatter)
            dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
            filtered.append(dataset)
        return _IngestPrepData(filtered)

    @transactional
    def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None):
        # Docstring inherited from Datastore._finishIngest.
        refsAndInfos = []
        for dataset in prepData.datasets:
            # Do ingest as if the first dataset ref is associated with the file
            info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
                                           transfer=transfer)
            refsAndInfos.extend([(ref, info) for ref in dataset.refs])
        self._register_datasets(refsAndInfos)

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        fileLocations = self._get_dataset_locations_info(ref)
        if not fileLocations:
            return False
        for location, _ in fileLocations:
            if not self._artifact_exists(location):
                return False

        return True

    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.

        Notes
        -----
        When a predicted URI is requested an attempt will be made to form
        a reasonable URI based on file templates and the expected formatter.
        """
        # if this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))

            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            storageClass = ref.datasetType.storageClass
            formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location,
                                                                               storageClass=storageClass))
            # Try to use the extension attribute but ignore problems if the
            # formatter does not define one.
            try:
                location = formatter.makeUpdatedLocation(location)
            except Exception:
                # Use the default extension
                pass

            # Add a URI fragment to indicate this is a guess
            return location.uri + "#predicted"

        # If this is a ref that we have written we can get the path.
        # Get file metadata and internal metadata
        storedFileInfo = self.getStoredItemInfo(ref)

        # Use the path to determine the location
        location = self.locationFactory.fromPath(storedFileInfo.path)

        return location.uri
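
    # Illustrative sketch (not part of the original module): asking for the
    # URI of a dataset that has not yet been written, with and without
    # prediction. The returned path is hypothetical.
    #
    #   uri = datastore.getUri(ref)                # raises FileNotFoundError
    #   uri = datastore.getUri(ref, predict=True)  # e.g. "file:///repo/...#predicted"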

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        allGetInfo = self._prepare_for_get(ref, parameters)
        refComponent = ref.datasetType.component()

        if len(allGetInfo) > 1 and not refComponent:
            # This was a disassembled dataset spread over multiple files
            # and we need to put them all back together again.
            # Read into memory and then assemble
            usedParams = set()
            components = {}
            for getInfo in allGetInfo:
                # assemblerParams are parameters not understood by the
                # associated formatter.
                usedParams.update(set(getInfo.assemblerParams))

                component = getInfo.component
                # We do not want the formatter to think it's reading
                # a component though because it is really reading a
                # standalone dataset -- always tell reader it is not a
                # component.
                components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)

            inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)

            # Any unused parameters will have to be passed to the assembler
            if parameters:
                unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
            else:
                unusedParams = {}

            # Process parameters
            return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
                                                                             parameters=unusedParams)

        else:
            # Single file request or component from that composite file
            allComponents = {i.component: i for i in allGetInfo}
            for lookup in (refComponent, None):
                if lookup in allComponents:
                    getInfo = allComponents[lookup]
                    break
            else:
                raise FileNotFoundError(f"Component {refComponent} not found "
                                        f"for ref {ref} in datastore {self.name}")

            return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None)
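
    # Illustrative sketch (not part of the original module): retrieving a
    # dataset through ``get``, with and without StorageClass parameters. The
    # parameter name "bbox" is hypothetical and depends on the StorageClass
    # definitions in use.
    #
    #   full = datastore.get(ref)
    #   cutout = datastore.get(ref, parameters={"bbox": bbox})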

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """

        doDisassembly = self.composites.shouldBeDisassembled(ref)
        # doDisassembly = True

        artifacts = []
        if doDisassembly:
            components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
            for component, componentInfo in components.items():
                compTypeName = ref.datasetType.componentTypeName(component)
                # Don't recurse because we want to take advantage of
                # bulk insert -- need a new DatasetRef that refers to the
                # same dataset_id but has the component DatasetType
                # DatasetType does not refer to the types of components
                # So we construct one ourselves.
                compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions,
                                       storageClass=componentInfo.storageClass)
                compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False)
                storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
                artifacts.append((compRef, storedInfo))
        else:
            # Write the entire thing out
            storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
            artifacts.append((ref, storedInfo))

        self._register_datasets(artifacts)

    @transactional
    def trash(self, ref, ignore_errors=True):
        """Indicate to the datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`
            If `True` return without error even if something went wrong.
            Problems could occur if another process is simultaneously trying
            to delete.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        # Get file metadata and internal metadata
        log.debug("Trashing %s in datastore %s", ref, self.name)

        fileLocations = self._get_dataset_locations_info(ref)

        if not fileLocations:
            err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
            if ignore_errors:
                log.warning(err_msg)
                return
            else:
                raise FileNotFoundError(err_msg)

        for location, storedFileInfo in fileLocations:
            if not self._artifact_exists(location):
                err_msg = f"Dataset is known to datastore {self.name} but " \
                          f"associated artifact ({location.uri}) is missing"
                if ignore_errors:
                    log.warning(err_msg)
                    return
                else:
                    raise FileNotFoundError(err_msg)

        # Mark dataset as trashed
        try:
            self._move_to_trash_in_registry(ref)
        except Exception as e:
            if ignore_errors:
                log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
                            f"but encountered an error: {e}")
                pass
            else:
                raise

    @transactional
    def emptyTrash(self, ignore_errors=True):
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`
            If `True` return without error even if something went wrong.
            Problems could occur if another process is simultaneously trying
            to delete.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        trashed = self.registry.getTrashedDatasets(self.name)

        for ref in trashed:
            fileLocations = self._get_dataset_locations_info(ref)

            for location, _ in fileLocations:

                if location is None:
                    err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
                    if ignore_errors:
                        log.warning(err_msg)
                        continue
                    else:
                        raise FileNotFoundError(err_msg)

                if not self._artifact_exists(location):
                    err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
                    if ignore_errors:
                        log.warning(err_msg)
                        continue
                    else:
                        raise FileNotFoundError(err_msg)

                # Can only delete the artifact if there are no references
                # to the file from untrashed dataset refs.
                if self._can_remove_dataset_artifact(ref, location):
                    # Point of no return for this artifact
                    log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
                    try:
                        self._delete_artifact(location)
                    except Exception as e:
                        if ignore_errors:
                            log.critical("Encountered error removing artifact %s from datastore %s: %s",
                                         location.uri, self.name, e)
                        else:
                            raise

            # Now must remove the entry from the internal registry even if
            # the artifact removal failed and was ignored,
            # otherwise the removal check above will never be true
            try:
                # There may be multiple rows associated with this ref
                # depending on disassembly
                self.removeStoredItemInfo(ref)
            except Exception as e:
                if ignore_errors:
                    log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
                                ref.id, location.uri, self.name, e)
                    continue
                else:
                    raise

        # Inform registry that we have removed items from datastore
        # This should work even if another process is clearing out those rows
        self.registry.emptyDatasetLocationsTrash(self.name, trashed)
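
    # Illustrative sketch (not part of the original module) of the two-step
    # removal implemented above: ``trash`` only marks the dataset in the
    # registry; artifacts and internal records are removed later by
    # ``emptyTrash``.
    #
    #   datastore.trash(ref)     # mark as trashed
    #   datastore.emptyTrash()   # delete artifacts and internal records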

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks that all the supplied entities have valid file
        templates and also have formatters defined.
        """

        templateFailed = None
        try:
            self.templates.validateTemplates(entities, logFailures=logFailures)
        except FileTemplateValidationError as e:
            templateFailed = str(e)

        formatterFailed = []
        for entity in entities:
            try:
                self.formatterFactory.getFormatterClass(entity)
            except KeyError as e:
                formatterFailed.append(str(e))
                if logFailures:
                    log.fatal("Formatter failure: %s", e)

        if templateFailed or formatterFailed:
            messages = []
            if templateFailed:
                messages.append(templateFailed)
            if formatterFailed:
                messages.append(",".join(formatterFailed))
            msg = ";\n".join(messages)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self):
        # Docstring is inherited from base class
        return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
            self.constraints.getLookupKeys()

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        # The key can be valid in either formatters or templates so we can
        # only check the template if it exists
        if lookupKey in self.templates:
            try:
                self.templates[lookupKey].validateTemplate(entity)
            except FileTemplateValidationError as e:
                raise DatastoreValidationError(e) from e