Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 81%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Generic file-based datastore code."""
24__all__ = ("FileLikeDatastore", )
26import logging
27from abc import abstractmethod
29from sqlalchemy import Integer, String
31from dataclasses import dataclass
32from typing import Optional, List, Type
34from lsst.daf.butler import (
35 CompositesMap,
36 Config,
37 FileDataset,
38 DatasetRef,
39 DatasetType,
40 DatasetTypeNotSupportedError,
41 Datastore,
42 DatastoreConfig,
43 DatastoreValidationError,
44 FakeDatasetRef,
45 FileDescriptor,
46 FileTemplates,
47 FileTemplateValidationError,
48 Formatter,
49 FormatterFactory,
50 Location,
51 LocationFactory,
52 StorageClass,
53 StoredFileInfo,
54)
56from lsst.daf.butler import ddl
57from lsst.daf.butler.registry.interfaces import ReadOnlyDatabaseError
59from lsst.daf.butler.core.repoRelocation import replaceRoot
60from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional
61from .genericDatastore import GenericBaseDatastore
63log = logging.getLogger(__name__)
66class _IngestPrepData(Datastore.IngestPrepData):
67 """Helper class for FileLikeDatastore ingest implementation.
69 Parameters
70 ----------
71 datasets : `list` of `FileDataset`
72 Files to be ingested by this datastore.
73 """
74 def __init__(self, datasets: List[FileDataset]):
75 super().__init__(ref for dataset in datasets for ref in dataset.refs)
76 self.datasets = datasets
79@dataclass(frozen=True)
80class DatastoreFileGetInformation:
81 """Collection of useful parameters needed to retrieve a file from
82 a Datastore.
83 """
85 location: Location
86 """The location from which to read the dataset."""
88 formatter: Formatter
89 """The `Formatter` to use to deserialize the dataset."""
91 info: StoredFileInfo
92 """Stored information about this file and its formatter."""
94 assemblerParams: dict
95 """Parameters to use for post-processing the retrieved dataset."""
97 component: Optional[str]
98 """The component to be retrieved (can be `None`)."""
100 readStorageClass: StorageClass
101 """The `StorageClass` of the dataset being read."""
104class FileLikeDatastore(GenericBaseDatastore):
105 """Generic Datastore for file-based implementations.
107 Should always be sub-classed since key abstract methods are missing.
109 Parameters
110 ----------
111 config : `DatastoreConfig` or `str`
112 Configuration as either a `Config` object or URI to file.
114 Raises
115 ------
116 ValueError
117 If root location does not exist and ``create`` is `False` in the
118 configuration.
119 """
121 defaultConfigFile = None
122 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
123 absolute path. Can be None if no defaults specified.
124 """
126 root: str
127 """Root directory or URI of this `Datastore`."""
129 locationFactory: LocationFactory
130 """Factory for creating locations relative to the datastore root."""
132 formatterFactory: FormatterFactory
133 """Factory for creating instances of formatters."""
135 templates: FileTemplates
136 """File templates that can be used by this `Datastore`."""
138 composites: CompositesMap
139 """Determines whether a dataset should be disassembled on put."""
141 @classmethod
142 def setConfigRoot(cls, root, config, full, overwrite=True):
143 """Set any filesystem-dependent config options for this Datastore to
144 be appropriate for a new empty repository with the given root.
146 Parameters
147 ----------
148 root : `str`
149 URI to the root of the data repository.
150 config : `Config`
151 A `Config` to update. Only the subset understood by
152 this component will be updated. Will not expand
153 defaults.
154 full : `Config`
155 A complete config with all defaults expanded that can be
156 converted to a `DatastoreConfig`. Read-only and will not be
157 modified by this method.
158 Repository-specific options that should not be obtained
159 from defaults when Butler instances are constructed
160 should be copied from ``full`` to ``config``.
161 overwrite : `bool`, optional
162 If `False`, do not modify a value in ``config`` if the value
163 already exists. Default is always to overwrite with the provided
164 ``root``.
166 Notes
167 -----
168 If a keyword is explicitly defined in the supplied ``config`` it
169 will not be overridden by this method if ``overwrite`` is `False`.
170 This allows explicit values set in external configs to be retained.
171 """
172 Config.updateParameters(DatastoreConfig, config, full,
173 toUpdate={"root": root},
174 toCopy=("cls", ("records", "table")), overwrite=overwrite)
176 @classmethod
177 def makeTableSpec(cls):
178 return ddl.TableSpec(
179 fields=NamedValueSet([
180 ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
181 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
182 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
183 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
184 # Use empty string to indicate no component
185 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
186 # TODO: should checksum be Base64Bytes instead?
187 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
188 ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
189 ]),
190 unique=frozenset(),
191 )
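# Each row of this opaque table corresponds to one stored file; a record as
# built by `addStoredItemInfo` below looks roughly like the following
# (all values, including the formatter class name, are illustrative):
#
#     {"dataset_id": 42,
#      "path": "raw/r/raw_42.yaml",
#      "formatter": "mypackage.formatters.MyFormatter",   # hypothetical class
#      "storage_class": "StructuredDataDict",
#      "component": "",          # empty string means "no component"
#      "checksum": None,
#      "file_size": 1024}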
193 def __init__(self, config, registry, butlerRoot=None):
194 super().__init__(config, registry)
195 if "root" not in self.config: 195 ↛ 196line 195 didn't jump to line 196, because the condition on line 195 was never true
196 raise ValueError("No root directory specified in configuration")
198 # Name ourselves either using an explicit name or a name
199 # derived from the (unexpanded) root
200 if "name" in self.config:
201 self.name = self.config["name"]
202 else:
203 # We use the unexpanded root in the name to indicate that this
204 # datastore can be moved without having to update registry.
205 self.name = "{}@{}".format(type(self).__name__,
206 self.config["root"])
208 # Support repository relocation in config
209 # Existence of self.root is checked in subclass
210 self.root = replaceRoot(self.config["root"], butlerRoot)
212 self.locationFactory = LocationFactory(self.root)
213 self.formatterFactory = FormatterFactory()
215 # Now associate formatters with storage classes
216 self.formatterFactory.registerFormatters(self.config["formatters"],
217 universe=self.registry.dimensions)
219 # Read the file naming templates
220 self.templates = FileTemplates(self.config["templates"],
221 universe=self.registry.dimensions)
223 # See if composites should be disassembled
224 self.composites = CompositesMap(self.config["composites"],
225 universe=self.registry.dimensions)
227 # Storage of paths and formatters, keyed by dataset_id
228 self._tableName = self.config["records", "table"]
229 try:
230 registry.registerOpaqueTable(self._tableName, self.makeTableSpec())
231 except ReadOnlyDatabaseError:
232 # If the database is read only and we just tried and failed to
233 # create a table, it means someone is trying to create a read-only
234 # butler client for an empty repo. That should be okay, as long
235 # as they then try to get any datasets before some other client
236 creates the table. Chances are they're just validating
237 # configuration.
238 pass
240 # Determine whether checksums should be used
241 self.useChecksum = self.config.get("checksum", True)
243 def __str__(self):
244 return self.root
246 @abstractmethod
247 def _artifact_exists(self, location):
248 """Check that an artifact exists in this datastore at the specified
249 location.
251 Parameters
252 ----------
253 location : `Location`
254 Expected location of the artifact associated with this datastore.
256 Returns
257 -------
258 exists : `bool`
259 `True` if the location can be found, `False` otherwise.
260 """
261 raise NotImplementedError()
263 @abstractmethod
264 def _delete_artifact(self, location):
265 """Delete the artifact from the datastore.
267 Parameters
268 ----------
269 location : `Location`
270 Location of the artifact associated with this datastore.
271 """
272 raise NotImplementedError()
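# A hypothetical local-filesystem subclass might implement the two artifact
# hooks above with plain `os` calls; this is a sketch only (it assumes
# ``Location.path`` resolves to a local filesystem path) and is not
# necessarily how the shipped subclasses are written:
#
#     import os
#
#     class MyLocalDatastore(FileLikeDatastore):
#
#         def _artifact_exists(self, location):
#             return os.path.exists(location.path)
#
#         def _delete_artifact(self, location):
#             os.remove(location.path)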
274 def addStoredItemInfo(self, refs, infos):
275 # Docstring inherited from GenericBaseDatastore
276 records = []
277 for ref, info in zip(refs, infos):
278 # Component should come from ref and fall back on info
279 component = ref.datasetType.component()
280 if component is None and info.component is not None: 280 ↛ 281
281 component = info.component
282 if component is None:
283 # Use empty string since we want this to be part of the
284 # primary key.
285 component = ""
286 records.append(
287 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
288 storage_class=info.storageClass.name, component=component,
289 checksum=info.checksum, file_size=info.file_size)
290 )
291 self.registry.insertOpaqueData(self._tableName, *records)
293 def getStoredItemInfo(self, ref):
294 # Docstring inherited from GenericBaseDatastore
296 where = {"dataset_id": ref.id}
298 # If we have no component we want the row from this table without
299 # a component. If we do have a component we either need the row
300 # with no component or the row with the component, depending on how
301 this dataset was disassembled.
303 # if we are emptying trash we won't have real refs so can't constrain
304 # by component. Will need to fix this to return multiple matches
305 # in future.
306 try:
307 component = ref.datasetType.component()
308 except AttributeError:
309 component = None
310 else:
311 if component is None:
312 where["component"] = ""
314 # Look for the dataset_id -- there might be multiple matches
315 # if we have disassembled the dataset.
316 records = list(self.registry.fetchOpaqueData(self._tableName, **where))
317 if len(records) == 0: 317 ↛ 318
318 raise KeyError(f"Unable to retrieve location associated with dataset {ref}.")
320 # if we are not asking for a component
321 if not component and len(records) != 1: 321 ↛ 322
322 raise RuntimeError(f"Got {len(records)} from location query of dataset {ref}")
324 # if we had a FakeDatasetRef we pick the first record regardless
325 if isinstance(ref, FakeDatasetRef): 325 ↛ 326
326 record = records[0]
327 else:
328 records_by_component = {}
329 for r in records:
330 this_component = r["component"] if r["component"] else None
331 records_by_component[this_component] = r
333 # Look for component by name else fall back to the parent
334 for lookup in (component, None): 334 ↛ 339
335 if lookup in records_by_component: 335 ↛ 334
336 record = records_by_component[lookup]
337 break
338 else:
339 raise KeyError(f"Unable to retrieve location for component {component} associated with "
340 f"dataset {ref}.")
342 # Convert name of StorageClass to instance
343 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
345 return StoredFileInfo(formatter=record["formatter"],
346 path=record["path"],
347 storageClass=storageClass,
348 component=component,
349 checksum=record["checksum"],
350 file_size=record["file_size"])
352 def getStoredItemsInfo(self, ref):
353 # Docstring inherited from GenericBaseDatastore
355 # Look for the dataset_id -- there might be multiple matches
356 # if we have disassembled the dataset.
357 records = list(self.registry.fetchOpaqueData(self._tableName, dataset_id=ref.id))
359 results = []
360 for record in records:
361 # Convert name of StorageClass to instance
362 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
363 component = record["component"] if record["component"] else None
365 info = StoredFileInfo(formatter=record["formatter"],
366 path=record["path"],
367 storageClass=storageClass,
368 component=component,
369 checksum=record["checksum"],
370 file_size=record["file_size"])
371 results.append(info)
373 return results
375 def _registered_refs_per_artifact(self, pathInStore):
376 """Return all dataset refs associated with the supplied path.
378 Parameters
379 ----------
380 pathInStore : `str`
381 Path of interest in the data store.
383 Returns
384 -------
385 ids : `set` of `int`
386 All `DatasetRef` IDs associated with this path.
387 """
388 records = list(self.registry.fetchOpaqueData(self._tableName, path=pathInStore))
389 ids = {r["dataset_id"] for r in records}
390 return ids
392 def removeStoredItemInfo(self, ref):
393 # Docstring inherited from GenericBaseDatastore
394 self.registry.deleteOpaqueData(self._tableName, dataset_id=ref.id)
396 def _get_dataset_location_info(self, ref):
397 """Find the `Location` of the requested dataset in the
398 `Datastore` and the associated stored file information.
400 Parameters
401 ----------
402 ref : `DatasetRef`
403 Reference to the required `Dataset`.
405 Returns
406 -------
407 location : `Location`
408 Location of the dataset within the datastore.
409 Returns `None` if the dataset can not be located.
410 info : `StoredFileInfo`
411 Stored information about this file and its formatter.
412 """
413 # Get the file information (this will fail if no file)
414 try:
415 storedFileInfo = self.getStoredItemInfo(ref)
416 except KeyError:
417 return None, None
419 # Use the path to determine the location
420 location = self.locationFactory.fromPath(storedFileInfo.path)
422 return location, storedFileInfo
424 def _get_dataset_locations_info(self, ref):
425 r"""Find all the `Location`\ s of the requested dataset in the
426 `Datastore` and the associated stored file information.
428 Parameters
429 ----------
430 ref : `DatasetRef`
431 Reference to the required `Dataset`.
433 Returns
434 -------
435 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
436 Location of the dataset within the datastore and
437 stored information about each file and its formatter.
438 """
439 # Get the file information (this will fail if no file)
440 records = self.getStoredItemsInfo(ref)
442 # Use the path to determine the location
443 return [(self.locationFactory.fromPath(r.path), r) for r in records]
445 def _can_remove_dataset_artifact(self, ref, location):
446 """Check that there is only one dataset associated with the
447 specified artifact.
449 Parameters
450 ----------
451 ref : `DatasetRef`
452 Dataset to be removed.
453 location : `Location`
454 The location of the artifact to be removed.
456 Returns
457 -------
458 can_remove : `bool`
459 `True` if the artifact can be safely removed.
460 """
462 # Get all entries associated with this path
463 allRefs = self._registered_refs_per_artifact(location.pathInStore)
464 if not allRefs: 464 ↛ 465
465 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
467 # Get all the refs associated with this dataset if it is a composite
468 theseRefs = {r.id for r in ref.flatten([ref])}
470 # Remove these refs from all the refs and if there is nothing left
471 # then we can delete
472 remainingRefs = allRefs - theseRefs
474 if remainingRefs:
475 return False
476 return True
478 def _prepare_for_get(self, ref, parameters=None):
479 """Check parameters for ``get`` and obtain formatter and
480 location.
482 Parameters
483 ----------
484 ref : `DatasetRef`
485 Reference to the required Dataset.
486 parameters : `dict`
487 `StorageClass`-specific parameters that specify, for example,
488 a slice of the dataset to be loaded.
490 Returns
491 -------
492 getInfo : `list` [`DatastoreFileGetInformation`]
493 Parameters needed to retrieve each file.
494 """
495 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
497 # Get file metadata and internal metadata
498 fileLocations = self._get_dataset_locations_info(ref)
499 if not fileLocations:
500 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
502 # The storage class we want to use eventually
503 refStorageClass = ref.datasetType.storageClass
505 # Check that the supplied parameters are suitable for the type read
506 refStorageClass.validateParameters(parameters)
508 if len(fileLocations) > 1:
509 disassembled = True
510 else:
511 disassembled = False
513 # Is this a component request?
514 refComponent = ref.datasetType.component()
516 fileGetInfo = []
517 for location, storedFileInfo in fileLocations:
519 # The storage class used to write the file
520 writeStorageClass = storedFileInfo.storageClass
522 # If this has been disassembled we need read to match the write
523 if disassembled:
524 readStorageClass = writeStorageClass
525 else:
526 readStorageClass = refStorageClass
528 formatter = getInstanceOf(storedFileInfo.formatter,
529 FileDescriptor(location, readStorageClass=readStorageClass,
530 storageClass=writeStorageClass, parameters=parameters),
531 ref.dataId)
533 _, notFormatterParams = formatter.segregateParameters()
535 # Of the remaining parameters, extract the ones supported by
536 # this StorageClass (for components not all will be handled)
537 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
539 # The ref itself could be a component if the dataset was
540 # disassembled by butler, or we disassembled in datastore and
541 # components came from the datastore records
542 if storedFileInfo.component:
543 component = storedFileInfo.component
544 else:
545 component = refComponent
547 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
548 assemblerParams, component, readStorageClass))
550 return fileGetInfo
552 def _prepare_for_put(self, inMemoryDataset, ref):
553 """Check the arguments for ``put`` and obtain formatter and
554 location.
556 Parameters
557 ----------
558 inMemoryDataset : `object`
559 The dataset to store.
560 ref : `DatasetRef`
561 Reference to the associated Dataset.
563 Returns
564 -------
565 location : `Location`
566 The location to write the dataset.
567 formatter : `Formatter`
568 The `Formatter` to use to write the dataset.
570 Raises
571 ------
572 TypeError
573 Supplied object and storage class are inconsistent.
574 DatasetTypeNotSupportedError
575 The associated `DatasetType` is not handled by this datastore.
576 """
577 self._validate_put_parameters(inMemoryDataset, ref)
579 # Work out output file name
580 try:
581 template = self.templates.getTemplate(ref)
582 except KeyError as e:
583 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
585 location = self.locationFactory.fromPath(template.format(ref))
587 # Get the formatter based on the storage class
588 storageClass = ref.datasetType.storageClass
589 try:
590 formatter = self.formatterFactory.getFormatter(ref,
591 FileDescriptor(location,
592 storageClass=storageClass),
593 ref.dataId)
594 except KeyError as e:
595 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e
597 return location, formatter
599 @abstractmethod
600 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
601 """Standardize the path of a to-be-ingested file.
603 Parameters
604 ----------
605 path : `str`
606 Path of a file to be ingested.
607 transfer : `str`, optional
608 How (and whether) the dataset should be added to the datastore.
609 See `ingest` for details of transfer modes.
610 This implementation is provided only so
611 `NotImplementedError` can be raised if the mode is not supported;
612 actual transfers are deferred to `_extractIngestInfo`.
614 Returns
615 -------
616 path : `str`
617 New path in what the datastore considers standard form.
619 Notes
620 -----
621 Subclasses of `FileLikeDatastore` should implement this method instead
622 of `_prepIngest`. It should not modify the data repository or given
623 file in any way.
625 Raises
626 ------
627 NotImplementedError
628 Raised if the datastore does not support the given transfer mode
629 (including the case where ingest is not supported at all).
630 FileNotFoundError
631 Raised if one of the given files does not exist.
632 """
633 raise NotImplementedError("Must be implemented by subclasses.")
635 @abstractmethod
636 def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
637 transfer: Optional[str] = None) -> StoredFileInfo:
638 """Relocate (if necessary) and extract `StoredFileInfo` from a
639 to-be-ingested file.
641 Parameters
642 ----------
643 path : `str`
644 Path of a file to be ingested.
645 ref : `DatasetRef`
646 Reference for the dataset being ingested. Guaranteed to have
647 ``dataset_id`` not `None`.
648 formatter : `type`
649 `Formatter` subclass to use for this dataset.
650 transfer : `str`, optional
651 How (and whether) the dataset should be added to the datastore.
652 See `ingest` for details of transfer modes.
654 Returns
655 -------
656 info : `StoredFileInfo`
657 Internal datastore record for this file. This will be inserted by
658 the caller; `_extractIngestInfo` is only responsible for
659 creating and populating the struct.
661 Raises
662 ------
663 FileNotFoundError
664 Raised if one of the given files does not exist.
665 FileExistsError
666 Raised if transfer is not `None` but the (internal) location the
667 file would be moved to is already occupied.
668 """
669 raise NotImplementedError("Must be implemented by subclasses.")
671 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
672 # Docstring inherited from Datastore._prepIngest.
673 filtered = []
674 for dataset in datasets:
675 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
676 if not acceptable:
677 continue
678 else:
679 dataset.refs = acceptable
680 if dataset.formatter is None:
681 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
682 else:
683 dataset.formatter = getClassOf(dataset.formatter)
684 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
685 filtered.append(dataset)
686 return _IngestPrepData(filtered)
688 @transactional
689 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None):
690 # Docstring inherited from Datastore._finishIngest.
691 refsAndInfos = []
692 for dataset in prepData.datasets:
693 # Do ingest as if the first dataset ref is associated with the file
694 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
695 transfer=transfer)
696 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
697 self._register_datasets(refsAndInfos)
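# The two-phase ingest above is normally driven by the base class's public
# ``ingest`` method; a sketch of exercising it directly, assuming ``ref`` is
# an already-resolved `DatasetRef` and the path and transfer mode are purely
# illustrative:
#
#     dataset = FileDataset(path="/staging/raw_000001.yaml", refs=[ref])
#     prepData = datastore._prepIngest(dataset, transfer="symlink")
#     datastore._finishIngest(prepData, transfer="symlink")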
699 def exists(self, ref):
700 """Check if the dataset exists in the datastore.
702 Parameters
703 ----------
704 ref : `DatasetRef`
705 Reference to the required dataset.
707 Returns
708 -------
709 exists : `bool`
710 `True` if the entity exists in the `Datastore`.
711 """
712 fileLocations = self._get_dataset_locations_info(ref)
713 if not fileLocations:
714 return False
715 for location, _ in fileLocations:
716 if not self._artifact_exists(location):
717 return False
719 return True
721 def getUri(self, ref, predict=False):
722 """URI to the Dataset.
724 Parameters
725 ----------
726 ref : `DatasetRef`
727 Reference to the required Dataset.
728 predict : `bool`
729 If `True`, allow URIs to be returned of datasets that have not
730 been written.
732 Returns
733 -------
734 uri : `str`
735 URI string pointing to the dataset within the datastore. If the
736 dataset does not exist in the datastore, and if ``predict`` is
737 `True`, the URI will be a prediction and will include a URI
738 fragment "#predicted".
739 If the datastore does not have entities that relate well
740 to the concept of a URI the returned URI string will be
741 descriptive. The returned URI is not guaranteed to be obtainable.
743 Raises
744 ------
745 FileNotFoundError
746 A URI has been requested for a dataset that does not exist and
747 guessing is not allowed.
749 Notes
750 -----
751 When a predicted URI is requested an attempt will be made to form
752 a reasonable URI based on file templates and the expected formatter.
753 """
754 # if this has never been written then we have to guess
755 if not self.exists(ref):
756 if not predict:
757 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
759 template = self.templates.getTemplate(ref)
760 location = self.locationFactory.fromPath(template.format(ref))
761 storageClass = ref.datasetType.storageClass
762 formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location,
763 storageClass=storageClass))
764 # Try to use the extension attribute but ignore problems if the
765 # formatter does not define one.
766 try:
767 location = formatter.makeUpdatedLocation(location)
768 except Exception:
769 # Use the default extension
770 pass
772 # Add a URI fragment to indicate this is a guess
773 return location.uri + "#predicted"
775 # If this is a ref that we have written we can get the path.
776 # Get file metadata and internal metadata
777 storedFileInfo = self.getStoredItemInfo(ref)
779 # Use the path to determine the location
780 location = self.locationFactory.fromPath(storedFileInfo.path)
782 return location.uri
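# A usage sketch for URI lookup, assuming ``datastore`` and ``ref`` already
# exist; the dataset may or may not have been written yet:
#
#     try:
#         uri = datastore.getUri(ref)
#     except FileNotFoundError:
#         # Fall back to a template-based guess, flagged with "#predicted".
#         uri = datastore.getUri(ref, predict=True)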
784 def get(self, ref, parameters=None):
785 """Load an InMemoryDataset from the store.
787 Parameters
788 ----------
789 ref : `DatasetRef`
790 Reference to the required Dataset.
791 parameters : `dict`
792 `StorageClass`-specific parameters that specify, for example,
793 a slice of the dataset to be loaded.
795 Returns
796 -------
797 inMemoryDataset : `object`
798 Requested dataset or slice thereof as an InMemoryDataset.
800 Raises
801 ------
802 FileNotFoundError
803 Requested dataset can not be retrieved.
804 TypeError
805 Return value from formatter has unexpected type.
806 ValueError
807 Formatter failed to process the dataset.
808 """
809 allGetInfo = self._prepare_for_get(ref, parameters)
810 refComponent = ref.datasetType.component()
812 if len(allGetInfo) > 1 and not refComponent:
813 # This was a disassembled dataset spread over multiple files
814 # and we need to put them all back together again.
815 # Read into memory and then assemble
816 usedParams = set()
817 components = {}
818 for getInfo in allGetInfo:
819 # assemblerParams are parameters not understood by the
820 # associated formatter.
821 usedParams.update(set(getInfo.assemblerParams))
823 component = getInfo.component
824 # We do not want the formatter to think it's reading
825 # a component though because it is really reading a
826 # standalone dataset -- always tell reader it is not a
827 # component.
828 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
830 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
832 # Any unused parameters will have to be passed to the assembler
833 if parameters:
834 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
835 else:
836 unusedParams = {}
838 # Process parameters
839 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
840 parameters=unusedParams)
842 else:
843 # Single file request or component from that composite file
844 allComponents = {i.component: i for i in allGetInfo}
845 for lookup in (refComponent, None): 845 ↛ 850
846 if lookup in allComponents: 846 ↛ 845
847 getInfo = allComponents[lookup]
848 break
849 else:
850 raise FileNotFoundError(f"Component {refComponent} not found "
851 f"for ref {ref} in datastore {self.name}")
853 return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None)
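# A read sketch, assuming ``datastore`` and ``ref`` exist; the parameter
# name "subset" is hypothetical, since valid parameters are defined by the
# dataset's StorageClass rather than by this datastore:
#
#     full = datastore.get(ref)
#     part = datastore.get(ref, parameters={"subset": some_slice})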
855 @transactional
856 def put(self, inMemoryDataset, ref):
857 """Write a InMemoryDataset with a given `DatasetRef` to the store.
859 Parameters
860 ----------
861 inMemoryDataset : `object`
862 The dataset to store.
863 ref : `DatasetRef`
864 Reference to the associated Dataset.
866 Raises
867 ------
868 TypeError
869 Supplied object and storage class are inconsistent.
870 DatasetTypeNotSupportedError
871 The associated `DatasetType` is not handled by this datastore.
873 Notes
874 -----
875 If the datastore is configured to reject certain dataset types it
876 is possible that the put will fail and raise a
877 `DatasetTypeNotSupportedError`. The main use case for this is to
878 allow `ChainedDatastore` to put to multiple datastores without
879 requiring that every datastore accepts the dataset.
880 """
882 doDisassembly = self.composites.shouldBeDisassembled(ref)
883 # doDisassembly = True
885 artifacts = []
886 if doDisassembly:
887 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
888 for component, componentInfo in components.items():
889 compTypeName = ref.datasetType.componentTypeName(component)
890 # Don't recurse because we want to take advantage of
891 # bulk insert -- need a new DatasetRef that refers to the
892 # same dataset_id but has the component DatasetType
893 # DatasetType does not refer to the types of components
894 # So we construct one ourselves.
895 compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions,
896 storageClass=componentInfo.storageClass)
897 compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False)
898 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
899 artifacts.append((compRef, storedInfo))
900 else:
901 # Write the entire thing out
902 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
903 artifacts.append((ref, storedInfo))
905 self._register_datasets(artifacts)
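# A put sketch, assuming ``inMemoryDataset`` matches the storage class of
# ``ref``; whether it lands as a single artifact or as per-component files
# is decided by the ``composites`` configuration, not by the caller:
#
#     datastore.put(inMemoryDataset, ref)
#     assert datastore.exists(ref)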
907 @transactional
908 def trash(self, ref, ignore_errors=True):
909 """Indicate to the datastore that a dataset can be removed.
911 Parameters
912 ----------
913 ref : `DatasetRef`
914 Reference to the required Dataset.
915 ignore_errors : `bool`
916 If `True` return without error even if something went wrong.
917 Problems could occur if another process is simultaneously trying
918 to delete.
920 Raises
921 ------
922 FileNotFoundError
923 Attempt to remove a dataset that does not exist.
924 """
925 # Get file metadata and internal metadata
926 log.debug("Trashing %s in datastore %s", ref, self.name)
928 fileLocations = self._get_dataset_locations_info(ref)
930 if not fileLocations:
931 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
932 if ignore_errors:
933 log.warning(err_msg)
934 return
935 else:
936 raise FileNotFoundError(err_msg)
938 for location, storedFileInfo in fileLocations:
939 if not self._artifact_exists(location): 939 ↛ 940
940 err_msg = f"Dataset is known to datastore {self.name} but " \
941 f"associated artifact ({location.uri}) is missing"
942 if ignore_errors:
943 log.warning(err_msg)
944 return
945 else:
946 raise FileNotFoundError(err_msg)
948 # Mark dataset as trashed
949 try:
950 self._move_to_trash_in_registry(ref)
951 except Exception as e:
952 if ignore_errors:
953 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
954 f"but encountered an error: {e}")
955 pass
956 else:
957 raise
959 @transactional
960 def emptyTrash(self, ignore_errors=True):
961 """Remove all datasets from the trash.
963 Parameters
964 ----------
965 ignore_errors : `bool`
966 If `True` return without error even if something went wrong.
967 Problems could occur if another process is simultaneously trying
968 to delete.
969 """
970 log.debug("Emptying trash in datastore %s", self.name)
971 trashed = self.registry.getTrashedDatasets(self.name)
973 for ref in trashed:
974 fileLocations = self._get_dataset_locations_info(ref)
976 for location, _ in fileLocations:
978 if location is None: 978 ↛ 979
979 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
980 if ignore_errors:
981 log.warning(err_msg)
982 continue
983 else:
984 raise FileNotFoundError(err_msg)
986 if not self._artifact_exists(location): 986 ↛ 987
987 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
988 if ignore_errors:
989 log.warning(err_msg)
990 continue
991 else:
992 raise FileNotFoundError(err_msg)
994 # Can only delete the artifact if there are no references
995 # to the file from untrashed dataset refs.
996 if self._can_remove_dataset_artifact(ref, location):
997 # Point of no return for this artifact
998 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
999 try:
1000 self._delete_artifact(location)
1001 except Exception as e:
1002 if ignore_errors:
1003 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1004 location.uri, self.name, e)
1005 else:
1006 raise
1008 # Now must remove the entry from the internal registry even if
1009 # the artifact removal failed and was ignored,
1010 # otherwise the removal check above will never be true
1011 try:
1012 # There may be multiple rows associated with this ref
1013 # depending on disassembly
1014 self.removeStoredItemInfo(ref)
1015 except Exception as e:
1016 if ignore_errors:
1017 log.warning(f"Error removing dataset %s (%s) from internal registry of %s: %s",
1018 ref.id, location.uri, self.name, e)
1019 continue
1020 else:
1021 raise
1023 # Inform registry that we have removed items from datastore
1024 # This should work even if another process is clearing out those rows
1025 self.registry.emptyDatasetLocationsTrash(self.name, trashed)
1027 def validateConfiguration(self, entities, logFailures=False):
1028 """Validate some of the configuration for this datastore.
1030 Parameters
1031 ----------
1032 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1033 Entities to test against this configuration. Can be differing
1034 types.
1035 logFailures : `bool`, optional
1036 If `True`, output a log message for every validation error
1037 detected.
1039 Raises
1040 ------
1041 DatastoreValidationError
1042 Raised if there is a validation problem with a configuration.
1043 All the problems are reported in a single exception.
1045 Notes
1046 -----
1047 This method checks that all the supplied entities have valid file
1048 templates and also have formatters defined.
1049 """
1051 templateFailed = None
1052 try:
1053 self.templates.validateTemplates(entities, logFailures=logFailures)
1054 except FileTemplateValidationError as e:
1055 templateFailed = str(e)
1057 formatterFailed = []
1058 for entity in entities:
1059 try:
1060 self.formatterFactory.getFormatterClass(entity)
1061 except KeyError as e:
1062 formatterFailed.append(str(e))
1063 if logFailures: 1063 ↛ 1058
1064 log.fatal("Formatter failure: %s", e)
1066 if templateFailed or formatterFailed:
1067 messages = []
1068 if templateFailed: 1068 ↛ 1069
1069 messages.append(templateFailed)
1070 if formatterFailed: 1070 ↛ 1072
1071 messages.append(",".join(formatterFailed))
1072 msg = ";\n".join(messages)
1073 raise DatastoreValidationError(msg)
1075 def getLookupKeys(self):
1076 # Docstring is inherited from base class
1077 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1078 self.constraints.getLookupKeys()
1080 def validateKey(self, lookupKey, entity):
1081 # Docstring is inherited from base class
1082 # The key can be valid in either formatters or templates so we can
1083 # only check the template if it exists
1084 if lookupKey in self.templates:
1085 try:
1086 self.templates[lookupKey].validateTemplate(entity)
1087 except FileTemplateValidationError as e:
1088 raise DatastoreValidationError(e) from e