Coverage for python/lsst/daf/butler/datastores/fileDatastore.py : 82%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from collections import defaultdict
35from dataclasses import dataclass
36from typing import (
37 TYPE_CHECKING,
38 Any,
39 ClassVar,
40 Dict,
41 Iterable,
42 List,
43 Mapping,
44 Optional,
45 Set,
46 Tuple,
47 Type,
48 Union,
49)
51from lsst.daf.butler import (
52 ButlerURI,
53 CompositesMap,
54 Config,
55 FileDataset,
56 DatasetId,
57 DatasetRef,
58 DatasetType,
59 DatasetTypeNotSupportedError,
60 Datastore,
61 DatastoreCacheManager,
62 DatastoreDisabledCacheManager,
63 DatastoreConfig,
64 DatastoreValidationError,
65 FileDescriptor,
66 FileTemplates,
67 FileTemplateValidationError,
68 Formatter,
69 FormatterFactory,
70 Location,
71 LocationFactory,
72 Progress,
73 StorageClass,
74 StoredFileInfo,
75)
77from lsst.daf.butler import ddl
78from lsst.daf.butler.registry.interfaces import (
79 ReadOnlyDatabaseError,
80 DatastoreRegistryBridge,
81)
83from lsst.daf.butler.core.repoRelocation import replaceRoot
84from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
85from .genericDatastore import GenericBaseDatastore
87 if TYPE_CHECKING:  87 ↛ 88 (condition never true)
88 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager
89 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
91log = logging.getLogger(__name__)
94class _IngestPrepData(Datastore.IngestPrepData):
95 """Helper class for FileDatastore ingest implementation.
97 Parameters
98 ----------
99 datasets : `list` of `FileDataset`
100 Files to be ingested by this datastore.
101 """
102 def __init__(self, datasets: List[FileDataset]):
103 super().__init__(ref for dataset in datasets for ref in dataset.refs)
104 self.datasets = datasets
107@dataclass(frozen=True)
108class DatastoreFileGetInformation:
109 """Collection of useful parameters needed to retrieve a file from
110 a Datastore.
111 """
113 location: Location
114 """The location from which to read the dataset."""
116 formatter: Formatter
117 """The `Formatter` to use to deserialize the dataset."""
119 info: StoredFileInfo
120 """Stored information about this file and its formatter."""
122 assemblerParams: Dict[str, Any]
123 """Parameters to use for post-processing the retrieved dataset."""
125 formatterParams: Dict[str, Any]
126 """Parameters that were understood by the associated formatter."""
128 component: Optional[str]
129 """The component to be retrieved (can be `None`)."""
131 readStorageClass: StorageClass
132 """The `StorageClass` of the dataset being read."""
135class FileDatastore(GenericBaseDatastore):
136 """Generic Datastore for file-based implementations.
138 Should always be sub-classed since key abstract methods are missing.
140 Parameters
141 ----------
142 config : `DatastoreConfig` or `str`
143 Configuration as either a `Config` object or URI to file.
144 bridgeManager : `DatastoreRegistryBridgeManager`
145 Object that manages the interface between `Registry` and datastores.
146 butlerRoot : `str`, optional
147 New datastore root to use to override the configuration value.
149 Raises
150 ------
151 ValueError
152 If root location does not exist and ``create`` is `False` in the
153 configuration.
154 """
156 defaultConfigFile: ClassVar[Optional[str]] = None
157 """Path to configuration defaults. Accessed within the ``config`` resource
158 or relative to a search path. Can be None if no defaults specified.
159 """
161 root: ButlerURI
162 """Root directory URI of this `Datastore`."""
164 locationFactory: LocationFactory
165 """Factory for creating locations relative to the datastore root."""
167 formatterFactory: FormatterFactory
168 """Factory for creating instances of formatters."""
170 templates: FileTemplates
171 """File templates that can be used by this `Datastore`."""
173 composites: CompositesMap
174 """Determines whether a dataset should be disassembled on put."""
176 defaultConfigFile = "datastores/fileDatastore.yaml"
177 """Path to configuration defaults. Accessed within the ``config`` resource
178 or relative to a search path. Can be None if no defaults specified.
179 """
181 @classmethod
182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
183 """Set any filesystem-dependent config options for this Datastore to
184 be appropriate for a new empty repository with the given root.
186 Parameters
187 ----------
188 root : `str`
189 URI to the root of the data repository.
190 config : `Config`
191 A `Config` to update. Only the subset understood by
192 this component will be updated. Will not expand
193 defaults.
194 full : `Config`
195 A complete config with all defaults expanded that can be
196 converted to a `DatastoreConfig`. Read-only and will not be
197 modified by this method.
198 Repository-specific options that should not be obtained
199 from defaults when Butler instances are constructed
200 should be copied from ``full`` to ``config``.
201 overwrite : `bool`, optional
202 If `False`, do not modify a value in ``config`` if the value
203 already exists. Default is always to overwrite with the provided
204 ``root``.
206 Notes
207 -----
208 If a keyword is explicitly defined in the supplied ``config`` it
209 will not be overridden by this method if ``overwrite`` is `False`.
210 This allows explicit values set in external configs to be retained.
211 """
212 Config.updateParameters(DatastoreConfig, config, full,
213 toUpdate={"root": root},
214 toCopy=("cls", ("records", "table")), overwrite=overwrite)
216 @classmethod
217 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
218 return ddl.TableSpec(
219 fields=[
220 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
221 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
222 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
223 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
224 # Use empty string to indicate no component
225 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
226 # TODO: should checksum be Base64Bytes instead?
227 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
228 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
229 ],
230 unique=frozenset(),
231 indexes=[tuple(["path"])],
232 )
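# Editor's note (illustrative sketch, not part of the original source): a row
# in the records table defined above might look like the following; the
# concrete values are assumptions for illustration only.
#
#   dataset_id    -> 42 (or a UUID, depending on the registry's ID column type)
#   path          -> "calib/bias/bias_r_20200101.fits"
#   formatter     -> "lsst.obs.base.formatters.fitsExposure.FitsExposureFormatter"
#   storage_class -> "ExposureF"
#   component     -> ""        (empty string means "no component")
#   checksum      -> None      (only filled in when checksums are enabled)
#   file_size     -> 16531200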
234 def __init__(self, config: Union[DatastoreConfig, str],
235 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
236 super().__init__(config, bridgeManager)
237 if "root" not in self.config:  237 ↛ 238 (condition never true)
238 raise ValueError("No root directory specified in configuration")
240 # Name ourselves either using an explicit name or a name
241 # derived from the (unexpanded) root
242 if "name" in self.config:
243 self.name = self.config["name"]
244 else:
245 # We use the unexpanded root in the name to indicate that this
246 # datastore can be moved without having to update registry.
247 self.name = "{}@{}".format(type(self).__name__,
248 self.config["root"])
250 # Support repository relocation in config
251 # Existence of self.root is checked in subclass
252 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
253 forceDirectory=True, forceAbsolute=True)
255 self.locationFactory = LocationFactory(self.root)
256 self.formatterFactory = FormatterFactory()
258 # Now associate formatters with storage classes
259 self.formatterFactory.registerFormatters(self.config["formatters"],
260 universe=bridgeManager.universe)
262 # Read the file naming templates
263 self.templates = FileTemplates(self.config["templates"],
264 universe=bridgeManager.universe)
266 # See if composites should be disassembled
267 self.composites = CompositesMap(self.config["composites"],
268 universe=bridgeManager.universe)
270 tableName = self.config["records", "table"]
271 try:
272 # Storage of paths and formatters, keyed by dataset_id
273 self._table = bridgeManager.opaque.register(
274 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType))
275 # Interface to Registry.
276 self._bridge = bridgeManager.register(self.name)
277 except ReadOnlyDatabaseError:
278 # If the database is read only and we just tried and failed to
279 # create a table, it means someone is trying to create a read-only
280 # butler client for an empty repo. That should be okay, as long
281 # as they then try to get any datasets before some other client
282 # creates the table. Chances are they're just validating
283 # configuration.
284 pass
286 # Determine whether checksums should be used - default to False
287 self.useChecksum = self.config.get("checksum", False)
289 # Determine whether we can fall back to configuration if a
290 # requested dataset is not known to registry
291 self.trustGetRequest = self.config.get("trust_get_request", False)
293 # Create a cache manager
294 self.cacheManager: AbstractDatastoreCacheManager
295 if "cached" in self.config:  295 ↛ 299 (condition never false)
296 self.cacheManager = DatastoreCacheManager(self.config["cached"],
297 universe=bridgeManager.universe)
298 else:
299 self.cacheManager = DatastoreDisabledCacheManager("",
300 universe=bridgeManager.universe)
302 # Check existence and create directory structure if necessary
303 if not self.root.exists():
304 if "create" not in self.config or not self.config["create"]:  304 ↛ 305 (condition never true)
305 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
306 try:
307 self.root.mkdir()
308 except Exception as e:
309 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
310 f" Got error: {e}") from e
312 def __str__(self) -> str:
313 return str(self.root)
315 @property
316 def bridge(self) -> DatastoreRegistryBridge:
317 return self._bridge
319 def _artifact_exists(self, location: Location) -> bool:
320 """Check that an artifact exists in this datastore at the specified
321 location.
323 Parameters
324 ----------
325 location : `Location`
326 Expected location of the artifact associated with this datastore.
328 Returns
329 -------
330 exists : `bool`
331 `True` if the location can be found, `False` otherwise.
332 """
333 log.debug("Checking if resource exists: %s", location.uri)
334 return location.uri.exists()
336 def _delete_artifact(self, location: Location) -> None:
337 """Delete the artifact from the datastore.
339 Parameters
340 ----------
341 location : `Location`
342 Location of the artifact associated with this datastore.
343 """
344 if location.pathInStore.isabs():  344 ↛ 345 (condition never true)
345 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
347 try:
348 location.uri.remove()
349 except FileNotFoundError:
350 log.debug("File %s did not exist and so could not be deleted.", location.uri)
351 raise
352 except Exception as e:
353 log.critical("Failed to delete file: %s (%s)", location.uri, e)
354 raise
355 log.debug("Successfully deleted file: %s", location.uri)
357 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
358 # Docstring inherited from GenericBaseDatastore
359 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
360 self._table.insert(*records)
362 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
363 # Docstring inherited from GenericBaseDatastore
365 # Look for the dataset_id -- there might be multiple matches
366 # if we have disassembled the dataset.
367 records = list(self._table.fetch(dataset_id=ref.id))
368 return [StoredFileInfo.from_record(record) for record in records]
370 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str,
371 Set[DatasetId]]:
372 """Return paths and associated dataset refs.
374 Parameters
375 ----------
376 paths : `list` of `str` or `ButlerURI`
377 All the paths to include in search.
379 Returns
380 -------
381 mapping : `dict` of [`str`, `set` [`DatasetId`]]
382 Mapping of each path to a set of associated database IDs.
383 """
384 records = list(self._table.fetch(path=[str(path) for path in paths]))
385 result = defaultdict(set)
386 for row in records:
387 result[row["path"]].add(row["dataset_id"])
388 return result
390 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]:
391 """Return all dataset refs associated with the supplied path.
393 Parameters
394 ----------
395 pathInStore : `ButlerURI`
396 Path of interest in the data store.
398 Returns
399 -------
400 ids : `set` of `DatasetId`
401 All `DatasetRef` IDs associated with this path.
402 """
403 records = list(self._table.fetch(path=str(pathInStore)))
404 ids = {r["dataset_id"] for r in records}
405 return ids
407 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
408 # Docstring inherited from GenericBaseDatastore
409 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
411 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
412 r"""Find all the `Location`\ s of the requested dataset in the
413 `Datastore` and the associated stored file information.
415 Parameters
416 ----------
417 ref : `DatasetRef`
418 Reference to the required `Dataset`.
420 Returns
421 -------
422 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
423 Location of the dataset within the datastore and
424 stored information about each file and its formatter.
425 """
426 # Get the file information (this will fail if no file)
427 records = self.getStoredItemsInfo(ref)
429 # Use the path to determine the location -- we need to take
430 # into account absolute URIs in the datastore record
431 return [(r.file_location(self.locationFactory), r) for r in records]
433 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
434 """Check that there is only one dataset associated with the
435 specified artifact.
437 Parameters
438 ----------
439 ref : `DatasetRef` or `FakeDatasetRef`
440 Dataset to be removed.
441 location : `Location`
442 The location of the artifact to be removed.
444 Returns
445 -------
446 can_remove : `bool`
447 `True` if the artifact can be safely removed.
448 """
449 # Can't ever delete absolute URIs.
450 if location.pathInStore.isabs():
451 return False
453 # Get all entries associated with this path
454 allRefs = self._registered_refs_per_artifact(location.pathInStore)
455 if not allRefs:
456 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
458 # Remove these refs from all the refs and if there is nothing left
459 # then we can delete
460 remainingRefs = allRefs - {ref.id}
462 if remainingRefs:
463 return False
464 return True
466 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
467 StoredFileInfo]]:
468 """Predict the location and related file information of the requested
469 dataset in this datastore.
471 Parameters
472 ----------
473 ref : `DatasetRef`
474 Reference to the required `Dataset`.
476 Returns
477 -------
478 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
479 Expected Location of the dataset within the datastore and
480 placeholder information about each file and its formatter.
482 Notes
483 -----
484 Uses the current configuration to determine how we would expect the
485 datastore files to have been written if we couldn't ask registry.
486 This is safe so long as there has been no change to datastore
487 configuration between writing the dataset and wanting to read it.
488 Will not work for files that have been ingested without using the
489 standard file template or default formatter.
490 """
492 # If we have a component ref we always need to ask the questions
493 # of the composite. If the composite is disassembled this routine
494 # should return all components. If the composite was not
495 # disassembled the composite is what is stored regardless of
496 # component request. Note that if the caller has disassembled
497 # a composite there is no way for this guess to know that
498 # without trying both the composite and component ref and seeing
499 # if there is something at the component Location even without
500 # disassembly being enabled.
501 if ref.datasetType.isComponent():
502 ref = ref.makeCompositeRef()
504 # See if the ref is a composite that should be disassembled
505 doDisassembly = self.composites.shouldBeDisassembled(ref)
507 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
509 if doDisassembly:
510 for component, componentStorage in ref.datasetType.storageClass.components.items():
511 compRef = ref.makeComponentRef(component)
512 location, formatter = self._determine_put_formatter_location(compRef)
513 all_info.append((location, formatter, componentStorage, component))
515 else:
516 # Always use the composite ref if no disassembly
517 location, formatter = self._determine_put_formatter_location(ref)
518 all_info.append((location, formatter, ref.datasetType.storageClass, None))
520 # Convert the list of tuples to have StoredFileInfo as second element
521 return [(location, StoredFileInfo(formatter=formatter,
522 path=location.pathInStore.path,
523 storageClass=storageClass,
524 component=component,
525 checksum=None,
526 file_size=-1))
527 for location, formatter, storageClass, component in all_info]
529 def _prepare_for_get(self, ref: DatasetRef,
530 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
531 """Check parameters for ``get`` and obtain formatter and
532 location.
534 Parameters
535 ----------
536 ref : `DatasetRef`
537 Reference to the required Dataset.
538 parameters : `dict`
539 `StorageClass`-specific parameters that specify, for example,
540 a slice of the dataset to be loaded.
542 Returns
543 -------
544 getInfo : `list` [`DatastoreFileGetInformation`]
545 Parameters needed to retrieve each file.
546 """
547 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
549 # Get file metadata and internal metadata
550 fileLocations = self._get_dataset_locations_info(ref)
551 if not fileLocations:
552 if not self.trustGetRequest:
553 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
554 # Assume the dataset is where we think it should be
555 fileLocations = self._get_expected_dataset_locations_info(ref)
557 # The storage class we want to use eventually
558 refStorageClass = ref.datasetType.storageClass
560 if len(fileLocations) > 1:
561 disassembled = True
562 else:
563 disassembled = False
565 # Is this a component request?
566 refComponent = ref.datasetType.component()
568 fileGetInfo = []
569 for location, storedFileInfo in fileLocations:
571 # The storage class used to write the file
572 writeStorageClass = storedFileInfo.storageClass
574 # If this has been disassembled we need read to match the write
575 if disassembled:
576 readStorageClass = writeStorageClass
577 else:
578 readStorageClass = refStorageClass
580 formatter = getInstanceOf(storedFileInfo.formatter,
581 FileDescriptor(location, readStorageClass=readStorageClass,
582 storageClass=writeStorageClass, parameters=parameters),
583 ref.dataId)
585 formatterParams, notFormatterParams = formatter.segregateParameters()
587 # Of the remaining parameters, extract the ones supported by
588 # this StorageClass (for components not all will be handled)
589 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
591 # The ref itself could be a component if the dataset was
592 # disassembled by butler, or we disassembled in datastore and
593 # components came from the datastore records
594 component = storedFileInfo.component if storedFileInfo.component else refComponent
596 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
597 assemblerParams, formatterParams,
598 component, readStorageClass))
600 return fileGetInfo
602 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
603 """Check the arguments for ``put`` and obtain formatter and
604 location.
606 Parameters
607 ----------
608 inMemoryDataset : `object`
609 The dataset to store.
610 ref : `DatasetRef`
611 Reference to the associated Dataset.
613 Returns
614 -------
615 location : `Location`
616 The location to write the dataset.
617 formatter : `Formatter`
618 The `Formatter` to use to write the dataset.
620 Raises
621 ------
622 TypeError
623 Supplied object and storage class are inconsistent.
624 DatasetTypeNotSupportedError
625 The associated `DatasetType` is not handled by this datastore.
626 """
627 self._validate_put_parameters(inMemoryDataset, ref)
628 return self._determine_put_formatter_location(ref)
630 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
631 """Calculate the formatter and output location to use for put.
633 Parameters
634 ----------
635 ref : `DatasetRef`
636 Reference to the associated Dataset.
638 Returns
639 -------
640 location : `Location`
641 The location to write the dataset.
642 formatter : `Formatter`
643 The `Formatter` to use to write the dataset.
644 """
645 # Work out output file name
646 try:
647 template = self.templates.getTemplate(ref)
648 except KeyError as e:
649 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
651 # Validate the template to protect against filenames from different
652 # dataIds returning the same and causing overwrite confusion.
653 template.validateTemplate(ref)
655 location = self.locationFactory.fromPath(template.format(ref))
657 # Get the formatter based on the storage class
658 storageClass = ref.datasetType.storageClass
659 try:
660 formatter = self.formatterFactory.getFormatter(ref,
661 FileDescriptor(location,
662 storageClass=storageClass),
663 ref.dataId)
664 except KeyError as e:
665 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
666 f"{self.name}") from e
668 # Now that we know the formatter, update the location
669 location = formatter.makeUpdatedLocation(location)
671 return location, formatter
673 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
674 # Docstring inherited from base class
675 if transfer != "auto":
676 return transfer
678 # See if the paths are within the datastore or not
679 inside = [self._pathInStore(d.path) is not None for d in datasets]
681 if all(inside):
682 transfer = None
683 elif not any(inside):  683 ↛ 687 (condition never false)
684 # Allow ButlerURI to use its own knowledge
685 transfer = "auto"
686 else:
687 raise ValueError("Some datasets are inside the datastore and some are outside."
688 " Please use an explicit transfer mode and not 'auto'.")
690 return transfer
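# Editor's usage sketch for the "auto" resolution above (datasets and refs are
# hypothetical):
#
#     datasets = (FileDataset(path="relative/inside.fits", refs=[ref1]),
#                 FileDataset(path="/elsewhere/outside.fits", refs=[ref2]))
#     datastore._overrideTransferMode(*datasets, transfer="auto")
#     # -> ValueError: paths are mixed inside/outside the datastore root.
#
# With only in-store paths the result is None (files are referenced in place);
# with only external paths "auto" is kept and ButlerURI picks the mechanism.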
692 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
693 """Return path relative to datastore root
695 Parameters
696 ----------
697 path : `str` or `ButlerURI`
698 Path to dataset. Can be an absolute URI. If relative, it is
699 assumed to be relative to the datastore root. The returned
700 value is `None` if the path is outside the datastore.
702 Returns
703 -------
704 inStore : `str` or `None`
705 Path relative to datastore root. Returns `None` if the file is
706 outside the root.
707 """
708 # Relative path will always be relative to datastore
709 pathUri = ButlerURI(path, forceAbsolute=False)
710 return pathUri.relative_to(self.root)
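# Editor's sketch of the relative_to() behaviour relied on above (the root and
# paths are assumptions):
#
#     root = ButlerURI("file:///repo/butler/", forceDirectory=True)
#     ButlerURI("file:///repo/butler/a/b.fits").relative_to(root)  # "a/b.fits"
#     ButlerURI("/scratch/elsewhere.fits").relative_to(root)       # None
#
# A None return is how callers detect that a file lives outside the datastore.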
712 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
713 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
714 """Standardize the path of a to-be-ingested file.
716 Parameters
717 ----------
718 path : `str` or `ButlerURI`
719 Path of a file to be ingested.
720 transfer : `str`, optional
721 How (and whether) the dataset should be added to the datastore.
722 See `ingest` for details of transfer modes.
723 This implementation is provided only so
724 `NotImplementedError` can be raised if the mode is not supported;
725 actual transfers are deferred to `_extractIngestInfo`.
727 Returns
728 -------
729 path : `str` or `ButlerURI`
730 New path in what the datastore considers standard form. If an
731 absolute URI was given that will be returned unchanged.
733 Notes
734 -----
735 Subclasses of `FileDatastore` can implement this method instead
736 of `_prepIngest`. It should not modify the data repository or given
737 file in any way.
739 Raises
740 ------
741 NotImplementedError
742 Raised if the datastore does not support the given transfer mode
743 (including the case where ingest is not supported at all).
744 FileNotFoundError
745 Raised if one of the given files does not exist.
746 """
747 if transfer not in (None, "direct") + self.root.transferModes:  747 ↛ 748 (condition never true)
748 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
750 # A relative URI indicates relative to datastore root
751 srcUri = ButlerURI(path, forceAbsolute=False)
752 if not srcUri.isabs():
753 srcUri = self.root.join(path)
755 if not srcUri.exists():
756 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
757 f"are assumed to be relative to {self.root} unless they are absolute.")
759 if transfer is None:
760 relpath = srcUri.relative_to(self.root)
761 if not relpath:
762 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
763 f"within datastore ({self.root})")
765 # Return the relative path within the datastore for internal
766 # transfer
767 path = relpath
769 return path
771 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
772 formatter: Union[Formatter, Type[Formatter]],
773 transfer: Optional[str] = None) -> StoredFileInfo:
774 """Relocate (if necessary) and extract `StoredFileInfo` from a
775 to-be-ingested file.
777 Parameters
778 ----------
779 path : `str` or `ButlerURI`
780 URI or path of a file to be ingested.
781 ref : `DatasetRef`
782 Reference for the dataset being ingested. Guaranteed to have
783 a ``dataset_id`` that is not `None`.
784 formatter : `type` or `Formatter`
785 `Formatter` subclass to use for this dataset or an instance.
786 transfer : `str`, optional
787 How (and whether) the dataset should be added to the datastore.
788 See `ingest` for details of transfer modes.
790 Returns
791 -------
792 info : `StoredFileInfo`
793 Internal datastore record for this file. This will be inserted by
794 the caller; `_extractIngestInfo` is only responsible for
795 creating and populating the struct.
797 Raises
798 ------
799 FileNotFoundError
800 Raised if one of the given files does not exist.
801 FileExistsError
802 Raised if transfer is not `None` but the (internal) location the
803 file would be moved to is already occupied.
804 """
805 if self._transaction is None:  805 ↛ 806 (condition never true)
806 raise RuntimeError("Ingest called without transaction enabled")
808 # Create URI of the source path, do not need to force a relative
809 # path to absolute.
810 srcUri = ButlerURI(path, forceAbsolute=False)
812 # Track whether we have read the size of the source yet
813 have_sized = False
815 tgtLocation: Optional[Location]
816 if transfer is None:
817 # A relative path is assumed to be relative to the datastore
818 # in this context
819 if not srcUri.isabs():
820 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
821 else:
822 # Work out the path in the datastore from an absolute URI
823 # This is required to be within the datastore.
824 pathInStore = srcUri.relative_to(self.root)
825 if pathInStore is None:  825 ↛ 826 (condition never true)
826 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
827 f"not within datastore {self.root}")
828 tgtLocation = self.locationFactory.fromPath(pathInStore)
829 elif transfer == "direct":  829 ↛ 834 (condition never true)
830 # Want to store the full URI to the resource directly in
831 # datastore. This is useful for referring to permanent archive
832 # storage for raw data.
833 # Trust that people know what they are doing.
834 tgtLocation = None
835 else:
836 # Work out the name we want this ingested file to have
837 # inside the datastore
838 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
839 if not tgtLocation.uri.dirname().exists():
840 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
841 tgtLocation.uri.dirname().mkdir()
843 # if we are transferring from a local file to a remote location
844 # it may be more efficient to get the size and checksum of the
845 # local file rather than the transferred one
846 if not srcUri.scheme or srcUri.scheme == "file":  846 ↛ 852 (condition never false)
847 size = srcUri.size()
848 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
849 have_sized = True
851 # transfer the resource to the destination
852 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
854 if tgtLocation is None:  854 ↛ 856 (condition never true)
855 # This means we are using direct mode
856 targetUri = srcUri
857 targetPath = str(srcUri)
858 else:
859 targetUri = tgtLocation.uri
860 targetPath = tgtLocation.pathInStore.path
862 # the file should exist in the datastore now
863 if not have_sized:
864 size = targetUri.size()
865 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
867 return StoredFileInfo(formatter=formatter, path=targetPath,
868 storageClass=ref.datasetType.storageClass,
869 component=ref.datasetType.component(),
870 file_size=size, checksum=checksum)
872 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
873 # Docstring inherited from Datastore._prepIngest.
874 filtered = []
875 for dataset in datasets:
876 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
877 if not acceptable:
878 continue
879 else:
880 dataset.refs = acceptable
881 if dataset.formatter is None:
882 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
883 else:
884 assert isinstance(dataset.formatter, (type, str))
885 dataset.formatter = getClassOf(dataset.formatter)
886 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
887 filtered.append(dataset)
888 return _IngestPrepData(filtered)
890 @transactional
891 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
892 # Docstring inherited from Datastore._finishIngest.
893 refsAndInfos = []
894 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
895 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
896 # Do ingest as if the first dataset ref is associated with the file
897 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
898 transfer=transfer)
899 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
900 self._register_datasets(refsAndInfos)
902 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
903 formatter: Union[Formatter, Type[Formatter]]) -> Location:
904 """Given a source URI and a DatasetRef, determine the name the
905 dataset will have inside datastore.
907 Parameters
908 ----------
909 srcUri : `ButlerURI`
910 URI to the source dataset file.
911 ref : `DatasetRef`
912 Ref associated with the newly-ingested dataset artifact. This
913 is used to determine the name within the datastore.
914 formatter : `Formatter` or Formatter class.
915 Formatter to use for validation. Can be a class or an instance.
917 Returns
918 -------
919 location : `Location`
920 Target location for the newly-ingested dataset.
921 """
922 # Ingesting a file from outside the datastore.
923 # This involves a new name.
924 template = self.templates.getTemplate(ref)
925 location = self.locationFactory.fromPath(template.format(ref))
927 # Get the extension
928 ext = srcUri.getExtension()
930 # Update the destination to include that extension
931 location.updateExtension(ext)
933 # Ask the formatter to validate this extension
934 formatter.validateExtension(location)
936 return location
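# Editor's note (assumed example): if the file template expands to
# "raw/r/exposure_00123" and the source file is "/data/incoming/exp123.fits",
# the ingested artifact is named "raw/r/exposure_00123.fits" inside the
# datastore root, and the formatter is asked to confirm it accepts ".fits".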
938 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
939 """Write out in memory dataset to datastore.
941 Parameters
942 ----------
943 inMemoryDataset : `object`
944 Dataset to write to datastore.
945 ref : `DatasetRef`
946 Registry information associated with this dataset.
948 Returns
949 -------
950 info : `StoredFileInfo`
951 Information describing the artifact written to the datastore.
952 """
953 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
954 uri = location.uri
956 if not uri.dirname().exists():
957 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
958 uri.dirname().mkdir()
960 if self._transaction is None:  960 ↛ 961 (condition never true)
961 raise RuntimeError("Attempting to write artifact without transaction enabled")
963 def _removeFileExists(uri: ButlerURI) -> None:
964 """Remove a file and do not complain if it is not there.
966 This is important since a formatter might fail before the file
967 is written and we should not confuse people by writing spurious
968 error messages to the log.
969 """
970 try:
971 uri.remove()
972 except FileNotFoundError:
973 pass
975 # Register a callback to try to delete the uploaded data if
976 # something fails below
977 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
979 # For a local file, simply use the formatter directly
980 if uri.isLocal:
981 try:
982 formatter.write(inMemoryDataset)
983 except Exception as e:
984 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} "
985 f"to location {uri}") from e
986 log.debug("Successfully wrote python object to local file at %s", uri)
987 else:
988 # This is a remote URI, so first try bytes and write directly else
989 # fallback to a temporary file
990 try:
991 serializedDataset = formatter.toBytes(inMemoryDataset)
992 except NotImplementedError:  992 ↛ 1011 (branch never taken)
993 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
994 # Need to configure the formatter to write to a different
995 # location and that needs us to overwrite internals
996 tmpLocation = Location(*os.path.split(tmpFile.name))
997 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
998 with formatter._updateLocation(tmpLocation):
999 try:
1000 formatter.write(inMemoryDataset)
1001 except Exception as e:
1002 raise RuntimeError(f"Failed to serialize dataset {ref} of type"
1003 f" {type(inMemoryDataset)} to "
1004 f"temporary location {tmpLocation.uri}") from e
1005 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
1007 # Cache if required
1008 self.cacheManager.move_to_cache(tmpLocation.uri, ref)
1010 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1011 except Exception as e:
1012 raise RuntimeError(f"Failed to serialize dataset {ref} to bytes.") from e
1013 else:
1014 log.debug("Writing bytes directly to %s", uri)
1015 uri.write(serializedDataset, overwrite=True)
1016 log.debug("Successfully wrote bytes directly to %s", uri)
1018 # URI is needed to resolve what ingest case are we dealing with
1019 return self._extractIngestInfo(uri, ref, formatter=formatter)
1021 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1022 ref: DatasetRef, isComponent: bool = False) -> Any:
1023 """Read the artifact from datastore into in memory object.
1025 Parameters
1026 ----------
1027 getInfo : `DatastoreFileGetInformation`
1028 Information about the artifact within the datastore.
1029 ref : `DatasetRef`
1030 The registry information associated with this artifact.
1031 isComponent : `bool`
1032 Flag to indicate if a component is being read from this artifact.
1034 Returns
1035 -------
1036 inMemoryDataset : `object`
1037 The artifact as a python object.
1038 """
1039 location = getInfo.location
1040 uri = location.uri
1041 log.debug("Accessing data from %s", uri)
1043 # Cannot recalculate checksum but can compare size as a quick check
1044 # Do not do this if the size is negative since that indicates
1045 # we do not know.
1046 recorded_size = getInfo.info.file_size
1047 resource_size = uri.size()
1048 if recorded_size >= 0 and resource_size != recorded_size:  1048 ↛ 1049 (condition never true)
1049 raise RuntimeError("Integrity failure in Datastore. "
1050 f"Size of file {uri} ({resource_size}) "
1051 f"does not match size recorded in registry of {recorded_size}")
1053 # For the general case we have choices for how to proceed.
1054 # 1. Always use a local file (downloading the remote resource to a
1055 # temporary file if needed).
1056 # 2. Use a threshold size and read into memory and use bytes.
1057 # Use both for now with an arbitrary hand-off size.
1058 # This allows small datasets to be downloaded from remote object
1059 # stores without requiring a temporary file.
1061 formatter = getInfo.formatter
1062 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1063 if resource_size <= nbytes_max and formatter.can_read_bytes():
1064 serializedDataset = uri.read()
1065 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1066 f"component {getInfo.component}" if isComponent else "",
1067 len(serializedDataset), uri, formatter.name())
1068 try:
1069 result = formatter.fromBytes(serializedDataset,
1070 component=getInfo.component if isComponent else None)
1071 except Exception as e:
1072 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1073 f" ({ref.datasetType.name} from {uri}): {e}") from e
1074 else:
1075 # Read from file.
1077 # Have to update the Location associated with the formatter
1078 # because formatter.read does not allow an override.
1079 # This could be improved.
1080 location_updated = False
1081 msg = ""
1083 # First check in cache for local version.
1084 # The cache will only be relevant for remote resources.
1085 if not uri.isLocal:
1086 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension())
1087 if cached_file is not None:  1087 ↛ 1088 (condition never true)
1088 msg = f"(via cache read of remote file {uri})"
1089 uri = cached_file
1090 location_updated = True
1092 with uri.as_local() as local_uri:
1094 # URI was remote and file was downloaded
1095 if uri != local_uri:
1096 cache_msg = ""
1097 location_updated = True
1099 # Cache the downloaded file if needed.
1100 cached_uri = self.cacheManager.move_to_cache(local_uri, ref)
1101 if cached_uri is not None:  1101 ↛ 1102 (condition never true)
1102 local_uri = cached_uri
1103 cache_msg = " and cached"
1105 msg = f"(via download to local file{cache_msg})"
1107 # Calculate the (possibly) new location for the formatter
1108 # to use.
1109 newLocation = Location(*local_uri.split()) if location_updated else None
1111 log.debug("Reading%s from location %s %s with formatter %s",
1112 f" component {getInfo.component}" if isComponent else "",
1113 uri, msg, formatter.name())
1114 try:
1115 with formatter._updateLocation(newLocation):
1116 result = formatter.read(component=getInfo.component if isComponent else None)
1117 except Exception as e:
1118 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1119 f" ({ref.datasetType.name} from {uri}): {e}") from e
1121 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1122 isComponent=isComponent)
1124 def exists(self, ref: DatasetRef) -> bool:
1125 """Check if the dataset exists in the datastore.
1127 Parameters
1128 ----------
1129 ref : `DatasetRef`
1130 Reference to the required dataset.
1132 Returns
1133 -------
1134 exists : `bool`
1135 `True` if the entity exists in the `Datastore`.
1136 """
1137 fileLocations = self._get_dataset_locations_info(ref)
1139 # if we are being asked to trust that registry might not be correct
1140 # we ask for the expected locations and check them explicitly
1141 if not fileLocations:
1142 if not self.trustGetRequest:
1143 return False
1144 fileLocations = self._get_expected_dataset_locations_info(ref)
1145 for location, _ in fileLocations:
1146 if not self._artifact_exists(location):
1147 return False
1149 return True
1151 def getURIs(self, ref: DatasetRef,
1152 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1153 """Return URIs associated with dataset.
1155 Parameters
1156 ----------
1157 ref : `DatasetRef`
1158 Reference to the required dataset.
1159 predict : `bool`, optional
1160 If the datastore does not know about the dataset, should it
1161 return a predicted URI or not?
1163 Returns
1164 -------
1165 primary : `ButlerURI`
1166 The URI to the primary artifact associated with this dataset.
1167 If the dataset was disassembled within the datastore this
1168 may be `None`.
1169 components : `dict`
1170 URIs to any components associated with the dataset artifact.
1171 Can be empty if there are no components.
1172 """
1174 primary: Optional[ButlerURI] = None
1175 components: Dict[str, ButlerURI] = {}
1177 # if this has never been written then we have to guess
1178 if not self.exists(ref):
1179 if not predict:
1180 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1182 doDisassembly = self.composites.shouldBeDisassembled(ref)
1184 if doDisassembly:
1186 for component, componentStorage in ref.datasetType.storageClass.components.items():
1187 compRef = ref.makeComponentRef(component)
1188 compLocation, _ = self._determine_put_formatter_location(compRef)
1190 # Add a URI fragment to indicate this is a guess
1191 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1193 else:
1195 location, _ = self._determine_put_formatter_location(ref)
1197 # Add a URI fragment to indicate this is a guess
1198 primary = ButlerURI(location.uri.geturl() + "#predicted")
1200 return primary, components
1202 # If this is a ref that we have written we can get the path.
1203 # Get file metadata and internal metadata
1204 fileLocations = self._get_dataset_locations_info(ref)
1206 guessing = False
1207 if not fileLocations:
1208 if not self.trustGetRequest:  1208 ↛ 1209 (condition never true)
1209 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1210 fileLocations = self._get_expected_dataset_locations_info(ref)
1211 guessing = True
1213 if len(fileLocations) == 1:
1214 # No disassembly so this is the primary URI
1215 uri = fileLocations[0][0].uri
1216 if guessing and not uri.exists():  1216 ↛ 1217 (condition never true)
1217 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1218 primary = uri
1220 else:
1221 for location, storedFileInfo in fileLocations:
1222 if storedFileInfo.component is None:  1222 ↛ 1223 (condition never true)
1223 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1224 uri = location.uri
1225 if guessing and not uri.exists():  1225 ↛ 1226 (condition never true)
1226 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1227 components[storedFileInfo.component] = uri
1229 return primary, components
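# Editor's sketch of the return-value shapes (URIs and component names are
# assumptions):
#
#     primary, components = datastore.getURIs(ref)
#     # single-file dataset:  primary = <uri to the artifact>, components = {}
#     # disassembled dataset: primary = None,
#     #                       components = {"image": <uri>, "mask": <uri>, ...}
#
# With predict=True for a dataset that has not been written, each returned URI
# carries a "#predicted" fragment.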
1231 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1232 """URI to the Dataset.
1234 Parameters
1235 ----------
1236 ref : `DatasetRef`
1237 Reference to the required Dataset.
1238 predict : `bool`
1239 If `True`, allow URIs to be returned of datasets that have not
1240 been written.
1242 Returns
1243 -------
1244 uri : `ButlerURI`
1245 URI pointing to the dataset within the datastore. If the
1246 dataset does not exist in the datastore, and if ``predict`` is
1247 `True`, the URI will be a prediction and will include a URI
1248 fragment "#predicted".
1249 If the datastore does not have entities that relate well
1250 to the concept of a URI the returned URI will be
1251 descriptive. The returned URI is not guaranteed to be obtainable.
1253 Raises
1254 ------
1255 FileNotFoundError
1256 Raised if a URI has been requested for a dataset that does not
1257 exist and guessing is not allowed.
1258 RuntimeError
1259 Raised if a request is made for a single URI but multiple URIs
1260 are associated with this dataset.
1262 Notes
1263 -----
1264 When a predicted URI is requested an attempt will be made to form
1265 a reasonable URI based on file templates and the expected formatter.
1266 """
1267 primary, components = self.getURIs(ref, predict)
1268 if primary is None or components:  1268 ↛ 1269 (condition never true)
1269 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1270 "Use Dataastore.getURIs() instead.")
1271 return primary
1273 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1274 destination: ButlerURI, transfer: str = "auto",
1275 preserve_path: bool = True,
1276 overwrite: bool = False) -> List[ButlerURI]:
1277 """Retrieve the file artifacts associated with the supplied refs.
1279 Parameters
1280 ----------
1281 refs : iterable of `DatasetRef`
1282 The datasets for which file artifacts are to be retrieved.
1283 A single ref can result in multiple files. The refs must
1284 be resolved.
1285 destination : `ButlerURI`
1286 Location to write the file artifacts.
1287 transfer : `str`, optional
1288 Method to use to transfer the artifacts. Must be one of the options
1289 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1290 preserve_path : `bool`, optional
1291 If `True` the full path of the file artifact within the datastore
1292 is preserved. If `False` the final file component of the path
1293 is used.
1294 overwrite : `bool`, optional
1295 If `True` allow transfers to overwrite existing files at the
1296 destination.
1298 Returns
1299 -------
1300 targets : `list` of `ButlerURI`
1301 URIs of file artifacts in destination location. Order is not
1302 preserved.
1303 """
1304 if not destination.isdir():  1304 ↛ 1305 (condition never true)
1305 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1307 if transfer == "move":
1308 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1310 # Source -> Destination
1311 # This also helps filter out duplicate DatasetRef in the request
1312 # that will map to the same underlying file transfer.
1313 to_transfer: Dict[ButlerURI, ButlerURI] = {}
1315 for ref in refs:
1316 locations = self._get_dataset_locations_info(ref)
1317 for location, _ in locations:
1318 source_uri = location.uri
1319 target_path: Union[str, ButlerURI]
1320 if preserve_path:
1321 target_path = location.pathInStore
1322 if target_path.isabs():  1322 ↛ 1325 (condition never true)
1323 # This is an absolute path to an external file.
1324 # Use the full path.
1325 target_path = target_path.relativeToPathRoot
1326 else:
1327 target_path = source_uri.basename()
1328 target_uri = destination.join(target_path)
1329 to_transfer[source_uri] = target_uri
1331 # In theory can now parallelize the transfer
1332 log.debug("Number of artifacts to transfer to %s: %d",
1333 str(destination), len(to_transfer))
1334 for source_uri, target_uri in to_transfer.items():
1335 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1337 return list(to_transfer.values())
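# Editor's usage sketch (destination and refs are hypothetical):
#
#     destination = ButlerURI("/tmp/export/", forceDirectory=True)
#     copied = datastore.retrieveArtifacts(refs, destination,
#                                          transfer="copy", preserve_path=True)
#
# Each returned ButlerURI points at a copy under /tmp/export/ whose relative
# path mirrors the artifact's path inside the datastore.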
1339 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1340 """Load an InMemoryDataset from the store.
1342 Parameters
1343 ----------
1344 ref : `DatasetRef`
1345 Reference to the required Dataset.
1346 parameters : `dict`
1347 `StorageClass`-specific parameters that specify, for example,
1348 a slice of the dataset to be loaded.
1350 Returns
1351 -------
1352 inMemoryDataset : `object`
1353 Requested dataset or slice thereof as an InMemoryDataset.
1355 Raises
1356 ------
1357 FileNotFoundError
1358 Requested dataset can not be retrieved.
1359 TypeError
1360 Return value from formatter has unexpected type.
1361 ValueError
1362 Formatter failed to process the dataset.
1363 """
1364 allGetInfo = self._prepare_for_get(ref, parameters)
1365 refComponent = ref.datasetType.component()
1367 # Supplied storage class for the component being read
1368 refStorageClass = ref.datasetType.storageClass
1370 # Create mapping from component name to related info
1371 allComponents = {i.component: i for i in allGetInfo}
1373 # By definition the dataset is disassembled if we have more
1374 # than one record for it.
1375 isDisassembled = len(allGetInfo) > 1
1377 # Look for the special case where we are disassembled but the
1378 # component is a derived component that was not written during
1379 # disassembly. For this scenario we need to check that the
1380 # component requested is listed as a derived component for the
1381 # composite storage class
1382 isDisassembledReadOnlyComponent = False
1383 if isDisassembled and refComponent:
1384 # The composite storage class should be accessible through
1385 # the component dataset type
1386 compositeStorageClass = ref.datasetType.parentStorageClass
1388 # In the unlikely scenario where the composite storage
1389 # class is not known, we can only assume that this is a
1390 # normal component. If that assumption is wrong then the
1391 # branch below that reads a persisted component will fail
1392 # so there is no need to complain here.
1393 if compositeStorageClass is not None:  1393 ↛ 1396 (condition never false)
1394 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1396 if isDisassembled and not refComponent:
1397 # This was a disassembled dataset spread over multiple files
1398 # and we need to put them all back together again.
1399 # Read into memory and then assemble
1401 # Check that the supplied parameters are suitable for the type read
1402 refStorageClass.validateParameters(parameters)
1404 # We want to keep track of all the parameters that were not used
1405 # by formatters. We assume that if any of the component formatters
1406 # use a parameter that we do not need to apply it again in the
1407 # assembler.
1408 usedParams = set()
1410 components: Dict[str, Any] = {}
1411 for getInfo in allGetInfo:
1412 # assemblerParams are parameters not understood by the
1413 # associated formatter.
1414 usedParams.update(set(getInfo.formatterParams))
1416 component = getInfo.component
1418 if component is None:  1418 ↛ 1419 (condition never true)
1419 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1421 # We do not want the formatter to think it's reading
1422 # a component though because it is really reading a
1423 # standalone dataset -- always tell reader it is not a
1424 # component.
1425 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1427 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1429 # Any unused parameters will have to be passed to the assembler
1430 if parameters:
1431 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1432 else:
1433 unusedParams = {}
1435 # Process parameters
1436 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1437 parameters=unusedParams)
1439 elif isDisassembledReadOnlyComponent:
1441 compositeStorageClass = ref.datasetType.parentStorageClass
1442 if compositeStorageClass is None:  1442 ↛ 1443 (condition never true)
1443 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1444 "no composite storage class is available.")
1446 if refComponent is None:  1446 ↛ 1448 (condition never true)
1447 # Mainly for mypy
1448 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1450 # Assume that every derived component can be calculated by
1451 # forwarding the request to a single read/write component.
1452 # Rather than guessing which rw component is the right one by
1453 # scanning each for a derived component of the same name,
1454 # we ask the storage class delegate directly which one is best to
1455 # use.
1456 compositeDelegate = compositeStorageClass.delegate()
1457 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1458 set(allComponents))
1460 # Select the relevant component
1461 rwInfo = allComponents[forwardedComponent]
1463 # For now assume that read parameters are validated against
1464 # the real component and not the requested component
1465 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1466 forwardedStorageClass.validateParameters(parameters)
1468 # Unfortunately the FileDescriptor inside the formatter will have
1469 # the wrong write storage class so we need to create a new one
1470 # given the immutability constraint.
1471 writeStorageClass = rwInfo.info.storageClass
1473 # We may need to put some thought into parameters for read
1474 # components but for now forward them on as is
1475 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1476 readStorageClass=refStorageClass,
1477 storageClass=writeStorageClass,
1478 parameters=parameters),
1479 ref.dataId)
1481 # The assembler can not receive any parameter requests for a
1482 # derived component at this time since the assembler will
1483 # see the storage class of the derived component and those
1484 # parameters will have to be handled by the formatter on the
1485 # forwarded storage class.
1486 assemblerParams: Dict[str, Any] = {}
1488 # Need to created a new info that specifies the derived
1489 # component and associated storage class
1490 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1491 rwInfo.info, assemblerParams, {},
1492 refComponent, refStorageClass)
1494 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1496 else:
1497 # Single file request or component from that composite file
1498 for lookup in (refComponent, None):  1498 ↛ 1503 (loop never ran to completion)
1499 if lookup in allComponents:  1499 ↛ 1498 (condition never false)
1500 getInfo = allComponents[lookup]
1501 break
1502 else:
1503 raise FileNotFoundError(f"Component {refComponent} not found "
1504 f"for ref {ref} in datastore {self.name}")
1506 # Do not need the component itself if already disassembled
1507 if isDisassembled:
1508 isComponent = False
1509 else:
1510 isComponent = getInfo.component is not None
1512 # For a disassembled component we can validate parameters against
1513 # the component storage class directly
1514 if isDisassembled:
1515 refStorageClass.validateParameters(parameters)
1516 else:
1517 # For an assembled composite this could be a derived
1518 # component computed from a real component. Whether the
1519 # parameters are valid for it is unclear, so for now
1520 # validate against the composite storage class.
1521 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1523 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
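The branch above resolves which stored artifact to serve by trying the requested component first and then falling back to the composite entry keyed by `None`. A minimal standalone sketch of that fallback, using a plain dict in place of the real lookup table (the helper name is hypothetical, not part of `FileDatastore`):

    from typing import Dict, Optional

    def select_stored_entry(allComponents: Dict[Optional[str], str],
                            refComponent: Optional[str]) -> str:
        # Prefer an artifact stored for the requested component; otherwise
        # fall back to the composite artifact stored under the None key.
        for lookup in (refComponent, None):
            if lookup in allComponents:
                return allComponents[lookup]
        raise FileNotFoundError(f"Component {refComponent} not found")

    # A composite stored as a single file (key None) still serves a
    # request for its "wcs" component.
    assert select_stored_entry({None: "exposure.fits"}, "wcs") == "exposure.fits"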
1525 @transactional
1526 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1527 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1529 Parameters
1530 ----------
1531 inMemoryDataset : `object`
1532 The dataset to store.
1533 ref : `DatasetRef`
1534 Reference to the associated Dataset.
1536 Raises
1537 ------
1538 TypeError
1539 Supplied object and storage class are inconsistent.
1540 DatasetTypeNotSupportedError
1541 The associated `DatasetType` is not handled by this datastore.
1543 Notes
1544 -----
1545 If the datastore is configured to reject certain dataset types it
1546 is possible that the put will fail and raise a
1547 `DatasetTypeNotSupportedError`. The main use case for this is to
1548 allow `ChainedDatastore` to put to multiple datastores without
1549 requiring that every datastore accepts the dataset.
1550 """
1552 doDisassembly = self.composites.shouldBeDisassembled(ref)
1553 # doDisassembly = True
1555 artifacts = []
1556 if doDisassembly:
1557 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1558 for component, componentInfo in components.items():
1559 # Don't recurse because we want to take advantage of
1560 # bulk insert -- we need a new DatasetRef that refers to the
1561 # same dataset_id but has the component DatasetType.
1562 # The parent DatasetType does not know the types of its
1563 # components, so we construct the component ref ourselves.
1564 compRef = ref.makeComponentRef(component)
1565 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1566 artifacts.append((compRef, storedInfo))
1567 else:
1568 # Write the entire thing out
1569 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1570 artifacts.append((ref, storedInfo))
1572 self._register_datasets(artifacts)
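A simplified, self-contained sketch of the branching in `put`, with plain Python objects standing in for `DatasetRef`, the storage class delegate, and the stored artifacts (all names below are hypothetical stand-ins, not the real API):

    from typing import Any, Dict, List, Tuple

    def plan_put(in_memory_dataset: Any, disassemble: bool) -> List[Tuple[str, Any]]:
        # Either write each component as its own artifact (disassembly)
        # or write the composite as a single artifact.
        artifacts: List[Tuple[str, Any]] = []
        if disassemble:
            # Stand-in for storageClass.delegate().disassemble(); assume
            # the dataset is already a mapping of component payloads.
            components: Dict[str, Any] = dict(in_memory_dataset)
            for component, payload in components.items():
                # Stand-in for ref.makeComponentRef(component).
                artifacts.append((f"parent.{component}", payload))
        else:
            artifacts.append(("parent", in_memory_dataset))
        return artifacts

    # Disassembled put produces one artifact per component.
    assert plan_put({"image": b"i", "mask": b"m"}, disassemble=True) == [
        ("parent.image", b"i"), ("parent.mask", b"m"),
    ]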
1574 @transactional
1575 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1576 """Indicate to the datastore that a dataset can be removed.
1578 Parameters
1579 ----------
1580 ref : `DatasetRef`
1581 Reference to the required Dataset.
1582 ignore_errors : `bool`
1583 If `True` return without error even if something went wrong.
1584 Problems could occur if another process is simultaneously trying
1585 to delete.
1587 Raises
1588 ------
1589 FileNotFoundError
1590 Attempt to remove a dataset that does not exist.
1591 """
1592 # Get file metadata and internal metadata
1593 log.debug("Trashing %s in datastore %s", ref, self.name)
1595 fileLocations = self._get_dataset_locations_info(ref)
1597 if not fileLocations:
1598 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1599 if ignore_errors:
1600 log.warning(err_msg)
1601 return
1602 else:
1603 raise FileNotFoundError(err_msg)
1605 for location, storedFileInfo in fileLocations:
1606 if not self._artifact_exists(location): 1606 ↛ 1607line 1606 didn't jump to line 1607, because the condition on line 1606 was never true
1607 err_msg = f"Dataset is known to datastore {self.name} but " \
1608 f"associated artifact ({location.uri}) is missing"
1609 if ignore_errors:
1610 log.warning(err_msg)
1611 return
1612 else:
1613 raise FileNotFoundError(err_msg)
1615 # Mark dataset as trashed
1616 try:
1617 self._move_to_trash_in_registry(ref)
1618 except Exception as e:
1619 if ignore_errors:
1620 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1621 f"but encountered an error: {e}")
1622 pass
1623 else:
1624 raise
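Both `trash` and `emptyTrash` follow the same warn-or-raise convention for `ignore_errors`; a tiny illustration of that pattern (hypothetical helper, not part of the class):

    import logging

    def warn_or_raise(msg: str, ignore_errors: bool,
                      exc_type: type = FileNotFoundError) -> None:
        # Log and carry on when errors are being ignored, otherwise raise.
        if ignore_errors:
            logging.warning(msg)
        else:
            raise exc_type(msg)

    warn_or_raise("dataset not known to datastore", ignore_errors=True)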
1626 @transactional
1627 def emptyTrash(self, ignore_errors: bool = True) -> None:
1628 """Remove all datasets from the trash.
1630 Parameters
1631 ----------
1632 ignore_errors : `bool`
1633 If `True` return without error even if something went wrong.
1634 Problems could occur if another process is simultaneously trying
1635 to delete.
1636 """
1637 log.debug("Emptying trash in datastore %s", self.name)
1639 # Context manager will empty trash iff we finish it without raising.
1640 # It will also automatically delete the relevant rows from the
1641 # trash table and the records table.
1642 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo,
1643 record_column="path") as trash_data:
1644 # Removing the artifacts themselves requires that the files are
1645 # not also associated with refs that are not to be trashed.
1646 # Therefore need to do a query with the file paths themselves
1647 # and return all the refs associated with them. Can only delete
1648 # a file if the refs to be trashed are the only refs associated
1649 # with the file.
1650 # This requires iterating over the trashed items more than once.
1651 trashed, artifacts_to_keep = trash_data
1653 if artifacts_to_keep is None:
1654 # The bridge is not helping us so have to work it out
1655 # ourselves. This is not going to be as efficient.
1656 trashed = list(trashed)
1658 # The instance check is for mypy since up to this point it
1659 # does not know the type of info.
1660 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed
1661 if isinstance(info, StoredFileInfo)])
1663 for ref, info in trashed:
1665 # Mypy needs to know this is not the base class
1666 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
1668 # Check for mypy
1669 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
1671 path_map[info.path].remove(ref.id)
1672 if not path_map[info.path]: 1672 ↛ 1663line 1672 didn't jump to line 1663, because the condition on line 1672 was never false
1673 del path_map[info.path]
1675 artifacts_to_keep = set(path_map)
1677 for ref, info in trashed:
1679 # Should not happen for this implementation but need
1680 # to keep mypy happy.
1681 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
1683 # Mypy needs to know this is not the base class
1684 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
1686 # Check for mypy
1687 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
1689 if info.path in artifacts_to_keep:
1690 # This is a multi-dataset artifact and we are not
1691 # removing all associated refs.
1692 continue
1694 # Only trashed refs still known to datastore will be returned.
1695 location = info.file_location(self.locationFactory)
1697 # Point of no return for this artifact
1698 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1699 try:
1700 self._delete_artifact(location)
1701 except FileNotFoundError:
1702 # If the file itself has been deleted there is nothing
1703 # we can do about it. It is possible that trash has
1704 # been run in parallel in another process or someone
1705 # decided to delete the file. It is unlikely to come
1706 # back and so we should still continue with the removal
1707 # of the entry from the trash table. It is also possible
1708 # we removed it in a previous iteration if it was
1709 # a multi-dataset artifact. The delete artifact method
1710 # will log a debug message in this scenario.
1711 # Distinguishing a file that was missing before trash
1712 # started from one already removed earlier in this pass
1713 # is not worth the extra bookkeeping it would require.
1715 pass
1716 except Exception as e:
1717 if ignore_errors:
1718 # Use a debug message here even though it's not
1719 # a good situation. In some cases this can be
1720 # caused by a race between user A and user B
1721 # and neither of them has permissions for the
1722 # other's files. Butler does not know about users
1723 # and trash has no idea what collections these
1724 # files were in (without guessing from a path).
1725 log.debug("Encountered error removing artifact %s from datastore %s: %s",
1726 location.uri, self.name, e)
1727 else:
1728 raise
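When the bridge cannot supply `artifacts_to_keep`, the loop above derives it by removing every trashed dataset id from the per-path reference sets; any path that still has ids left must survive. A standalone sketch of that bookkeeping with plain dict/set stand-ins:

    from typing import Dict, List, Set, Tuple

    def compute_artifacts_to_keep(trashed: List[Tuple[int, str]],
                                  refs_by_path: Dict[str, Set[int]]) -> Set[str]:
        # Paths still referenced by datasets other than the trashed ones.
        remaining = {path: set(ids) for path, ids in refs_by_path.items()}
        for dataset_id, path in trashed:
            remaining[path].discard(dataset_id)
            if not remaining[path]:
                del remaining[path]
        return set(remaining)

    # "a.fits" holds datasets 1 and 2; trashing only dataset 1 means the
    # file must be kept, while "b.fits" (only dataset 3) can be deleted.
    assert compute_artifacts_to_keep(
        [(1, "a.fits"), (3, "b.fits")],
        {"a.fits": {1, 2}, "b.fits": {3}},
    ) == {"a.fits"}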
1730 @transactional
1731 def forget(self, refs: Iterable[DatasetRef]) -> None:
1732 # Docstring inherited.
1733 refs = list(refs)
1734 self.bridge.forget(refs)
1735 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
1737 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1738 logFailures: bool = False) -> None:
1739 """Validate some of the configuration for this datastore.
1741 Parameters
1742 ----------
1743 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1744 Entities to test against this configuration. Can be differing
1745 types.
1746 logFailures : `bool`, optional
1747 If `True`, output a log message for every validation error
1748 detected.
1750 Raises
1751 ------
1752 DatastoreValidationError
1753 Raised if there is a validation problem with a configuration.
1754 All the problems are reported in a single exception.
1756 Notes
1757 -----
1758 This method checks that all the supplied entities have valid file
1759 templates and also have formatters defined.
1760 """
1762 templateFailed = None
1763 try:
1764 self.templates.validateTemplates(entities, logFailures=logFailures)
1765 except FileTemplateValidationError as e:
1766 templateFailed = str(e)
1768 formatterFailed = []
1769 for entity in entities:
1770 try:
1771 self.formatterFactory.getFormatterClass(entity)
1772 except KeyError as e:
1773 formatterFailed.append(str(e))
1774 if logFailures: 1774 ↛ 1769line 1774 didn't jump to line 1769, because the condition on line 1774 was never false
1775 log.critical("Formatter failure: %s", e)
1777 if templateFailed or formatterFailed:
1778 messages = []
1779 if templateFailed: 1779 ↛ 1780line 1779 didn't jump to line 1780, because the condition on line 1779 was never true
1780 messages.append(templateFailed)
1781 if formatterFailed: 1781 ↛ 1783line 1781 didn't jump to line 1783, because the condition on line 1781 was never false
1782 messages.append(",".join(formatterFailed))
1783 msg = ";\n".join(messages)
1784 raise DatastoreValidationError(msg)
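`validateConfiguration` deliberately accumulates every failure before raising, so a single `DatastoreValidationError` reports all problems at once. A minimal sketch of that accumulate-then-raise pattern (the validator and entity names below are made up for illustration):

    from typing import Callable, Iterable, List

    class ValidationError(Exception):
        # Stand-in for DatastoreValidationError.
        pass

    def validate_all(entities: Iterable[str], check: Callable[[str], None]) -> None:
        # Run every check and raise once with all collected messages.
        messages: List[str] = []
        for entity in entities:
            try:
                check(entity)
            except KeyError as e:
                messages.append(str(e))
        if messages:
            raise ValidationError(";\n".join(messages))

    def require_formatter(name: str) -> None:
        if name not in {"Exposure", "Catalog"}:
            raise KeyError(f"No formatter configured for {name}")

    validate_all(["Exposure", "Catalog"], require_formatter)  # passes silently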
1786 def getLookupKeys(self) -> Set[LookupKey]:
1787 # Docstring is inherited from base class
1788 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1789 self.constraints.getLookupKeys()
1791 def validateKey(self, lookupKey: LookupKey,
1792 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1793 # Docstring is inherited from base class
1794 # The key can be valid in either formatters or templates so we can
1795 # only check the template if it exists
1796 if lookupKey in self.templates:
1797 try:
1798 self.templates[lookupKey].validateTemplate(entity)
1799 except FileTemplateValidationError as e:
1800 raise DatastoreValidationError(e) from e
1802 def export(self, refs: Iterable[DatasetRef], *,
1803 directory: Optional[Union[ButlerURI, str]] = None,
1804 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1805 # Docstring inherited from Datastore.export.
1806 if transfer is not None and directory is None: 1806 ↛ 1807line 1806 didn't jump to line 1807, because the condition on line 1806 was never true
1807 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1808 "export directory given")
1810 # Force the directory to be a URI object
1811 directoryUri: Optional[ButlerURI] = None
1812 if directory is not None: 1812 ↛ 1815line 1812 didn't jump to line 1815, because the condition on line 1812 was never false
1813 directoryUri = ButlerURI(directory, forceDirectory=True)
1815 if transfer is not None and directoryUri is not None: 1815 ↛ 1820line 1815 didn't jump to line 1820, because the condition on line 1815 was never false
1816 # mypy needs the second test
1817 if not directoryUri.exists(): 1817 ↛ 1818line 1817 didn't jump to line 1818, because the condition on line 1817 was never true
1818 raise FileNotFoundError(f"Export location {directory} does not exist")
1820 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
1821 for ref in progress.wrap(refs, "Exporting dataset files"):
1822 fileLocations = self._get_dataset_locations_info(ref)
1823 if not fileLocations: 1823 ↛ 1824line 1823 didn't jump to line 1824, because the condition on line 1823 was never true
1824 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1825 # For now we can not export disassembled datasets
1826 if len(fileLocations) > 1: 1826 ↛ 1827line 1826 didn't jump to line 1827, because the condition on line 1826 was never true
1827 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
1828 location, storedFileInfo = fileLocations[0]
1830 pathInStore = location.pathInStore.path
1831 if transfer is None: 1831 ↛ 1834line 1831 didn't jump to line 1834, because the condition on line 1831 was never true
1832 # TODO: do we also need to return the readStorageClass somehow?
1833 # We will use the path in store directly
1834 pass
1835 elif transfer == "direct": 1835 ↛ 1837line 1835 didn't jump to line 1837, because the condition on line 1835 was never true
1836 # Use full URIs to the remote store in the export
1837 pathInStore = str(location.uri)
1838 else:
1839 # mypy needs help
1840 assert directoryUri is not None, "directoryUri must be defined to get here"
1841 storeUri = ButlerURI(location.uri)
1843 # if the datastore has an absolute URI to a resource, we
1844 # have two options:
1845 # 1. Keep the absolute URI in the exported YAML
1846 # 2. Allocate a new name in the local datastore and transfer
1847 # it.
1848 # For now go with option 2
1849 if location.pathInStore.isabs(): 1849 ↛ 1850line 1849 didn't jump to line 1850, because the condition on line 1849 was never true
1850 template = self.templates.getTemplate(ref)
1851 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
1852 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
1854 exportUri = directoryUri.join(pathInStore)
1855 exportUri.transfer_from(storeUri, transfer=transfer)
1857 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
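The transfer mode decides both what path gets recorded in the exported `FileDataset` and whether the artifact is copied: `None` keeps the datastore-relative path, "direct" records the absolute URI, and any other mode copies the file under the export directory while recording a directory-relative path. A toy sketch of that decision with strings standing in for `ButlerURI`:

    from typing import Optional, Tuple

    def export_plan(path_in_store: str, store_uri: str,
                    export_dir: Optional[str],
                    transfer: Optional[str]) -> Tuple[str, Optional[str]]:
        # Return (path recorded in the export, copy destination or None).
        if transfer is None:
            # No copy: refer to the artifact inside the datastore.
            return path_in_store, None
        if transfer == "direct":
            # No copy: record the absolute URI of the artifact.
            return store_uri, None
        # Otherwise copy under the export directory and record the path
        # relative to that directory.
        assert export_dir is not None, "an export directory is required"
        return path_in_store, f"{export_dir.rstrip('/')}/{path_in_store}"

    assert export_plan("raw/file.fits", "s3://bucket/raw/file.fits",
                       "/tmp/export", "copy") == (
        "raw/file.fits", "/tmp/export/raw/file.fits")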
1859 @staticmethod
1860 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1861 """Compute the checksum of the supplied file.
1863 Parameters
1864 ----------
1865 uri : `ButlerURI`
1866 Name of resource to calculate checksum from.
1867 algorithm : `str`, optional
1868 Name of algorithm to use. Must be one of the algorithms supported
1869 by :py:mod:`hashlib`.
1870 block_size : `int`, optional
1871 Number of bytes to read from the file at one time.
1873 Returns
1874 -------
1875 hexdigest : `str` or `None`
1876 Hex digest of the file, or `None` if the URI is for a remote resource.
1878 Notes
1879 -----
1880 Currently returns None if the URI is for a remote resource.
1881 """
1882 if algorithm not in hashlib.algorithms_guaranteed: 1882 ↛ 1883line 1882 didn't jump to line 1883, because the condition on line 1882 was never true
1883 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1885 if not uri.isLocal: 1885 ↛ 1886line 1885 didn't jump to line 1886, because the condition on line 1885 was never true
1886 return None
1888 hasher = hashlib.new(algorithm)
1890 with uri.as_local() as local_uri:
1891 with open(local_uri.ospath, "rb") as f:
1892 for chunk in iter(lambda: f.read(block_size), b""):
1893 hasher.update(chunk)
1895 return hasher.hexdigest()
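The chunked-digest loop above can be reproduced with `hashlib` alone for a local file, which is a handy way to cross-check a stored checksum (the file path below is hypothetical):

    import hashlib

    def file_digest(path: str, algorithm: str = "blake2b",
                    block_size: int = 8192) -> str:
        # Hex digest of a local file, read in fixed-size chunks.
        hasher = hashlib.new(algorithm)
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)
        return hasher.hexdigest()

    # For a local file this should agree with
    # FileDatastore.computeChecksum(ButlerURI("/data/example.fits")).
    # print(file_digest("/data/example.fits"))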
1897 def needs_expanded_data_ids(
1898 self,
1899 transfer: Optional[str],
1900 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
1901 ) -> bool:
1902 # Docstring inherited.
1903 # This _could_ also use entity to inspect whether the filename template
1904 # involves placeholders other than the required dimensions for its
1905 # dataset type, but that's not necessary for correctness; it just
1906 # enables more optimizations (perhaps only in theory).
1907 return transfer not in ("direct", None)
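Since the result depends only on the transfer mode, the behaviour is easy to tabulate (a datastore instance is assumed to exist as `datastore`; the optional `entity` argument is left at its default):

    # datastore.needs_expanded_data_ids("direct")  -> False
    # datastore.needs_expanded_data_ids(None)      -> False
    # datastore.needs_expanded_data_ids("copy")    -> True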