Coverage for python/lsst/daf/butler/datastores/fileDatastore.py : 84%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from dataclasses import dataclass
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreConfig,
60 DatastoreValidationError,
61 FileDescriptor,
62 FileTemplates,
63 FileTemplateValidationError,
64 Formatter,
65 FormatterFactory,
66 Location,
67 LocationFactory,
68 Progress,
69 StorageClass,
70 StoredFileInfo,
71)
73from lsst.daf.butler import ddl
74from lsst.daf.butler.registry.interfaces import (
75 ReadOnlyDatabaseError,
76 DatastoreRegistryBridge,
77)
79from lsst.daf.butler.core.repoRelocation import replaceRoot
80from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
81from .genericDatastore import GenericBaseDatastore
83if TYPE_CHECKING:  # 83 ↛ 84: condition on line 83 was never true
84 from lsst.daf.butler import LookupKey
85 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
87log = logging.getLogger(__name__)
89# String to use when a Python None is encountered
90NULLSTR = "__NULL_STRING__"
93class _IngestPrepData(Datastore.IngestPrepData):
94 """Helper class for FileDatastore ingest implementation.
96 Parameters
97 ----------
98 datasets : `list` of `FileDataset`
99 Files to be ingested by this datastore.
100 """
101 def __init__(self, datasets: List[FileDataset]):
102 super().__init__(ref for dataset in datasets for ref in dataset.refs)
103 self.datasets = datasets
106@dataclass(frozen=True)
107class DatastoreFileGetInformation:
108 """Collection of useful parameters needed to retrieve a file from
109 a Datastore.
110 """
112 location: Location
113 """The location from which to read the dataset."""
115 formatter: Formatter
116 """The `Formatter` to use to deserialize the dataset."""
118 info: StoredFileInfo
119 """Stored information about this file and its formatter."""
121 assemblerParams: Dict[str, Any]
122 """Parameters to use for post-processing the retrieved dataset."""
124 formatterParams: Dict[str, Any]
125 """Parameters that were understood by the associated formatter."""
127 component: Optional[str]
128 """The component to be retrieved (can be `None`)."""
130 readStorageClass: StorageClass
131 """The `StorageClass` of the dataset being read."""
134class FileDatastore(GenericBaseDatastore):
135 """Generic Datastore for file-based implementations.
137 Should always be sub-classed since key abstract methods are missing.
139 Parameters
140 ----------
141 config : `DatastoreConfig` or `str`
142 Configuration as either a `Config` object or URI to file.
143 bridgeManager : `DatastoreRegistryBridgeManager`
144 Object that manages the interface between `Registry` and datastores.
145 butlerRoot : `str`, optional
146 New datastore root to use to override the configuration value.
148 Raises
149 ------
150 ValueError
151 If root location does not exist and ``create`` is `False` in the
152 configuration.
153 """
155 defaultConfigFile: ClassVar[Optional[str]] = None
156 """Path to configuration defaults. Accessed within the ``config`` resource
157 or relative to a search path. Can be None if no defaults specified.
158 """
160 root: ButlerURI
161 """Root directory URI of this `Datastore`."""
163 locationFactory: LocationFactory
164 """Factory for creating locations relative to the datastore root."""
166 formatterFactory: FormatterFactory
167 """Factory for creating instances of formatters."""
169 templates: FileTemplates
170 """File templates that can be used by this `Datastore`."""
172 composites: CompositesMap
173 """Determines whether a dataset should be disassembled on put."""
175 defaultConfigFile = "datastores/fileDatastore.yaml"
176 """Path to configuration defaults. Accessed within the ``config`` resource
177 or relative to a search path. Can be None if no defaults specified.
178 """
180 @classmethod
181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
182 """Set any filesystem-dependent config options for this Datastore to
183 be appropriate for a new empty repository with the given root.
185 Parameters
186 ----------
187 root : `str`
188 URI to the root of the data repository.
189 config : `Config`
190 A `Config` to update. Only the subset understood by
191 this component will be updated. Will not expand
192 defaults.
193 full : `Config`
194 A complete config with all defaults expanded that can be
195 converted to a `DatastoreConfig`. Read-only and will not be
196 modified by this method.
197 Repository-specific options that should not be obtained
198 from defaults when Butler instances are constructed
199 should be copied from ``full`` to ``config``.
200 overwrite : `bool`, optional
201 If `False`, do not modify a value in ``config`` if the value
202 already exists. Default is always to overwrite with the provided
203 ``root``.
205 Notes
206 -----
207 If a keyword is explicitly defined in the supplied ``config`` it
208 will not be overridden by this method if ``overwrite`` is `False`.
209 This allows explicit values set in external configs to be retained.
210 """
211 Config.updateParameters(DatastoreConfig, config, full,
212 toUpdate={"root": root},
213 toCopy=("cls", ("records", "table")), overwrite=overwrite)
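# Illustrative sketch of a datastore configuration fragment touched by
# setConfigRoot(). Only the key names ("cls", "root", "records" -> "table")
# come from the code above; the values are hypothetical.
#
#     datastore:
#       cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
#       root: <butlerRoot>/datastore
#       records:
#         table: file_datastore_records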
215 @classmethod
216 def makeTableSpec(cls) -> ddl.TableSpec:
217 return ddl.TableSpec(
218 fields=[
219 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
220 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
221 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
222 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
223 # Use empty string to indicate no component
224 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
225 # TODO: should checksum be Base64Bytes instead?
226 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
227 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
228 ],
229 unique=frozenset(),
230 )
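# Illustrative sketch of one row in the opaque records table defined by
# makeTableSpec() above (all values hypothetical; the formatter path is a
# placeholder). addStoredItemInfo() below inserts dictionaries of exactly
# this shape, substituting NULLSTR when there is no component:
#
#     {
#         "dataset_id": 42,
#         "path": "calib/bias/bias_r_2020.fits",
#         "formatter": "mypackage.formatters.FitsFormatter",
#         "storage_class": "ExposureF",
#         "component": "__NULL_STRING__",
#         "checksum": None,
#         "file_size": 8640,
#     }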
232 def __init__(self, config: Union[DatastoreConfig, str],
233 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
234 super().__init__(config, bridgeManager)
235 if "root" not in self.config: 235 ↛ 236line 235 didn't jump to line 236, because the condition on line 235 was never true
236 raise ValueError("No root directory specified in configuration")
238 # Name ourselves either using an explicit name or a name
239 # derived from the (unexpanded) root
240 if "name" in self.config:
241 self.name = self.config["name"]
242 else:
243 # We use the unexpanded root in the name to indicate that this
244 # datastore can be moved without having to update registry.
245 self.name = "{}@{}".format(type(self).__name__,
246 self.config["root"])
248 # Support repository relocation in config
249 # Existence of self.root is checked in subclass
250 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
251 forceDirectory=True, forceAbsolute=True)
253 self.locationFactory = LocationFactory(self.root)
254 self.formatterFactory = FormatterFactory()
256 # Now associate formatters with storage classes
257 self.formatterFactory.registerFormatters(self.config["formatters"],
258 universe=bridgeManager.universe)
260 # Read the file naming templates
261 self.templates = FileTemplates(self.config["templates"],
262 universe=bridgeManager.universe)
264 # See if composites should be disassembled
265 self.composites = CompositesMap(self.config["composites"],
266 universe=bridgeManager.universe)
268 tableName = self.config["records", "table"]
269 try:
270 # Storage of paths and formatters, keyed by dataset_id
271 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
272 # Interface to Registry.
273 self._bridge = bridgeManager.register(self.name)
274 except ReadOnlyDatabaseError:
275 # If the database is read only and we just tried and failed to
276 # create a table, it means someone is trying to create a read-only
277 # butler client for an empty repo. That should be okay, as long
278 # as they then try to get any datasets before some other client
279 # creates the table. Chances are they're just validating
280 # configuration.
281 pass
283 # Determine whether checksums should be used - default to False
284 self.useChecksum = self.config.get("checksum", False)
286 # Determine whether we can fall back to configuration if a
287 # requested dataset is not known to registry
288 self.trustGetRequest = self.config.get("trust_get_request", False)
290 # Check existence and create directory structure if necessary
291 if not self.root.exists():
292 if "create" not in self.config or not self.config["create"]: 292 ↛ 293line 292 didn't jump to line 293, because the condition on line 292 was never true
293 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
294 try:
295 self.root.mkdir()
296 except Exception as e:
297 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
298 f" Got error: {e}") from e
300 def __str__(self) -> str:
301 return str(self.root)
303 @property
304 def bridge(self) -> DatastoreRegistryBridge:
305 return self._bridge
307 def _artifact_exists(self, location: Location) -> bool:
308 """Check that an artifact exists in this datastore at the specified
309 location.
311 Parameters
312 ----------
313 location : `Location`
314 Expected location of the artifact associated with this datastore.
316 Returns
317 -------
318 exists : `bool`
319 `True` if the location can be found, `False` otherwise.
320 """
321 log.debug("Checking if resource exists: %s", location.uri)
322 return location.uri.exists()
324 def _delete_artifact(self, location: Location) -> None:
325 """Delete the artifact from the datastore.
327 Parameters
328 ----------
329 location : `Location`
330 Location of the artifact associated with this datastore.
331 """
332 if location.pathInStore.isabs():  # 332 ↛ 333: condition on line 332 was never true
333 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
334 log.debug("Deleting file: %s", location.uri)
335 location.uri.remove()
336 log.debug("Successfully deleted file: %s", location.uri)
338 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
339 # Docstring inherited from GenericBaseDatastore
340 records = []
341 for ref, info in zip(refs, infos):
342 # Component should come from ref and fall back on info
343 component = ref.datasetType.component()
344 if component is None and info.component is not None:  # 344 ↛ 345: condition on line 344 was never true
345 component = info.component
346 if component is None:
347 # Use empty string since we want this to be part of the
348 # primary key.
349 component = NULLSTR
350 records.append(
351 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
352 storage_class=info.storageClass.name, component=component,
353 checksum=info.checksum, file_size=info.file_size)
354 )
355 self._table.insert(*records)
357 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
358 # Docstring inherited from GenericBaseDatastore
360 # Look for the dataset_id -- there might be multiple matches
361 # if we have disassembled the dataset.
362 records = list(self._table.fetch(dataset_id=ref.id))
364 results = []
365 for record in records:
366 # Convert name of StorageClass to instance
367 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
368 component = record["component"] if (record["component"]
369 and record["component"] != NULLSTR) else None
371 info = StoredFileInfo(formatter=record["formatter"],
372 path=record["path"],
373 storageClass=storageClass,
374 component=component,
375 checksum=record["checksum"],
376 file_size=record["file_size"])
377 results.append(info)
379 return results
381 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]:
382 """Return all dataset refs associated with the supplied path.
384 Parameters
385 ----------
386 pathInStore : `ButlerURI`
387 Path of interest in the data store.
389 Returns
390 -------
391 ids : `set` of `int`
392 All `DatasetRef` IDs associated with this path.
393 """
394 records = list(self._table.fetch(path=str(pathInStore)))
395 ids = {r["dataset_id"] for r in records}
396 return ids
398 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
399 # Docstring inherited from GenericBaseDatastore
400 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
402 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
403 r"""Find all the `Location`\ s of the requested dataset in the
404 `Datastore` and the associated stored file information.
406 Parameters
407 ----------
408 ref : `DatasetRef`
409 Reference to the required `Dataset`.
411 Returns
412 -------
413 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
414 Location of the dataset within the datastore and
415 stored information about each file and its formatter.
416 """
417 # Get the file information (this will fail if no file)
418 records = self.getStoredItemsInfo(ref)
420 # Use the path to determine the location -- we need to take
421 # into account absolute URIs in the datastore record
422 locations: List[Tuple[Location, StoredFileInfo]] = []
423 for r in records:
424 uriInStore = ButlerURI(r.path, forceAbsolute=False)
425 if uriInStore.isabs():  # 425 ↛ 426: condition on line 425 was never true
426 location = Location(None, uriInStore)
427 else:
428 location = self.locationFactory.fromPath(r.path)
429 locations.append((location, r))
430 return locations
432 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
433 """Check that there is only one dataset associated with the
434 specified artifact.
436 Parameters
437 ----------
438 ref : `DatasetRef` or `FakeDatasetRef`
439 Dataset to be removed.
440 location : `Location`
441 The location of the artifact to be removed.
443 Returns
444 -------
445 can_remove : `bool`
446 True if the artifact can be safely removed.
447 """
448 # Can't ever delete absolute URIs.
449 if location.pathInStore.isabs():  # 449 ↛ 450: condition on line 449 was never true
450 return False
452 # Get all entries associated with this path
453 allRefs = self._registered_refs_per_artifact(location.pathInStore)
454 if not allRefs:  # 454 ↛ 455: condition on line 454 was never true
455 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
457 # Remove these refs from all the refs and if there is nothing left
458 # then we can delete
459 remainingRefs = allRefs - {ref.id}
461 if remainingRefs:
462 return False
463 return True
465 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
466 StoredFileInfo]]:
467 """Predict the location and related file information of the requested
468 dataset in this datastore.
470 Parameters
471 ----------
472 ref : `DatasetRef`
473 Reference to the required `Dataset`.
475 Returns
476 -------
477 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
478 Expected Location of the dataset within the datastore and
479 placeholder information about each file and its formatter.
481 Notes
482 -----
483 Uses the current configuration to determine how we would expect the
484 datastore files to have been written if we couldn't ask registry.
485 This is safe so long as there has been no change to datastore
486 configuration between writing the dataset and wanting to read it.
487 Will not work for files that have been ingested without using the
488 standard file template or default formatter.
489 """
491 # If we have a component ref we always need to ask the questions
492 # of the composite. If the composite is disassembled this routine
493 # should return all components. If the composite was not
494 # disassembled the composite is what is stored regardless of
495 # component request. Note that if the caller has disassembled
496 # a composite there is no way for this guess to know that
497 # without trying both the composite and component ref and seeing
498 # if there is something at the component Location even without
499 # disassembly being enabled.
500 if ref.datasetType.isComponent():
501 ref = ref.makeCompositeRef()
503 # See if the ref is a composite that should be disassembled
504 doDisassembly = self.composites.shouldBeDisassembled(ref)
506 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
508 if doDisassembly:
509 for component, componentStorage in ref.datasetType.storageClass.components.items():
510 compRef = ref.makeComponentRef(component)
511 location, formatter = self._determine_put_formatter_location(compRef)
512 all_info.append((location, formatter, componentStorage, component))
514 else:
515 # Always use the composite ref if no disassembly
516 location, formatter = self._determine_put_formatter_location(ref)
517 all_info.append((location, formatter, ref.datasetType.storageClass, None))
519 # Convert the list of tuples to have StoredFileInfo as second element
520 return [(location, StoredFileInfo(formatter=formatter,
521 path=location.pathInStore.path,
522 storageClass=storageClass,
523 component=component,
524 checksum=None,
525 file_size=-1))
526 for location, formatter, storageClass, component in all_info]
528 def _prepare_for_get(self, ref: DatasetRef,
529 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
530 """Check parameters for ``get`` and obtain formatter and
531 location.
533 Parameters
534 ----------
535 ref : `DatasetRef`
536 Reference to the required Dataset.
537 parameters : `dict`
538 `StorageClass`-specific parameters that specify, for example,
539 a slice of the dataset to be loaded.
541 Returns
542 -------
543 getInfo : `list` [`DatastoreFileGetInformation`]
544 Parameters needed to retrieve each file.
545 """
546 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
548 # Get file metadata and internal metadata
549 fileLocations = self._get_dataset_locations_info(ref)
550 if not fileLocations:
551 if not self.trustGetRequest:
552 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
553 # Assume the dataset is where we think it should be
554 fileLocations = self._get_expected_dataset_locations_info(ref)
556 # The storage class we want to use eventually
557 refStorageClass = ref.datasetType.storageClass
559 if len(fileLocations) > 1:
560 disassembled = True
561 else:
562 disassembled = False
564 # Is this a component request?
565 refComponent = ref.datasetType.component()
567 fileGetInfo = []
568 for location, storedFileInfo in fileLocations:
570 # The storage class used to write the file
571 writeStorageClass = storedFileInfo.storageClass
573 # If this has been disassembled we need read to match the write
574 if disassembled:
575 readStorageClass = writeStorageClass
576 else:
577 readStorageClass = refStorageClass
579 formatter = getInstanceOf(storedFileInfo.formatter,
580 FileDescriptor(location, readStorageClass=readStorageClass,
581 storageClass=writeStorageClass, parameters=parameters),
582 ref.dataId)
584 formatterParams, notFormatterParams = formatter.segregateParameters()
586 # Of the remaining parameters, extract the ones supported by
587 # this StorageClass (for components not all will be handled)
588 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
590 # The ref itself could be a component if the dataset was
591 # disassembled by butler, or we disassembled in datastore and
592 # components came from the datastore records
593 component = storedFileInfo.component if storedFileInfo.component else refComponent
595 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
596 assemblerParams, formatterParams,
597 component, readStorageClass))
599 return fileGetInfo
601 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
602 """Check the arguments for ``put`` and obtain formatter and
603 location.
605 Parameters
606 ----------
607 inMemoryDataset : `object`
608 The dataset to store.
609 ref : `DatasetRef`
610 Reference to the associated Dataset.
612 Returns
613 -------
614 location : `Location`
615 The location to write the dataset.
616 formatter : `Formatter`
617 The `Formatter` to use to write the dataset.
619 Raises
620 ------
621 TypeError
622 Supplied object and storage class are inconsistent.
623 DatasetTypeNotSupportedError
624 The associated `DatasetType` is not handled by this datastore.
625 """
626 self._validate_put_parameters(inMemoryDataset, ref)
627 return self._determine_put_formatter_location(ref)
629 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
630 """Calculate the formatter and output location to use for put.
632 Parameters
633 ----------
634 ref : `DatasetRef`
635 Reference to the associated Dataset.
637 Returns
638 -------
639 location : `Location`
640 The location to write the dataset.
641 formatter : `Formatter`
642 The `Formatter` to use to write the dataset.
643 """
644 # Work out output file name
645 try:
646 template = self.templates.getTemplate(ref)
647 except KeyError as e:
648 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
650 # Validate the template to protect against filenames from different
651 # dataIds returning the same and causing overwrite confusion.
652 template.validateTemplate(ref)
654 location = self.locationFactory.fromPath(template.format(ref))
656 # Get the formatter based on the storage class
657 storageClass = ref.datasetType.storageClass
658 try:
659 formatter = self.formatterFactory.getFormatter(ref,
660 FileDescriptor(location,
661 storageClass=storageClass),
662 ref.dataId)
663 except KeyError as e:
664 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
665 f"{self.name}") from e
667 # Now that we know the formatter, update the location
668 location = formatter.makeUpdatedLocation(location)
670 return location, formatter
672 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
673 # Docstring inherited from base class
674 if transfer != "auto":
675 return transfer
677 # See if the paths are within the datastore or not
678 inside = [self._pathInStore(d.path) is not None for d in datasets]
680 if all(inside):
681 transfer = None
682 elif not any(inside):  # 682 ↛ 686: condition on line 682 was never false
683 # Allow ButlerURI to use its own knowledge
684 transfer = "auto"
685 else:
686 raise ValueError("Some datasets are inside the datastore and some are outside."
687 " Please use an explicit transfer mode and not 'auto'.")
689 return transfer
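# Behaviour sketch for the "auto" override above (paths hypothetical,
# datastore rooted at "/repo/datastore"):
#
#     every dataset path inside the root  -> transfer=None (files used in place)
#     every dataset path outside the root -> transfer="auto" (ButlerURI chooses)
#     a mixture of inside and outside     -> ValueError; an explicit mode is required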
691 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
692 """Return path relative to datastore root
694 Parameters
695 ----------
696 path : `str` or `ButlerURI`
697 Path to dataset. Can be an absolute URI. If relative, it is assumed
698 to be relative to the datastore root. The path is returned relative
699 to the datastore, or `None` if it is outside the root.
701 Returns
702 -------
703 inStore : `str`
704 Path relative to datastore root. Returns `None` if the file is
705 outside the root.
706 """
707 # Relative path will always be relative to datastore
708 pathUri = ButlerURI(path, forceAbsolute=False)
709 return pathUri.relative_to(self.root)
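# Minimal sketch of the relative_to() behaviour relied on above (paths are
# hypothetical, datastore rooted at "/repo/datastore"):
#
#     ButlerURI("/repo/datastore/a/b.fits").relative_to(self.root) -> "a/b.fits"
#     ButlerURI("/elsewhere/c.fits").relative_to(self.root)        -> None
#
# A schemeless relative path is interpreted as already being relative to the
# datastore root.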
711 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
712 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
713 """Standardize the path of a to-be-ingested file.
715 Parameters
716 ----------
717 path : `str` or `ButlerURI`
718 Path of a file to be ingested.
719 transfer : `str`, optional
720 How (and whether) the dataset should be added to the datastore.
721 See `ingest` for details of transfer modes.
722 This implementation is provided only so
723 `NotImplementedError` can be raised if the mode is not supported;
724 actual transfers are deferred to `_extractIngestInfo`.
726 Returns
727 -------
728 path : `str` or `ButlerURI`
729 New path in what the datastore considers standard form. If an
730 absolute URI was given, it will be returned unchanged.
732 Notes
733 -----
734 Subclasses of `FileDatastore` can implement this method instead
735 of `_prepIngest`. It should not modify the data repository or given
736 file in any way.
738 Raises
739 ------
740 NotImplementedError
741 Raised if the datastore does not support the given transfer mode
742 (including the case where ingest is not supported at all).
743 FileNotFoundError
744 Raised if one of the given files does not exist.
745 """
746 if transfer not in (None, "direct") + self.root.transferModes:  # 746 ↛ 747: condition on line 746 was never true
747 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
749 # A relative URI indicates relative to datastore root
750 srcUri = ButlerURI(path, forceAbsolute=False)
751 if not srcUri.isabs():
752 srcUri = self.root.join(path)
754 if not srcUri.exists():
755 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
756 f"are assumed to be relative to {self.root} unless they are absolute.")
758 if transfer is None:
759 relpath = srcUri.relative_to(self.root)
760 if not relpath:
761 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
762 f"within datastore ({self.root})")
764 # Return the relative path within the datastore for internal
765 # transfer
766 path = relpath
768 return path
770 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
771 formatter: Union[Formatter, Type[Formatter]],
772 transfer: Optional[str] = None) -> StoredFileInfo:
773 """Relocate (if necessary) and extract `StoredFileInfo` from a
774 to-be-ingested file.
776 Parameters
777 ----------
778 path : `str` or `ButlerURI`
779 URI or path of a file to be ingested.
780 ref : `DatasetRef`
781 Reference for the dataset being ingested. Guaranteed to have
782 ``dataset_id is not None``.
783 formatter : `type` or `Formatter`
784 `Formatter` subclass to use for this dataset or an instance.
785 transfer : `str`, optional
786 How (and whether) the dataset should be added to the datastore.
787 See `ingest` for details of transfer modes.
789 Returns
790 -------
791 info : `StoredFileInfo`
792 Internal datastore record for this file. This will be inserted by
793 the caller; `_extractIngestInfo` is only responsible for
794 creating and populating the struct.
796 Raises
797 ------
798 FileNotFoundError
799 Raised if one of the given files does not exist.
800 FileExistsError
801 Raised if transfer is not `None` but the (internal) location the
802 file would be moved to is already occupied.
803 """
804 if self._transaction is None:  # 804 ↛ 805: condition on line 804 was never true
805 raise RuntimeError("Ingest called without transaction enabled")
807 # Create URI of the source path, do not need to force a relative
808 # path to absolute.
809 srcUri = ButlerURI(path, forceAbsolute=False)
811 # Track whether we have read the size of the source yet
812 have_sized = False
814 tgtLocation: Optional[Location]
815 if transfer is None:
816 # A relative path is assumed to be relative to the datastore
817 # in this context
818 if not srcUri.isabs():
819 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
820 else:
821 # Work out the path in the datastore from an absolute URI
822 # This is required to be within the datastore.
823 pathInStore = srcUri.relative_to(self.root)
824 if pathInStore is None:  # 824 ↛ 825: condition on line 824 was never true
825 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
826 f"not within datastore {self.root}")
827 tgtLocation = self.locationFactory.fromPath(pathInStore)
828 elif transfer == "direct": 828 ↛ 833line 828 didn't jump to line 833, because the condition on line 828 was never true
829 # Want to store the full URI to the resource directly in
830 # datastore. This is useful for referring to permanent archive
831 # storage for raw data.
832 # Trust that people know what they are doing.
833 tgtLocation = None
834 else:
835 # Work out the name we want this ingested file to have
836 # inside the datastore
837 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
838 if not tgtLocation.uri.dirname().exists():
839 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
840 tgtLocation.uri.dirname().mkdir()
842 # if we are transferring from a local file to a remote location
843 # it may be more efficient to get the size and checksum of the
844 # local file rather than the transferred one
845 if not srcUri.scheme or srcUri.scheme == "file":  # 845 ↛ 851: condition on line 845 was never false
846 size = srcUri.size()
847 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
848 have_sized = True
850 # transfer the resource to the destination
851 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
853 if tgtLocation is None:  # 853 ↛ 855: condition on line 853 was never true
854 # This means we are using direct mode
855 targetUri = srcUri
856 targetPath = str(srcUri)
857 else:
858 targetUri = tgtLocation.uri
859 targetPath = tgtLocation.pathInStore.path
861 # the file should exist in the datastore now
862 if not have_sized:
863 size = targetUri.size()
864 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
866 return StoredFileInfo(formatter=formatter, path=targetPath,
867 storageClass=ref.datasetType.storageClass,
868 component=ref.datasetType.component(),
869 file_size=size, checksum=checksum)
871 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
872 # Docstring inherited from Datastore._prepIngest.
873 filtered = []
874 for dataset in datasets:
875 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
876 if not acceptable:
877 continue
878 else:
879 dataset.refs = acceptable
880 if dataset.formatter is None:
881 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
882 else:
883 assert isinstance(dataset.formatter, (type, str))
884 dataset.formatter = getClassOf(dataset.formatter)
885 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
886 filtered.append(dataset)
887 return _IngestPrepData(filtered)
889 @transactional
890 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
891 # Docstring inherited from Datastore._finishIngest.
892 refsAndInfos = []
893 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
894 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
895 # Do ingest as if the first dataset ref is associated with the file
896 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
897 transfer=transfer)
898 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
899 self._register_datasets(refsAndInfos)
901 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
902 formatter: Union[Formatter, Type[Formatter]]) -> Location:
903 """Given a source URI and a DatasetRef, determine the name the
904 dataset will have inside datastore.
906 Parameters
907 ----------
908 srcUri : `ButlerURI`
909 URI to the source dataset file.
910 ref : `DatasetRef`
911 Ref associated with the newly-ingested dataset artifact. This
912 is used to determine the name within the datastore.
913 formatter : `Formatter` or Formatter class.
914 Formatter to use for validation. Can be a class or an instance.
916 Returns
917 -------
918 location : `Location`
919 Target location for the newly-ingested dataset.
920 """
921 # Ingesting a file from outside the datastore.
922 # This involves a new name.
923 template = self.templates.getTemplate(ref)
924 location = self.locationFactory.fromPath(template.format(ref))
926 # Get the extension
927 ext = srcUri.getExtension()
929 # Update the destination to include that extension
930 location.updateExtension(ext)
932 # Ask the formatter to validate this extension
933 formatter.validateExtension(location)
935 return location
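# Worked sketch (template output and paths are hypothetical): ingesting
# "/data/incoming/exp_001.fits" for a ref whose file template renders to
# "raw/r/exp_001" yields the target location "raw/r/exp_001.fits" inside the
# datastore; formatter.validateExtension() then rejects the ingest if the
# formatter cannot handle ".fits".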
937 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
938 """Write out in memory dataset to datastore.
940 Parameters
941 ----------
942 inMemoryDataset : `object`
943 Dataset to write to datastore.
944 ref : `DatasetRef`
945 Registry information associated with this dataset.
947 Returns
948 -------
949 info : `StoredFileInfo`
950 Information describing the artifact written to the datastore.
951 """
952 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
953 uri = location.uri
955 if not uri.dirname().exists():
956 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
957 uri.dirname().mkdir()
959 if self._transaction is None:  # 959 ↛ 960: condition on line 959 was never true
960 raise RuntimeError("Attempting to write artifact without transaction enabled")
962 def _removeFileExists(uri: ButlerURI) -> None:
963 """Remove a file and do not complain if it is not there.
965 This is important since a formatter might fail before the file
966 is written and we should not confuse people by writing spurious
967 error messages to the log.
968 """
969 try:
970 uri.remove()
971 except FileNotFoundError:
972 pass
974 # Register a callback to try to delete the uploaded data if
975 # something fails below
976 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
978 # For a local file, simply use the formatter directly
979 if uri.isLocal:
980 formatter.write(inMemoryDataset)
981 log.debug("Successfully wrote python object to local file at %s", uri)
982 else:
983 # This is a remote URI, so first try to write the serialized bytes
984 # directly; if that is not supported, fall back to a temporary file
985 try:
986 serializedDataset = formatter.toBytes(inMemoryDataset)
987 log.debug("Writing bytes directly to %s", uri)
988 uri.write(serializedDataset, overwrite=True)
989 log.debug("Successfully wrote bytes directly to %s", uri)
990 except NotImplementedError:
991 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
992 # Need to configure the formatter to write to a different
993 # location and that needs us to overwrite internals
994 tmpLocation = Location(*os.path.split(tmpFile.name))
995 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
996 with formatter._updateLocation(tmpLocation):
997 formatter.write(inMemoryDataset)
998 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
999 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1001 # URI is needed to resolve which ingest case we are dealing with
1002 return self._extractIngestInfo(uri, ref, formatter=formatter)
1004 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1005 ref: DatasetRef, isComponent: bool = False) -> Any:
1006 """Read the artifact from datastore into in memory object.
1008 Parameters
1009 ----------
1010 getInfo : `DatastoreFileGetInformation`
1011 Information about the artifact within the datastore.
1012 ref : `DatasetRef`
1013 The registry information associated with this artifact.
1014 isComponent : `bool`
1015 Flag to indicate if a component is being read from this artifact.
1017 Returns
1018 -------
1019 inMemoryDataset : `object`
1020 The artifact as a python object.
1021 """
1022 location = getInfo.location
1023 uri = location.uri
1024 log.debug("Accessing data from %s", uri)
1026 # Cannot recalculate checksum but can compare size as a quick check
1027 # Do not do this if the size is negative since that indicates
1028 # we do not know.
1029 recorded_size = getInfo.info.file_size
1030 resource_size = uri.size()
1031 if recorded_size >= 0 and resource_size != recorded_size:  # 1031 ↛ 1032: condition on line 1031 was never true
1032 raise RuntimeError("Integrity failure in Datastore. "
1033 f"Size of file {uri} ({resource_size}) "
1034 f"does not match size recorded in registry of {recorded_size}")
1036 # For the general case we have choices for how to proceed.
1037 # 1. Always use a local file (downloading the remote resource to a
1038 # temporary file if needed).
1039 # 2. Use a threshold size and read into memory and use bytes.
1040 # Use both for now with an arbitrary hand off size.
1041 # This allows small datasets to be downloaded from remote object
1042 # stores without requiring a temporary file.
1044 formatter = getInfo.formatter
1045 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1046 if resource_size <= nbytes_max and formatter.can_read_bytes():
1047 serializedDataset = uri.read()
1048 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1049 f"component {getInfo.component}" if isComponent else "",
1050 len(serializedDataset), uri, formatter.name())
1051 try:
1052 result = formatter.fromBytes(serializedDataset,
1053 component=getInfo.component if isComponent else None)
1054 except Exception as e:
1055 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1056 f" ({ref.datasetType.name} from {uri}): {e}") from e
1057 else:
1058 # Read from file
1059 with uri.as_local() as local_uri:
1060 # Have to update the Location associated with the formatter
1061 # because formatter.read does not allow an override.
1062 # This could be improved.
1063 msg = ""
1064 newLocation = None
1065 if uri != local_uri:
1066 newLocation = Location(*local_uri.split())
1067 msg = "(via download to local file)"
1069 log.debug("Reading %s from location %s %s with formatter %s",
1070 f"component {getInfo.component}" if isComponent else "",
1071 uri, msg, formatter.name())
1072 try:
1073 with formatter._updateLocation(newLocation):
1074 result = formatter.read(component=getInfo.component if isComponent else None)
1075 except Exception as e:
1076 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1077 f" ({ref.datasetType.name} from {uri}): {e}") from e
1079 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1080 isComponent=isComponent)
1082 def exists(self, ref: DatasetRef) -> bool:
1083 """Check if the dataset exists in the datastore.
1085 Parameters
1086 ----------
1087 ref : `DatasetRef`
1088 Reference to the required dataset.
1090 Returns
1091 -------
1092 exists : `bool`
1093 `True` if the entity exists in the `Datastore`.
1094 """
1095 fileLocations = self._get_dataset_locations_info(ref)
1097 # if we are being asked to trust that registry might not be correct
1098 # we ask for the expected locations and check them explicitly
1099 if not fileLocations:
1100 if not self.trustGetRequest:
1101 return False
1102 fileLocations = self._get_expected_dataset_locations_info(ref)
1103 for location, _ in fileLocations:
1104 if not self._artifact_exists(location):
1105 return False
1107 return True
1109 def getURIs(self, ref: DatasetRef,
1110 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1111 """Return URIs associated with dataset.
1113 Parameters
1114 ----------
1115 ref : `DatasetRef`
1116 Reference to the required dataset.
1117 predict : `bool`, optional
1118 If the datastore does not know about the dataset, should it
1119 return a predicted URI or not?
1121 Returns
1122 -------
1123 primary : `ButlerURI`
1124 The URI to the primary artifact associated with this dataset.
1125 If the dataset was disassembled within the datastore this
1126 may be `None`.
1127 components : `dict`
1128 URIs to any components associated with the dataset artifact.
1129 Can be empty if there are no components.
1130 """
1132 primary: Optional[ButlerURI] = None
1133 components: Dict[str, ButlerURI] = {}
1135 # if this has never been written then we have to guess
1136 if not self.exists(ref):
1137 if not predict:
1138 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1140 doDisassembly = self.composites.shouldBeDisassembled(ref)
1142 if doDisassembly:
1144 for component, componentStorage in ref.datasetType.storageClass.components.items():
1145 compRef = ref.makeComponentRef(component)
1146 compLocation, _ = self._determine_put_formatter_location(compRef)
1148 # Add a URI fragment to indicate this is a guess
1149 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1151 else:
1153 location, _ = self._determine_put_formatter_location(ref)
1155 # Add a URI fragment to indicate this is a guess
1156 primary = ButlerURI(location.uri.geturl() + "#predicted")
1158 return primary, components
1160 # If this is a ref that we have written we can get the path.
1161 # Get file metadata and internal metadata
1162 fileLocations = self._get_dataset_locations_info(ref)
1164 guessing = False
1165 if not fileLocations:
1166 if not self.trustGetRequest:  # 1166 ↛ 1167: condition on line 1166 was never true
1167 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1168 fileLocations = self._get_expected_dataset_locations_info(ref)
1169 guessing = True
1171 if len(fileLocations) == 1:
1172 # No disassembly so this is the primary URI
1173 uri = fileLocations[0][0].uri
1174 if guessing and not uri.exists():  # 1174 ↛ 1175: condition on line 1174 was never true
1175 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1176 primary = uri
1178 else:
1179 for location, storedFileInfo in fileLocations:
1180 if storedFileInfo.component is None:  # 1180 ↛ 1181: condition on line 1180 was never true
1181 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1182 uri = location.uri
1183 if guessing and not uri.exists():  # 1183 ↛ 1184: condition on line 1183 was never true
1184 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1185 components[storedFileInfo.component] = uri
1187 return primary, components
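# Usage sketch (URI value is hypothetical): for a dataset written without
# disassembly,
#
#     primary, components = datastore.getURIs(ref)
#     # primary    -> ButlerURI("file:///repo/datastore/raw/r/exp_001.fits")
#     # components -> {}
#
# whereas a disassembled composite returns primary=None and one URI per
# component. With predict=True an unwritten dataset yields guessed URIs
# carrying a "#predicted" fragment, as constructed above.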
1189 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1190 """URI to the Dataset.
1192 Parameters
1193 ----------
1194 ref : `DatasetRef`
1195 Reference to the required Dataset.
1196 predict : `bool`
1197 If `True`, allow URIs to be returned of datasets that have not
1198 been written.
1200 Returns
1201 -------
1202 uri : `str`
1203 URI pointing to the dataset within the datastore. If the
1204 dataset does not exist in the datastore, and if ``predict`` is
1205 `True`, the URI will be a prediction and will include a URI
1206 fragment "#predicted".
1207 If the datastore does not have entities that relate well
1208 to the concept of a URI the returned URI will be
1209 descriptive. The returned URI is not guaranteed to be obtainable.
1211 Raises
1212 ------
1213 FileNotFoundError
1214 Raised if a URI has been requested for a dataset that does not
1215 exist and guessing is not allowed.
1216 RuntimeError
1217 Raised if a request is made for a single URI but multiple URIs
1218 are associated with this dataset.
1220 Notes
1221 -----
1222 When a predicted URI is requested an attempt will be made to form
1223 a reasonable URI based on file templates and the expected formatter.
1224 """
1225 primary, components = self.getURIs(ref, predict)
1226 if primary is None or components:  # 1226 ↛ 1227: condition on line 1226 was never true
1227 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1228 "Use Dataastore.getURIs() instead.")
1229 return primary
1231 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1232 """Load an InMemoryDataset from the store.
1234 Parameters
1235 ----------
1236 ref : `DatasetRef`
1237 Reference to the required Dataset.
1238 parameters : `dict`
1239 `StorageClass`-specific parameters that specify, for example,
1240 a slice of the dataset to be loaded.
1242 Returns
1243 -------
1244 inMemoryDataset : `object`
1245 Requested dataset or slice thereof as an InMemoryDataset.
1247 Raises
1248 ------
1249 FileNotFoundError
1250 Requested dataset can not be retrieved.
1251 TypeError
1252 Return value from formatter has unexpected type.
1253 ValueError
1254 Formatter failed to process the dataset.
1255 """
1256 allGetInfo = self._prepare_for_get(ref, parameters)
1257 refComponent = ref.datasetType.component()
1259 # Supplied storage class for the component being read
1260 refStorageClass = ref.datasetType.storageClass
1262 # Create mapping from component name to related info
1263 allComponents = {i.component: i for i in allGetInfo}
1265 # By definition the dataset is disassembled if we have more
1266 # than one record for it.
1267 isDisassembled = len(allGetInfo) > 1
1269 # Look for the special case where we are disassembled but the
1270 # component is a derived component that was not written during
1271 # disassembly. For this scenario we need to check that the
1272 # component requested is listed as a derived component for the
1273 # composite storage class
1274 isDisassembledReadOnlyComponent = False
1275 if isDisassembled and refComponent:
1276 # The composite storage class should be accessible through
1277 # the component dataset type
1278 compositeStorageClass = ref.datasetType.parentStorageClass
1280 # In the unlikely scenario where the composite storage
1281 # class is not known, we can only assume that this is a
1282 # normal component. If that assumption is wrong then the
1283 # branch below that reads a persisted component will fail
1284 # so there is no need to complain here.
1285 if compositeStorageClass is not None:  # 1285 ↛ 1288: condition on line 1285 was never false
1286 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1288 if isDisassembled and not refComponent:
1289 # This was a disassembled dataset spread over multiple files
1290 # and we need to put them all back together again.
1291 # Read into memory and then assemble
1293 # Check that the supplied parameters are suitable for the type read
1294 refStorageClass.validateParameters(parameters)
1296 # We want to keep track of all the parameters that were not used
1297 # by formatters. We assume that if any of the component formatters
1298 # use a parameter that we do not need to apply it again in the
1299 # assembler.
1300 usedParams = set()
1302 components: Dict[str, Any] = {}
1303 for getInfo in allGetInfo:
1304 # assemblerParams are parameters not understood by the
1305 # associated formatter.
1306 usedParams.update(set(getInfo.formatterParams))
1308 component = getInfo.component
1310 if component is None:  # 1310 ↛ 1311: condition on line 1310 was never true
1311 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1313 # We do not want the formatter to think it's reading
1314 # a component though because it is really reading a
1315 # standalone dataset -- always tell reader it is not a
1316 # component.
1317 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1319 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1321 # Any unused parameters will have to be passed to the assembler
1322 if parameters:
1323 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1324 else:
1325 unusedParams = {}
1327 # Process parameters
1328 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1329 parameters=unusedParams)
1331 elif isDisassembledReadOnlyComponent:
1333 compositeStorageClass = ref.datasetType.parentStorageClass
1334 if compositeStorageClass is None:  # 1334 ↛ 1335: condition on line 1334 was never true
1335 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1336 "no composite storage class is available.")
1338 if refComponent is None:  # 1338 ↛ 1340: condition on line 1338 was never true
1339 # Mainly for mypy
1340 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1342 # Assume that every derived component can be calculated by
1343 # forwarding the request to a single read/write component.
1344 # Rather than guessing which rw component is the right one by
1345 # scanning each for a derived component of the same name,
1346 # we ask the storage class delegate directly which one is best to
1347 # use.
1348 compositeDelegate = compositeStorageClass.delegate()
1349 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1350 set(allComponents))
1352 # Select the relevant component
1353 rwInfo = allComponents[forwardedComponent]
1355 # For now assume that read parameters are validated against
1356 # the real component and not the requested component
1357 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1358 forwardedStorageClass.validateParameters(parameters)
1360 # Unfortunately the FileDescriptor inside the formatter will have
1361 # the wrong write storage class so we need to create a new one
1362 # given the immutability constraint.
1363 writeStorageClass = rwInfo.info.storageClass
1365 # We may need to put some thought into parameters for read
1366 # components but for now forward them on as is
1367 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1368 readStorageClass=refStorageClass,
1369 storageClass=writeStorageClass,
1370 parameters=parameters),
1371 ref.dataId)
1373 # The assembler can not receive any parameter requests for a
1374 # derived component at this time since the assembler will
1375 # see the storage class of the derived component and those
1376 # parameters will have to be handled by the formatter on the
1377 # forwarded storage class.
1378 assemblerParams: Dict[str, Any] = {}
1380 # Need to create a new info that specifies the derived
1381 # component and associated storage class
1382 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1383 rwInfo.info, assemblerParams, {},
1384 refComponent, refStorageClass)
1386 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1388 else:
1389 # Single file request or component from that composite file
1390 for lookup in (refComponent, None):  # 1390 ↛ 1395: loop on line 1390 didn't complete
1391 if lookup in allComponents:  # 1391 ↛ 1390: condition on line 1391 was never false
1392 getInfo = allComponents[lookup]
1393 break
1394 else:
1395 raise FileNotFoundError(f"Component {refComponent} not found "
1396 f"for ref {ref} in datastore {self.name}")
1398 # Do not need the component itself if already disassembled
1399 if isDisassembled:
1400 isComponent = False
1401 else:
1402 isComponent = getInfo.component is not None
1404 # For a disassembled component we can validate parameters against
1405 # the component storage class directly
1406 if isDisassembled:
1407 refStorageClass.validateParameters(parameters)
1408 else:
1409 # For an assembled composite this could be a derived
1410 # component derived from a real component. The validity
1411 # of the parameters is not clear. For now validate against
1412 # the composite storage class
1413 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1415 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
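# Usage sketch (the parameter name is hypothetical and depends on the
# StorageClass definition): requesting a subset of a stored dataset,
#
#     subset = datastore.get(ref, parameters={"bbox": some_bbox})
#
# parameters understood by the formatter are applied while reading; any
# remainder is handed to the storage-class delegate, as implemented above.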
1417 @transactional
1418 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1419 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1421 Parameters
1422 ----------
1423 inMemoryDataset : `object`
1424 The dataset to store.
1425 ref : `DatasetRef`
1426 Reference to the associated Dataset.
1428 Raises
1429 ------
1430 TypeError
1431 Supplied object and storage class are inconsistent.
1432 DatasetTypeNotSupportedError
1433 The associated `DatasetType` is not handled by this datastore.
1435 Notes
1436 -----
1437 If the datastore is configured to reject certain dataset types it
1438 is possible that the put will fail and raise a
1439 `DatasetTypeNotSupportedError`. The main use case for this is to
1440 allow `ChainedDatastore` to put to multiple datastores without
1441 requiring that every datastore accepts the dataset.
1442 """
1444 doDisassembly = self.composites.shouldBeDisassembled(ref)
1445 # doDisassembly = True
1447 artifacts = []
1448 if doDisassembly:
1449 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1450 for component, componentInfo in components.items():
1451 # Don't recurse because we want to take advantage of
1452 # bulk insert -- need a new DatasetRef that refers to the
1453 # same dataset_id but has the component DatasetType
1454 # DatasetType does not refer to the types of components
1455 # So we construct one ourselves.
1456 compRef = ref.makeComponentRef(component)
1457 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1458 artifacts.append((compRef, storedInfo))
1459 else:
1460 # Write the entire thing out
1461 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1462 artifacts.append((ref, storedInfo))
1464 self._register_datasets(artifacts)
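# Behaviour sketch (component names are hypothetical): if the composites map
# says the ref should be disassembled, e.g. into "image" and "mask", put()
# writes one artifact per component and registers each against
# ref.makeComponentRef(<name>); otherwise a single artifact is written for the
# composite ref.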
1466 @transactional
1467 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1468 """Indicate to the datastore that a dataset can be removed.
1470 Parameters
1471 ----------
1472 ref : `DatasetRef`
1473 Reference to the required Dataset.
1474 ignore_errors : `bool`
1475 If `True` return without error even if something went wrong.
1476 Problems could occur if another process is simultaneously trying
1477 to delete.
1479 Raises
1480 ------
1481 FileNotFoundError
1482 Attempt to remove a dataset that does not exist.
1483 """
1484 # Get file metadata and internal metadata
1485 log.debug("Trashing %s in datastore %s", ref, self.name)
1487 fileLocations = self._get_dataset_locations_info(ref)
1489 if not fileLocations:
1490 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1491 if ignore_errors:
1492 log.warning(err_msg)
1493 return
1494 else:
1495 raise FileNotFoundError(err_msg)
1497 for location, storedFileInfo in fileLocations:
1498 if not self._artifact_exists(location):  # 1498 ↛ 1499: condition on line 1498 was never true
1499 err_msg = f"Dataset is known to datastore {self.name} but " \
1500 f"associated artifact ({location.uri}) is missing"
1501 if ignore_errors:
1502 log.warning(err_msg)
1503 return
1504 else:
1505 raise FileNotFoundError(err_msg)
1507 # Mark dataset as trashed
1508 try:
1509 self._move_to_trash_in_registry(ref)
1510 except Exception as e:
1511 if ignore_errors:
1512                log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1513                            f"but encountered an error: {e}")
1515 else:
1516 raise
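# Editor's note: minimal standalone sketch of the ignore_errors convention
# used by trash() and emptyTrash(): with ignore_errors=True a problem is
# logged and processing continues, otherwise the appropriate exception is
# raised. Illustrative only; the helper name is made up.

def _report_or_raise_sketch(err_msg, exc_type, ignore_errors):
    if ignore_errors:
        log.warning(err_msg)
        return
    raise exc_type(err_msg)

# _report_or_raise_sketch("dataset not known", FileNotFoundError, ignore_errors=True)   # warns only
# _report_or_raise_sketch("dataset not known", FileNotFoundError, ignore_errors=False)  # raises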
1518 @transactional
1519 def emptyTrash(self, ignore_errors: bool = True) -> None:
1520 """Remove all datasets from the trash.
1522 Parameters
1523 ----------
1524 ignore_errors : `bool`
1525            If `True`, return without error even if something went wrong.
1526 Problems could occur if another process is simultaneously trying
1527 to delete.
1528 """
1529 log.debug("Emptying trash in datastore %s", self.name)
1530 # Context manager will empty trash iff we finish it without raising.
1531 with self.bridge.emptyTrash() as trashed:
1532 for ref in trashed:
1533 fileLocations = self._get_dataset_locations_info(ref)
1535 if not fileLocations: 1535 ↛ 1536line 1535 didn't jump to line 1536, because the condition on line 1535 was never true
1536 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1537 if ignore_errors:
1538 log.warning(err_msg)
1539 continue
1540 else:
1541 raise FileNotFoundError(err_msg)
1543 for location, _ in fileLocations:
1545 if not self._artifact_exists(location): 1545 ↛ 1546line 1545 didn't jump to line 1546, because the condition on line 1545 was never true
1546 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1547 if ignore_errors:
1548 log.warning(err_msg)
1549 continue
1550 else:
1551 raise FileNotFoundError(err_msg)
1553 # Can only delete the artifact if there are no references
1554 # to the file from untrashed dataset refs.
1555 if self._can_remove_dataset_artifact(ref, location):
1556 # Point of no return for this artifact
1557 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1558 try:
1559 self._delete_artifact(location)
1560 except Exception as e:
1561 if ignore_errors:
1562 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1563 location.uri, self.name, e)
1564 else:
1565 raise
1567                # Now we must remove the entry from the internal registry even
1568                # if the artifact removal failed and was ignored; otherwise the
1569                # removal check above will never be true.
1570 try:
1571 # There may be multiple rows associated with this ref
1572 # depending on disassembly
1573 self.removeStoredItemInfo(ref)
1574 except Exception as e:
1575 if ignore_errors:
1576 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1577 ref.id, location.uri, self.name, e)
1578 continue
1579 else:
1580 raise FileNotFoundError(
1581 f"Error removing dataset {ref.id} ({location.uri}) from internal registry "
1582 f"of {self.name}"
1583 ) from e
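# Editor's note: standalone sketch of the "empty trash iff we finish without
# raising" idiom used with self.bridge.emptyTrash() above. The assumption is
# that the bridge's context manager behaves like this: trashed records are
# only dropped for good when the block exits cleanly.

from contextlib import contextmanager

@contextmanager
def _empty_trash_sketch(trashed, committed):
    yield iter(trashed)
    # Only reached when the caller's block did not raise.
    committed.extend(trashed)

# committed = []
# with _empty_trash_sketch(["ref1", "ref2"], committed) as refs:
#     for ref in refs:
#         ...  # delete artifacts here; an exception leaves `committed` empty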
1585 @transactional
1586 def forget(self, refs: Iterable[DatasetRef]) -> None:
1587 # Docstring inherited.
1588 refs = list(refs)
1589 self.bridge.forget(refs)
1590 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
1592 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1593 logFailures: bool = False) -> None:
1594 """Validate some of the configuration for this datastore.
1596 Parameters
1597 ----------
1598 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1599            Entities to test against this configuration. They can be of
1600            differing types.
1601 logFailures : `bool`, optional
1602 If `True`, output a log message for every validation error
1603 detected.
1605 Raises
1606 ------
1607 DatastoreValidationError
1608 Raised if there is a validation problem with a configuration.
1609 All the problems are reported in a single exception.
1611 Notes
1612 -----
1613 This method checks that all the supplied entities have valid file
1614 templates and also have formatters defined.
1615 """
1617 templateFailed = None
1618 try:
1619 self.templates.validateTemplates(entities, logFailures=logFailures)
1620 except FileTemplateValidationError as e:
1621 templateFailed = str(e)
1623 formatterFailed = []
1624 for entity in entities:
1625 try:
1626 self.formatterFactory.getFormatterClass(entity)
1627 except KeyError as e:
1628 formatterFailed.append(str(e))
1629 if logFailures: 1629 ↛ 1624line 1629 didn't jump to line 1624, because the condition on line 1629 was never false
1630 log.critical("Formatter failure: %s", e)
1632 if templateFailed or formatterFailed:
1633 messages = []
1634 if templateFailed: 1634 ↛ 1635line 1634 didn't jump to line 1635, because the condition on line 1634 was never true
1635 messages.append(templateFailed)
1636 if formatterFailed: 1636 ↛ 1638line 1636 didn't jump to line 1638, because the condition on line 1636 was never false
1637 messages.append(",".join(formatterFailed))
1638 msg = ";\n".join(messages)
1639 raise DatastoreValidationError(msg)
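# Editor's note: standalone sketch of the failure-aggregation pattern used
# above -- run every check, collect the messages, and raise a single
# DatastoreValidationError at the end so one report covers all problems.
# Illustrative only; `checks` is a made-up argument.

def _collect_and_raise_sketch(checks):
    """Run callables that raise on failure; report all failures at once."""
    failures = []
    for check in checks:
        try:
            check()
        except Exception as e:
            failures.append(str(e))
    if failures:
        raise DatastoreValidationError(";\n".join(failures))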
1641 def getLookupKeys(self) -> Set[LookupKey]:
1642 # Docstring is inherited from base class
1643 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1644 self.constraints.getLookupKeys()
1646 def validateKey(self, lookupKey: LookupKey,
1647 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1648 # Docstring is inherited from base class
1649 # The key can be valid in either formatters or templates so we can
1650 # only check the template if it exists
1651 if lookupKey in self.templates:
1652 try:
1653 self.templates[lookupKey].validateTemplate(entity)
1654 except FileTemplateValidationError as e:
1655 raise DatastoreValidationError(e) from e
1657 def export(self, refs: Iterable[DatasetRef], *,
1658 directory: Optional[Union[ButlerURI, str]] = None,
1659 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1660 # Docstring inherited from Datastore.export.
1661 if transfer is not None and directory is None: 1661 ↛ 1662line 1661 didn't jump to line 1662, because the condition on line 1661 was never true
1662 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1663 "export directory given")
1665 # Force the directory to be a URI object
1666 directoryUri: Optional[ButlerURI] = None
1667 if directory is not None: 1667 ↛ 1670line 1667 didn't jump to line 1670, because the condition on line 1667 was never false
1668 directoryUri = ButlerURI(directory, forceDirectory=True)
1670 if transfer is not None and directoryUri is not None: 1670 ↛ 1675line 1670 didn't jump to line 1675, because the condition on line 1670 was never false
1671 # mypy needs the second test
1672 if not directoryUri.exists(): 1672 ↛ 1673line 1672 didn't jump to line 1673, because the condition on line 1672 was never true
1673 raise FileNotFoundError(f"Export location {directory} does not exist")
1675 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
1676 for ref in progress.wrap(refs, "Exporting dataset files"):
1677 fileLocations = self._get_dataset_locations_info(ref)
1678 if not fileLocations: 1678 ↛ 1679line 1678 didn't jump to line 1679, because the condition on line 1678 was never true
1679 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1680            # For now we cannot export disassembled datasets.
1681 if len(fileLocations) > 1:
1682                raise NotImplementedError(f"Cannot export disassembled datasets such as {ref}")
1683 location, storedFileInfo = fileLocations[0]
1685 pathInStore = location.pathInStore.path
1686 if transfer is None: 1686 ↛ 1689line 1686 didn't jump to line 1689, because the condition on line 1686 was never true
1687 # TODO: do we also need to return the readStorageClass somehow?
1688 # We will use the path in store directly
1689 pass
1690 elif transfer == "direct": 1690 ↛ 1692line 1690 didn't jump to line 1692, because the condition on line 1690 was never true
1691 # Use full URIs to the remote store in the export
1692 pathInStore = str(location.uri)
1693 else:
1694 # mypy needs help
1695 assert directoryUri is not None, "directoryUri must be defined to get here"
1696 storeUri = ButlerURI(location.uri)
1698 # if the datastore has an absolute URI to a resource, we
1699 # have two options:
1700 # 1. Keep the absolute URI in the exported YAML
1701 # 2. Allocate a new name in the local datastore and transfer
1702 # it.
1703 # For now go with option 2
1704 if location.pathInStore.isabs(): 1704 ↛ 1705line 1704 didn't jump to line 1705, because the condition on line 1704 was never true
1705 template = self.templates.getTemplate(ref)
1706 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
1707 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
1709 exportUri = directoryUri.join(pathInStore)
1710 exportUri.transfer_from(storeUri, transfer=transfer)
1712 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
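# Editor's note: hedged usage sketch for export(). It is a generator, so it
# must be iterated for any transfers to happen. `datastore` and `refs` are
# assumed to exist, the destination directory is made up, and "copy" is used
# here only as an example transfer mode.

def _export_to_directory_sketch(datastore, refs, directory="/tmp/export"):
    """Consume the generator and return the exported FileDataset records."""
    datasets = list(datastore.export(refs, directory=directory, transfer="copy"))
    for dataset in datasets:
        log.debug("Exported %s with formatter %s", dataset.path, dataset.formatter)
    return datasets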
1714 @staticmethod
1715 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1716 """Compute the checksum of the supplied file.
1718 Parameters
1719 ----------
1720 uri : `ButlerURI`
1721 Name of resource to calculate checksum from.
1722 algorithm : `str`, optional
1723 Name of algorithm to use. Must be one of the algorithms supported
1724            by :py:class:`hashlib`.
1725        block_size : `int`, optional
1726 Number of bytes to read from file at one time.
1728 Returns
1729 -------
1730        hexdigest : `str` or `None`
1731 Hex digest of the file.
1733 Notes
1734 -----
1735        Currently returns `None` if the URI is for a remote resource.
1736 """
1737 if algorithm not in hashlib.algorithms_guaranteed: 1737 ↛ 1738line 1737 didn't jump to line 1738, because the condition on line 1737 was never true
1738 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1740 if not uri.isLocal: 1740 ↛ 1741line 1740 didn't jump to line 1741, because the condition on line 1740 was never true
1741 return None
1743 hasher = hashlib.new(algorithm)
1745 with uri.as_local() as local_uri:
1746 with open(local_uri.ospath, "rb") as f:
1747 for chunk in iter(lambda: f.read(block_size), b""):
1748 hasher.update(chunk)
1750 return hasher.hexdigest()
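# Editor's note: standalone sketch of the same chunked-hashing pattern used
# by computeChecksum(), restricted to a plain local path instead of a
# ButlerURI so it runs with only the standard library.

import hashlib

def _checksum_sketch(path, algorithm="blake2b", block_size=8192):
    """Return the hex digest of a local file, read in fixed-size chunks."""
    hasher = hashlib.new(algorithm)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()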