Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10% (977 statements)
coverage.py v7.2.7, created at 2023-07-14 19:21 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Generic file-based datastore code."""
24from __future__ import annotations
26__all__ = ("FileDatastore",)
28import hashlib
29import logging
30from collections import defaultdict
31from collections.abc import Callable, Iterable, Mapping, Sequence
32from dataclasses import dataclass
33from typing import TYPE_CHECKING, Any, ClassVar
35from lsst.daf.butler import (
36 CompositesMap,
37 Config,
38 DatasetId,
39 DatasetRef,
40 DatasetRefURIs,
41 DatasetType,
42 DatasetTypeNotSupportedError,
43 Datastore,
44 DatastoreCacheManager,
45 DatastoreConfig,
46 DatastoreDisabledCacheManager,
47 DatastoreRecordData,
48 DatastoreValidationError,
49 FileDataset,
50 FileDescriptor,
51 FileTemplates,
52 FileTemplateValidationError,
53 Formatter,
54 FormatterFactory,
55 Location,
56 LocationFactory,
57 Progress,
58 StorageClass,
59 StoredDatastoreItemInfo,
60 StoredFileInfo,
61 ddl,
62)
63from lsst.daf.butler.core.repoRelocation import replaceRoot
64from lsst.daf.butler.core.utils import transactional
65from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
66from lsst.resources import ResourcePath, ResourcePathExpression
67from lsst.utils.introspection import get_class_of, get_instance_of
68from lsst.utils.iteration import chunk_iterable
70# For VERBOSE logging usage.
71from lsst.utils.logging import VERBOSE, getLogger
72from lsst.utils.timer import time_this
73from sqlalchemy import BigInteger, String
75from ..registry.interfaces import FakeDatasetRef
76from .genericDatastore import GenericBaseDatastore
78if TYPE_CHECKING:
79 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
80 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
82log = getLogger(__name__)
85class _IngestPrepData(Datastore.IngestPrepData):
86 """Helper class for FileDatastore ingest implementation.
88 Parameters
89 ----------
90 datasets : `~collections.abc.Iterable` of `FileDataset`
91 Files to be ingested by this datastore.
92 """
94 def __init__(self, datasets: Iterable[FileDataset]):
95 super().__init__(ref for dataset in datasets for ref in dataset.refs)
96 self.datasets = datasets
99@dataclass(frozen=True)
100class DatastoreFileGetInformation:
101 """Collection of useful parameters needed to retrieve a file from
102 a Datastore.
103 """
105 location: Location
106 """The location from which to read the dataset."""
108 formatter: Formatter
109 """The `Formatter` to use to deserialize the dataset."""
111 info: StoredFileInfo
112 """Stored information about this file and its formatter."""
114 assemblerParams: Mapping[str, Any]
115 """Parameters to use for post-processing the retrieved dataset."""
117 formatterParams: Mapping[str, Any]
118 """Parameters that were understood by the associated formatter."""
120 component: str | None
121 """The component to be retrieved (can be `None`)."""
123 readStorageClass: StorageClass
124 """The `StorageClass` of the dataset being read."""
127class FileDatastore(GenericBaseDatastore):
128 """Generic Datastore for file-based implementations.
130 Should always be sub-classed since key abstract methods are missing.
132 Parameters
133 ----------
134 config : `DatastoreConfig` or `str`
135 Configuration as either a `Config` object or URI to file.
136 bridgeManager : `DatastoreRegistryBridgeManager`
137 Object that manages the interface between `Registry` and datastores.
138 butlerRoot : `str`, optional
139 New datastore root to use to override the configuration value.
141 Raises
142 ------
143 ValueError
144 If root location does not exist and ``create`` is `False` in the
145 configuration.
146 """
148 defaultConfigFile: ClassVar[str | None] = None
149 """Path to configuration defaults. Accessed within the ``config`` resource
150 or relative to a search path. Can be None if no defaults specified.
151 """
153 root: ResourcePath
154 """Root directory URI of this `Datastore`."""
156 locationFactory: LocationFactory
157 """Factory for creating locations relative to the datastore root."""
159 formatterFactory: FormatterFactory
160 """Factory for creating instances of formatters."""
162 templates: FileTemplates
163 """File templates that can be used by this `Datastore`."""
165 composites: CompositesMap
166 """Determines whether a dataset should be disassembled on put."""
168 defaultConfigFile = "datastores/fileDatastore.yaml"
169 """Path to configuration defaults. Accessed within the ``config`` resource
170 or relative to a search path. Can be None if no defaults specified.
171 """
173 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
174 """Callable that is used in trusted mode to retrieve registry definition
175 of a named dataset type.
176 """
178 @classmethod
179 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
180 """Set any filesystem-dependent config options for this Datastore to
181 be appropriate for a new empty repository with the given root.
183 Parameters
184 ----------
185 root : `str`
186 URI to the root of the data repository.
187 config : `Config`
188 A `Config` to update. Only the subset understood by
189 this component will be updated. Will not expand
190 defaults.
191 full : `Config`
192 A complete config with all defaults expanded that can be
193 converted to a `DatastoreConfig`. Read-only and will not be
194 modified by this method.
195 Repository-specific options that should not be obtained
196 from defaults when Butler instances are constructed
197 should be copied from ``full`` to ``config``.
198 overwrite : `bool`, optional
199 If `False`, do not modify a value in ``config`` if the value
200 already exists. Default is always to overwrite with the provided
201 ``root``.
203 Notes
204 -----
205 If a keyword is explicitly defined in the supplied ``config`` it
206 will not be overridden by this method if ``overwrite`` is `False`.
207 This allows explicit values set in external configs to be retained.
208 """
209 Config.updateParameters(
210 DatastoreConfig,
211 config,
212 full,
213 toUpdate={"root": root},
214 toCopy=("cls", ("records", "table")),
215 overwrite=overwrite,
216 )
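# Hedged usage sketch for the method above (the variable names are
# placeholders, not part of this module): given a freshly created repository
# root, the datastore-specific keys are rewritten so that "root" points at
# the new location while "cls" and the records table name are copied from
# the fully expanded defaults.
#
#     FileDatastore.setConfigRoot("/data/new_repo", config, full_config)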
218 @classmethod
219 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
220 return ddl.TableSpec(
221 fields=[
222 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
223 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
224 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
225 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
226 # Use empty string to indicate no component
227 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
228 # TODO: should checksum be Base64Bytes instead?
229 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
230 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
231 ],
232 unique=frozenset(),
233 indexes=[ddl.IndexSpec("path")],
234 )
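# A minimal sketch of the kind of row implied by the table spec above, using
# plain Python values (the concrete values are hypothetical; the column names
# come from the FieldSpecs):
example_record = {
    "dataset_id": "0f3a...",  # primary key; concrete type depends on datasetIdColumnType
    "path": "sub/dir/some_dataset.json",  # relative to the datastore root (or an absolute URI)
    "formatter": "mypkg.formatters.MyFormatter",  # fully qualified formatter class name
    "storage_class": "StructuredDataDict",
    "component": "",  # empty string means "no component"
    "checksum": None,  # only populated when checksums are enabled
    "file_size": 12345,
}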
236 def __init__(
237 self,
238 config: DatastoreConfig | ResourcePathExpression,
239 bridgeManager: DatastoreRegistryBridgeManager,
240 butlerRoot: str | None = None,
241 ):
242 super().__init__(config, bridgeManager)
243 if "root" not in self.config:
244 raise ValueError("No root directory specified in configuration")
246 self._bridgeManager = bridgeManager
248 # Name ourselves either using an explicit name or a name
249 # derived from the (unexpanded) root
250 if "name" in self.config:
251 self.name = self.config["name"]
252 else:
253 # We use the unexpanded root in the name to indicate that this
254 # datastore can be moved without having to update registry.
255 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
257 # Support repository relocation in config
258 # Existence of self.root is checked in subclass
259 self.root = ResourcePath(
260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
261 )
263 self.locationFactory = LocationFactory(self.root)
264 self.formatterFactory = FormatterFactory()
266 # Now associate formatters with storage classes
267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
269 # Read the file naming templates
270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
272 # See if composites should be disassembled
273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
275 tableName = self.config["records", "table"]
276 try:
277 # Storage of paths and formatters, keyed by dataset_id
278 self._table = bridgeManager.opaque.register(
279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
280 )
281 # Interface to Registry.
282 self._bridge = bridgeManager.register(self.name)
283 except ReadOnlyDatabaseError:
284 # If the database is read only and we just tried and failed to
285 # create a table, it means someone is trying to create a read-only
286 # butler client for an empty repo. That should be okay, as long
287 # as they then try to get any datasets before some other client
288 # creates the table. Chances are they're just validating
289 # configuration.
290 pass
292 # Determine whether checksums should be used - default to False
293 self.useChecksum = self.config.get("checksum", False)
295 # Determine whether we can fall back to configuration if a
296 # requested dataset is not known to registry
297 self.trustGetRequest = self.config.get("trust_get_request", False)
299 # Create a cache manager
300 self.cacheManager: AbstractDatastoreCacheManager
301 if "cached" in self.config:
302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
303 else:
304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
306 # Check existence and create directory structure if necessary
307 if not self.root.exists():
308 if "create" not in self.config or not self.config["create"]:
309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
310 try:
311 self.root.mkdir()
312 except Exception as e:
313 raise ValueError(
314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
315 ) from e
317 def __str__(self) -> str:
318 return str(self.root)
320 @property
321 def bridge(self) -> DatastoreRegistryBridge:
322 return self._bridge
324 @property
325 def roots(self) -> dict[str, ResourcePath | None]:
326 # Docstring inherited.
327 return {self.name: self.root}
329 def _artifact_exists(self, location: Location) -> bool:
330 """Check that an artifact exists in this datastore at the specified
331 location.
333 Parameters
334 ----------
335 location : `Location`
336 Expected location of the artifact associated with this datastore.
338 Returns
339 -------
340 exists : `bool`
341 `True` if the location can be found, `False` otherwise.
342 """
343 log.debug("Checking if resource exists: %s", location.uri)
344 return location.uri.exists()
346 def _delete_artifact(self, location: Location) -> None:
347 """Delete the artifact from the datastore.
349 Parameters
350 ----------
351 location : `Location`
352 Location of the artifact associated with this datastore.
353 """
354 if location.pathInStore.isabs():
355 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
357 try:
358 location.uri.remove()
359 except FileNotFoundError:
360 log.debug("File %s did not exist and so could not be deleted.", location.uri)
361 raise
362 except Exception as e:
363 log.critical("Failed to delete file: %s (%s)", location.uri, e)
364 raise
365 log.debug("Successfully deleted file: %s", location.uri)
367 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
368 # Docstring inherited from GenericBaseDatastore
369 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)]
370 self._table.insert(*records, transaction=self._transaction)
372 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]:
373 # Docstring inherited from GenericBaseDatastore
375 # Look for the dataset_id -- there might be multiple matches
376 # if we have disassembled the dataset.
377 records = self._table.fetch(dataset_id=ref.id)
378 return [StoredFileInfo.from_record(record) for record in records]
380 def _get_stored_records_associated_with_refs(
381 self, refs: Iterable[DatasetIdRef]
382 ) -> dict[DatasetId, list[StoredFileInfo]]:
383 """Retrieve all records associated with the provided refs.
385 Parameters
386 ----------
387 refs : iterable of `DatasetIdRef`
388 The refs for which records are to be retrieved.
390 Returns
391 -------
392 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
393 The matching records indexed by the ref ID. The number of entries
394 in the dict can be smaller than the number of requested refs.
395 """
396 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
398 # Uniqueness is dataset_id + component so can have multiple records
399 # per ref.
400 records_by_ref = defaultdict(list)
401 for record in records:
402 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
403 return records_by_ref
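# The returned mapping groups the opaque-table rows by dataset ID; a
# disassembled composite therefore contributes several StoredFileInfo entries
# under a single key. Illustrative shape (IDs and components hypothetical):
#
#     {dataset_id_1: [StoredFileInfo(component="image", ...),
#                     StoredFileInfo(component="mask", ...)],
#      dataset_id_2: [StoredFileInfo(component=None, ...)]}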
405 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
406 """Return paths and associated dataset refs.
408 Parameters
409 ----------
410 paths : `list` of `str` or `lsst.resources.ResourcePath`
411 All the paths to include in search.
413 Returns
414 -------
415 mapping : `dict` of [`str`, `set` [`DatasetId`]]
416 Mapping of each path to a set of associated database IDs.
417 """
418 records = self._table.fetch(path=[str(path) for path in paths])
419 result = defaultdict(set)
420 for row in records:
421 result[row["path"]].add(row["dataset_id"])
422 return result
424 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
425 """Return all dataset refs associated with the supplied path.
427 Parameters
428 ----------
429 pathInStore : `lsst.resources.ResourcePath`
430 Path of interest in the data store.
432 Returns
433 -------
434 ids : `set` [`DatasetId`]
435 All `DatasetRef` IDs associated with this path.
436 """
437 records = list(self._table.fetch(path=str(pathInStore)))
438 ids = {r["dataset_id"] for r in records}
439 return ids
441 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
442 # Docstring inherited from GenericBaseDatastore
443 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
445 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]:
446 r"""Find all the `Location`\ s of the requested dataset in the
447 `Datastore` and the associated stored file information.
449 Parameters
450 ----------
451 ref : `DatasetRef`
452 Reference to the required `Dataset`.
454 Returns
455 -------
456 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
457 Location of the dataset within the datastore and
458 stored information about each file and its formatter.
459 """
460 # Get the file information (this will fail if no file)
461 records = self.getStoredItemsInfo(ref)
463 # Use the path to determine the location -- we need to take
464 # into account absolute URIs in the datastore record
465 return [(r.file_location(self.locationFactory), r) for r in records]
467 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
468 """Check that there is only one dataset associated with the
469 specified artifact.
471 Parameters
472 ----------
473 ref : `DatasetRef` or `FakeDatasetRef`
474 Dataset to be removed.
475 location : `Location`
476 The location of the artifact to be removed.
478 Returns
479 -------
480 can_remove : `bool`
481 `True` if the artifact can be safely removed.
482 """
483 # Can't ever delete absolute URIs.
484 if location.pathInStore.isabs():
485 return False
487 # Get all entries associated with this path
488 allRefs = self._registered_refs_per_artifact(location.pathInStore)
489 if not allRefs:
490 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
492 # Remove these refs from all the refs and if there is nothing left
493 # then we can delete
494 remainingRefs = allRefs - {ref.id}
496 if remainingRefs:
497 return False
498 return True
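# Minimal standalone sketch of the reference-counting check above: the
# artifact may only be deleted when no other dataset ID still points at the
# same path (the IDs here are arbitrary strings for illustration).
def _can_delete_artifact(all_refs: set[str], ref_id: str) -> bool:
    remaining = all_refs - {ref_id}
    return not remaining

assert _can_delete_artifact({"a"}, "a") is True
assert _can_delete_artifact({"a", "b"}, "a") is False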
500 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
501 """Predict the location and related file information of the requested
502 dataset in this datastore.
504 Parameters
505 ----------
506 ref : `DatasetRef`
507 Reference to the required `Dataset`.
509 Returns
510 -------
511 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
512 Expected Location of the dataset within the datastore and
513 placeholder information about each file and its formatter.
515 Notes
516 -----
517 Uses the current configuration to determine how we would expect the
518 datastore files to have been written if we couldn't ask registry.
519 This is safe so long as there has been no change to datastore
520 configuration between writing the dataset and wanting to read it.
521 Will not work for files that have been ingested without using the
522 standard file template or default formatter.
523 """
524 # If we have a component ref we always need to ask the questions
525 # of the composite. If the composite is disassembled this routine
526 # should return all components. If the composite was not
527 # disassembled the composite is what is stored regardless of
528 # component request. Note that if the caller has disassembled
529 # a composite there is no way for this guess to know that
530 # without trying both the composite and component ref and seeing
531 # if there is something at the component Location even without
532 # disassembly being enabled.
533 if ref.datasetType.isComponent():
534 ref = ref.makeCompositeRef()
536 # See if the ref is a composite that should be disassembled
537 doDisassembly = self.composites.shouldBeDisassembled(ref)
539 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
541 if doDisassembly:
542 for component, componentStorage in ref.datasetType.storageClass.components.items():
543 compRef = ref.makeComponentRef(component)
544 location, formatter = self._determine_put_formatter_location(compRef)
545 all_info.append((location, formatter, componentStorage, component))
547 else:
548 # Always use the composite ref if no disassembly
549 location, formatter = self._determine_put_formatter_location(ref)
550 all_info.append((location, formatter, ref.datasetType.storageClass, None))
552 # Convert the list of tuples to have StoredFileInfo as second element
553 return [
554 (
555 location,
556 StoredFileInfo(
557 formatter=formatter,
558 path=location.pathInStore.path,
559 storageClass=storageClass,
560 component=component,
561 checksum=None,
562 file_size=-1,
563 dataset_id=ref.id,
564 ),
565 )
566 for location, formatter, storageClass, component in all_info
567 ]
569 def _prepare_for_get(
570 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
571 ) -> list[DatastoreFileGetInformation]:
572 """Check parameters for ``get`` and obtain formatter and
573 location.
575 Parameters
576 ----------
577 ref : `DatasetRef`
578 Reference to the required Dataset.
579 parameters : `dict`
580 `StorageClass`-specific parameters that specify, for example,
581 a slice of the dataset to be loaded.
583 Returns
584 -------
585 getInfo : `list` [`DatastoreFileGetInformation`]
586 Parameters needed to retrieve each file.
587 """
588 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
590 # The storage class we want to use eventually
591 refStorageClass = ref.datasetType.storageClass
593 # For trusted mode need to reset storage class.
594 ref = self._cast_storage_class(ref)
596 # Get file metadata and internal metadata
597 fileLocations = self._get_dataset_locations_info(ref)
598 if not fileLocations:
599 if not self.trustGetRequest:
600 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
601 # Assume the dataset is where we think it should be
602 fileLocations = self._get_expected_dataset_locations_info(ref)
604 if len(fileLocations) > 1:
605 disassembled = True
607 # If trust is involved it is possible that there will be
608 # components listed here that do not exist in the datastore.
609 # Explicitly check for file artifact existence and filter out any
610 # that are missing.
611 if self.trustGetRequest:
612 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
614 # For now complain only if we have no components at all. One
615 # component is probably a problem but we can punt that to the
616 # assembler.
617 if not fileLocations:
618 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
620 else:
621 disassembled = False
623 # Is this a component request?
624 refComponent = ref.datasetType.component()
626 fileGetInfo = []
627 for location, storedFileInfo in fileLocations:
628 # The storage class used to write the file
629 writeStorageClass = storedFileInfo.storageClass
631 # If this has been disassembled we need read to match the write
632 if disassembled:
633 readStorageClass = writeStorageClass
634 else:
635 readStorageClass = refStorageClass
637 formatter = get_instance_of(
638 storedFileInfo.formatter,
639 FileDescriptor(
640 location,
641 readStorageClass=readStorageClass,
642 storageClass=writeStorageClass,
643 parameters=parameters,
644 ),
645 ref.dataId,
646 )
648 formatterParams, notFormatterParams = formatter.segregateParameters()
650 # Of the remaining parameters, extract the ones supported by
651 # this StorageClass (for components not all will be handled)
652 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
654 # The ref itself could be a component if the dataset was
655 # disassembled by butler, or we disassembled in datastore and
656 # components came from the datastore records
657 component = storedFileInfo.component if storedFileInfo.component else refComponent
659 fileGetInfo.append(
660 DatastoreFileGetInformation(
661 location,
662 formatter,
663 storedFileInfo,
664 assemblerParams,
665 formatterParams,
666 component,
667 readStorageClass,
668 )
669 )
671 return fileGetInfo
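# Sketch of the read-storage-class choice made in the loop above: components
# of a disassembled composite must be read back with the storage class they
# were written with, otherwise the storage class of the requested ref wins.
def _select_read_storage_class(disassembled: bool, write_sc: str, ref_sc: str) -> str:
    return write_sc if disassembled else ref_sc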
673 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
674 """Check the arguments for ``put`` and obtain formatter and
675 location.
677 Parameters
678 ----------
679 inMemoryDataset : `object`
680 The dataset to store.
681 ref : `DatasetRef`
682 Reference to the associated Dataset.
684 Returns
685 -------
686 location : `Location`
687 The location to write the dataset.
688 formatter : `Formatter`
689 The `Formatter` to use to write the dataset.
691 Raises
692 ------
693 TypeError
694 Supplied object and storage class are inconsistent.
695 DatasetTypeNotSupportedError
696 The associated `DatasetType` is not handled by this datastore.
697 """
698 self._validate_put_parameters(inMemoryDataset, ref)
699 return self._determine_put_formatter_location(ref)
701 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
702 """Calculate the formatter and output location to use for put.
704 Parameters
705 ----------
706 ref : `DatasetRef`
707 Reference to the associated Dataset.
709 Returns
710 -------
711 location : `Location`
712 The location to write the dataset.
713 formatter : `Formatter`
714 The `Formatter` to use to write the dataset.
715 """
716 # Work out output file name
717 try:
718 template = self.templates.getTemplate(ref)
719 except KeyError as e:
720 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
722 # Validate the template to protect against filenames from different
723 # dataIds returning the same and causing overwrite confusion.
724 template.validateTemplate(ref)
726 location = self.locationFactory.fromPath(template.format(ref))
728 # Get the formatter based on the storage class
729 storageClass = ref.datasetType.storageClass
730 try:
731 formatter = self.formatterFactory.getFormatter(
732 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
733 )
734 except KeyError as e:
735 raise DatasetTypeNotSupportedError(
736 f"Unable to find formatter for {ref} in datastore {self.name}"
737 ) from e
739 # Now that we know the formatter, update the location
740 location = formatter.makeUpdatedLocation(location)
742 return location, formatter
744 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
745 # Docstring inherited from base class
746 if transfer != "auto":
747 return transfer
749 # See if the paths are within the datastore or not
750 inside = [self._pathInStore(d.path) is not None for d in datasets]
752 if all(inside):
753 transfer = None
754 elif not any(inside):
755 # Allow ResourcePath to use its own knowledge
756 transfer = "auto"
757 else:
758 # This can happen when importing from a datastore that
759 # has had some datasets ingested using "direct" mode.
760 # Let ResourcePath sort it out, but warn about it.
763 log.warning(
764 "Some datasets are inside the datastore and some are outside. Using 'split' "
765 "transfer mode. This assumes that the files outside the datastore are "
766 "still accessible to the new butler since they will not be copied into "
767 "the target datastore."
768 )
769 transfer = "split"
771 return transfer
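# Standalone sketch of the "auto" transfer-mode resolution above, assuming
# `inside` is the list of booleans computed from _pathInStore() (True when a
# file is already inside the datastore root):
def _resolve_auto_transfer(inside: list[bool]) -> str | None:
    if all(inside):
        return None  # everything already in place, no transfer needed
    if not any(inside):
        return "auto"  # let ResourcePath decide how to transfer
    return "split"  # mixture: external files are ingested "direct", the rest moved

assert _resolve_auto_transfer([True, True]) is None
assert _resolve_auto_transfer([False, False]) == "auto"
assert _resolve_auto_transfer([True, False]) == "split"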
773 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
774 """Return path relative to datastore root.
776 Parameters
777 ----------
778 path : `lsst.resources.ResourcePathExpression`
779 Path to dataset. Can be an absolute URI. If relative, it is
780 assumed to be relative to the datastore root.
783 Returns
784 -------
785 inStore : `str` or `None`
786 Path relative to datastore root. Returns `None` if the file is
787 outside the root.
788 """
789 # Relative path will always be relative to datastore
790 pathUri = ResourcePath(path, forceAbsolute=False)
791 return pathUri.relative_to(self.root)
793 def _standardizeIngestPath(
794 self, path: str | ResourcePath, *, transfer: str | None = None
795 ) -> str | ResourcePath:
796 """Standardize the path of a to-be-ingested file.
798 Parameters
799 ----------
800 path : `str` or `lsst.resources.ResourcePath`
801 Path of a file to be ingested. This parameter is not expected
802 to be all the types that can be used to construct a
803 `~lsst.resources.ResourcePath`.
804 transfer : `str`, optional
805 How (and whether) the dataset should be added to the datastore.
806 See `ingest` for details of transfer modes.
807 This implementation is provided only so
808 `NotImplementedError` can be raised if the mode is not supported;
809 actual transfers are deferred to `_extractIngestInfo`.
811 Returns
812 -------
813 path : `str` or `lsst.resources.ResourcePath`
814 New path in what the datastore considers standard form. If an
815 absolute URI was given that will be returned unchanged.
817 Notes
818 -----
819 Subclasses of `FileDatastore` can implement this method instead
820 of `_prepIngest`. It should not modify the data repository or given
821 file in any way.
823 Raises
824 ------
825 NotImplementedError
826 Raised if the datastore does not support the given transfer mode
827 (including the case where ingest is not supported at all).
828 FileNotFoundError
829 Raised if one of the given files does not exist.
830 """
831 if transfer not in (None, "direct", "split") + self.root.transferModes:
832 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
834 # A relative URI indicates relative to datastore root
835 srcUri = ResourcePath(path, forceAbsolute=False)
836 if not srcUri.isabs():
837 srcUri = self.root.join(path)
839 if not srcUri.exists():
840 raise FileNotFoundError(
841 f"Resource at {srcUri} does not exist; note that paths to ingest "
842 f"are assumed to be relative to {self.root} unless they are absolute."
843 )
845 if transfer is None:
846 relpath = srcUri.relative_to(self.root)
847 if not relpath:
848 raise RuntimeError(
849 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
850 )
852 # Return the relative path within the datastore for internal
853 # transfer
854 path = relpath
856 return path
858 def _extractIngestInfo(
859 self,
860 path: ResourcePathExpression,
861 ref: DatasetRef,
862 *,
863 formatter: Formatter | type[Formatter],
864 transfer: str | None = None,
865 record_validation_info: bool = True,
866 ) -> StoredFileInfo:
867 """Relocate (if necessary) and extract `StoredFileInfo` from a
868 to-be-ingested file.
870 Parameters
871 ----------
872 path : `lsst.resources.ResourcePathExpression`
873 URI or path of a file to be ingested.
874 ref : `DatasetRef`
875 Reference for the dataset being ingested. Guaranteed to have
876 a ``dataset_id`` that is not `None`.
877 formatter : `type` or `Formatter`
878 `Formatter` subclass to use for this dataset or an instance.
879 transfer : `str`, optional
880 How (and whether) the dataset should be added to the datastore.
881 See `ingest` for details of transfer modes.
882 record_validation_info : `bool`, optional
883 If `True`, the default, the datastore can record validation
884 information associated with the file. If `False` the datastore
885 will not attempt to track any information such as checksums
886 or file sizes. This can be useful if such information is tracked
887 in an external system or if the file is to be compressed in place.
888 It is up to the datastore whether this parameter is relevant.
890 Returns
891 -------
892 info : `StoredFileInfo`
893 Internal datastore record for this file. This will be inserted by
894 the caller; the `_extractIngestInfo` is only responsible for
895 creating and populating the struct.
897 Raises
898 ------
899 FileNotFoundError
900 Raised if one of the given files does not exist.
901 FileExistsError
902 Raised if transfer is not `None` but the (internal) location the
903 file would be moved to is already occupied.
904 """
905 if self._transaction is None:
906 raise RuntimeError("Ingest called without transaction enabled")
908 # Create URI of the source path, do not need to force a relative
909 # path to absolute.
910 srcUri = ResourcePath(path, forceAbsolute=False)
912 # Track whether we have read the size of the source yet
913 have_sized = False
915 tgtLocation: Location | None
916 if transfer is None or transfer == "split":
917 # A relative path is assumed to be relative to the datastore
918 # in this context
919 if not srcUri.isabs():
920 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
921 else:
922 # Work out the path in the datastore from an absolute URI
923 # This is required to be within the datastore.
924 pathInStore = srcUri.relative_to(self.root)
925 if pathInStore is None and transfer is None:
926 raise RuntimeError(
927 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
928 )
929 if pathInStore:
930 tgtLocation = self.locationFactory.fromPath(pathInStore)
931 elif transfer == "split":
932 # Outside the datastore but treat that as a direct ingest
933 # instead.
934 tgtLocation = None
935 else:
936 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
937 elif transfer == "direct":
938 # Want to store the full URI to the resource directly in
939 # datastore. This is useful for referring to permanent archive
940 # storage for raw data.
941 # Trust that people know what they are doing.
942 tgtLocation = None
943 else:
944 # Work out the name we want this ingested file to have
945 # inside the datastore
946 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
947 if not tgtLocation.uri.dirname().exists():
948 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
949 tgtLocation.uri.dirname().mkdir()
951 # if we are transferring from a local file to a remote location
952 # it may be more efficient to get the size and checksum of the
953 # local file rather than the transferred one
954 if record_validation_info and srcUri.isLocal:
955 size = srcUri.size()
956 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
957 have_sized = True
959 # Transfer the resource to the destination.
960 # Allow overwrite of an existing file. This matches the behavior
961 # of datastore.put() in that it trusts that registry would not
962 # be asking to overwrite unless registry thought that the
963 # overwrite was allowed.
964 tgtLocation.uri.transfer_from(
965 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
966 )
968 if tgtLocation is None:
969 # This means we are using direct mode
970 targetUri = srcUri
971 targetPath = str(srcUri)
972 else:
973 targetUri = tgtLocation.uri
974 targetPath = tgtLocation.pathInStore.path
976 # the file should exist in the datastore now
977 if record_validation_info:
978 if not have_sized:
979 size = targetUri.size()
980 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
981 else:
982 # Not recording any file information.
983 size = -1
984 checksum = None
986 return StoredFileInfo(
987 formatter=formatter,
988 path=targetPath,
989 storageClass=ref.datasetType.storageClass,
990 component=ref.datasetType.component(),
991 file_size=size,
992 checksum=checksum,
993 dataset_id=ref.id,
994 )
996 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
997 # Docstring inherited from Datastore._prepIngest.
998 filtered = []
999 for dataset in datasets:
1000 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1001 if not acceptable:
1002 continue
1003 else:
1004 dataset.refs = acceptable
1005 if dataset.formatter is None:
1006 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1007 else:
1008 assert isinstance(dataset.formatter, (type, str))
1009 formatter_class = get_class_of(dataset.formatter)
1010 if not issubclass(formatter_class, Formatter):
1011 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1012 dataset.formatter = formatter_class
1013 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1014 filtered.append(dataset)
1015 return _IngestPrepData(filtered)
1017 @transactional
1018 def _finishIngest(
1019 self,
1020 prepData: Datastore.IngestPrepData,
1021 *,
1022 transfer: str | None = None,
1023 record_validation_info: bool = True,
1024 ) -> None:
1025 # Docstring inherited from Datastore._finishIngest.
1026 refsAndInfos = []
1027 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1028 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1029 # Do ingest as if the first dataset ref is associated with the file
1030 info = self._extractIngestInfo(
1031 dataset.path,
1032 dataset.refs[0],
1033 formatter=dataset.formatter,
1034 transfer=transfer,
1035 record_validation_info=record_validation_info,
1036 )
1037 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1038 self._register_datasets(refsAndInfos)
1040 def _calculate_ingested_datastore_name(
1041 self,
1042 srcUri: ResourcePath,
1043 ref: DatasetRef,
1044 formatter: Formatter | type[Formatter] | None = None,
1045 ) -> Location:
1046 """Given a source URI and a DatasetRef, determine the name the
1047 dataset will have inside datastore.
1049 Parameters
1050 ----------
1051 srcUri : `lsst.resources.ResourcePath`
1052 URI to the source dataset file.
1053 ref : `DatasetRef`
1054 Ref associated with the newly-ingested dataset artifact. This
1055 is used to determine the name within the datastore.
1056 formatter : `Formatter` or `type` of `Formatter`, optional
1057 Formatter to use for validation. Can be a class or an instance.
1058 No validation of the file extension is performed if the
1059 ``formatter`` is `None`. This can be used if the caller knows
1060 that the source URI and target URI will use the same formatter.
1062 Returns
1063 -------
1064 location : `Location`
1065 Target location for the newly-ingested dataset.
1066 """
1067 # Ingesting a file from outside the datastore.
1068 # This involves a new name.
1069 template = self.templates.getTemplate(ref)
1070 location = self.locationFactory.fromPath(template.format(ref))
1072 # Get the extension
1073 ext = srcUri.getExtension()
1075 # Update the destination to include that extension
1076 location.updateExtension(ext)
1078 # Ask the formatter to validate this extension
1079 if formatter is not None:
1080 formatter.validateExtension(location)
1082 return location
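# Sketch of the extension handling above, assuming a hypothetical template
# result and source file name: the ingested artifact keeps the extension of
# the source file.
import os

def _ingested_name(template_result: str, src_name: str) -> str:
    _, ext = os.path.splitext(src_name)
    return template_result + ext

assert _ingested_name("raw/exposure_123", "/staging/file.fits") == "raw/exposure_123.fits"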
1084 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1085 """Write out in memory dataset to datastore.
1087 Parameters
1088 ----------
1089 inMemoryDataset : `object`
1090 Dataset to write to datastore.
1091 ref : `DatasetRef`
1092 Registry information associated with this dataset.
1094 Returns
1095 -------
1096 info : `StoredFileInfo`
1097 Information describing the artifact written to the datastore.
1098 """
1099 # May need to coerce the in memory dataset to the correct
1100 # python type, but first we need to make sure the storage class
1101 # reflects the one defined in the data repository.
1102 ref = self._cast_storage_class(ref)
1103 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1105 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1106 uri = location.uri
1108 if not uri.dirname().exists():
1109 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1110 uri.dirname().mkdir()
1112 if self._transaction is None:
1113 raise RuntimeError("Attempting to write artifact without transaction enabled")
1115 def _removeFileExists(uri: ResourcePath) -> None:
1116 """Remove a file and do not complain if it is not there.
1118 This is important since a formatter might fail before the file
1119 is written and we should not confuse people by writing spurious
1120 error messages to the log.
1121 """
1122 try:
1123 uri.remove()
1124 except FileNotFoundError:
1125 pass
1127 # Register a callback to try to delete the uploaded data if
1128 # something fails below
1129 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1131 data_written = False
1132 if not uri.isLocal:
1133 # This is a remote URI. Some datasets can be serialized directly
1134 # to bytes and sent to the remote datastore without writing a
1135 # file. If the dataset is intended to be saved to the cache
1136 # a file is always written and direct write to the remote
1137 # datastore is bypassed.
1138 if not self.cacheManager.should_be_cached(ref):
1139 try:
1140 serializedDataset = formatter.toBytes(inMemoryDataset)
1141 except NotImplementedError:
1142 # Fallback to the file writing option.
1143 pass
1144 except Exception as e:
1145 raise RuntimeError(
1146 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1147 ) from e
1148 else:
1149 log.debug("Writing bytes directly to %s", uri)
1150 uri.write(serializedDataset, overwrite=True)
1151 log.debug("Successfully wrote bytes directly to %s", uri)
1152 data_written = True
1154 if not data_written:
1155 # Did not write the bytes directly to object store so instead
1156 # write to temporary file. Always write to a temporary even if
1157 # using a local file system -- that gives us atomic writes.
1158 # If a process is killed as the file is being written we do not
1159 # want it to remain in the correct place but in corrupt state.
1160 # For local files write to the output directory not temporary dir.
1161 prefix = uri.dirname() if uri.isLocal else None
1162 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1163 # Need to configure the formatter to write to a different
1164 # location and that needs us to overwrite internals
1165 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1166 with formatter._updateLocation(Location(None, temporary_uri)):
1167 try:
1168 formatter.write(inMemoryDataset)
1169 except Exception as e:
1170 raise RuntimeError(
1171 f"Failed to serialize dataset {ref} of type"
1172 f" {type(inMemoryDataset)} to "
1173 f"temporary location {temporary_uri}"
1174 ) from e
1176 # Use move for a local file since that becomes an efficient
1177 # os.rename. For remote resources we use copy to allow the
1178 # file to be cached afterwards.
1179 transfer = "move" if uri.isLocal else "copy"
1181 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1183 if transfer == "copy":
1184 # Cache if required
1185 self.cacheManager.move_to_cache(temporary_uri, ref)
1187 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1189 # URI is needed to resolve what ingest case are we dealing with
1190 return self._extractIngestInfo(uri, ref, formatter=formatter)
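# Sketch of the write-path choice made above, reduced to plain booleans
# (is_local: the destination URI is on the local filesystem; wants_cache:
# the cache manager says this ref should be cached; can_serialize: the
# formatter implements toBytes()):
def _choose_write_path(is_local: bool, wants_cache: bool, can_serialize: bool) -> str:
    if not is_local and not wants_cache and can_serialize:
        return "direct-bytes"  # upload serialized bytes straight to the object store
    # Otherwise write to a temporary file first; "move" is an atomic rename
    # for local targets, "copy" keeps the temporary around long enough to cache it.
    return "temporary-file-move" if is_local else "temporary-file-copy"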
1192 def _read_artifact_into_memory(
1193 self,
1194 getInfo: DatastoreFileGetInformation,
1195 ref: DatasetRef,
1196 isComponent: bool = False,
1197 cache_ref: DatasetRef | None = None,
1198 ) -> Any:
1199 """Read the artifact from datastore into in memory object.
1201 Parameters
1202 ----------
1203 getInfo : `DatastoreFileGetInformation`
1204 Information about the artifact within the datastore.
1205 ref : `DatasetRef`
1206 The registry information associated with this artifact.
1207 isComponent : `bool`
1208 Flag to indicate if a component is being read from this artifact.
1209 cache_ref : `DatasetRef`, optional
1210 The DatasetRef to use when looking up the file in the cache.
1211 This ref must have the same ID as the supplied ref but can
1212 be a parent ref or component ref to indicate to the cache whether
1213 a composite file is being requested from the cache or a component
1214 file. Without this the cache will default to the supplied ref but
1215 it can get confused with read-only derived components for
1216 disassembled composites.
1218 Returns
1219 -------
1220 inMemoryDataset : `object`
1221 The artifact as a python object.
1222 """
1223 location = getInfo.location
1224 uri = location.uri
1225 log.debug("Accessing data from %s", uri)
1227 if cache_ref is None:
1228 cache_ref = ref
1229 if cache_ref.id != ref.id:
1230 raise ValueError(
1231 "The supplied cache dataset ref refers to a different dataset than expected:"
1232 f" {ref.id} != {cache_ref.id}"
1233 )
1235 # Cannot recalculate checksum but can compare size as a quick check
1236 # Do not do this if the size is negative since that indicates
1237 # we do not know.
1238 recorded_size = getInfo.info.file_size
1239 resource_size = uri.size()
1240 if recorded_size >= 0 and resource_size != recorded_size:
1241 raise RuntimeError(
1242 "Integrity failure in Datastore. "
1243 f"Size of file {uri} ({resource_size}) "
1244 f"does not match size recorded in registry of {recorded_size}"
1245 )
1247 # For the general case we have choices for how to proceed.
1248 # 1. Always use a local file (downloading the remote resource to a
1249 # temporary file if needed).
1250 # 2. Use a threshold size and read into memory and use bytes.
1251 # Use both for now with an arbitrary hand off size.
1252 # This allows small datasets to be downloaded from remote object
1253 # stores without requiring a temporary file.
1255 formatter = getInfo.formatter
1256 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1257 if resource_size <= nbytes_max and formatter.can_read_bytes():
1258 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1259 if cached_file is not None:
1260 desired_uri = cached_file
1261 msg = f" (cached version of {uri})"
1262 else:
1263 desired_uri = uri
1264 msg = ""
1265 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1266 serializedDataset = desired_uri.read()
1267 log.debug(
1268 "Deserializing %s from %d bytes from location %s with formatter %s",
1269 f"component {getInfo.component}" if isComponent else "",
1270 len(serializedDataset),
1271 uri,
1272 formatter.name(),
1273 )
1274 try:
1275 result = formatter.fromBytes(
1276 serializedDataset, component=getInfo.component if isComponent else None
1277 )
1278 except Exception as e:
1279 raise ValueError(
1280 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1281 f" ({ref.datasetType.name} from {uri}): {e}"
1282 ) from e
1283 else:
1284 # Read from file.
1286 # Have to update the Location associated with the formatter
1287 # because formatter.read does not allow an override.
1288 # This could be improved.
1289 location_updated = False
1290 msg = ""
1292 # First check in cache for local version.
1293 # The cache will only be relevant for remote resources but
1294 # no harm in always asking. Context manager ensures that cache
1295 # file is not deleted during cache expiration.
1296 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1297 if cached_file is not None:
1298 msg = f"(via cache read of remote file {uri})"
1299 uri = cached_file
1300 location_updated = True
1302 with uri.as_local() as local_uri:
1303 can_be_cached = False
1304 if uri != local_uri:
1305 # URI was remote and file was downloaded
1306 cache_msg = ""
1307 location_updated = True
1309 if self.cacheManager.should_be_cached(cache_ref):
1310 # In this scenario we want to ask if the downloaded
1311 # file should be cached but we should not cache
1312 # it until after we've used it (to ensure it can't
1313 # be expired whilst we are using it).
1314 can_be_cached = True
1316 # Say that it is "likely" to be cached because
1317 # if the formatter read fails we will not be
1318 # caching this file.
1319 cache_msg = " and likely cached"
1321 msg = f"(via download to local file{cache_msg})"
1323 # Calculate the (possibly) new location for the formatter
1324 # to use.
1325 newLocation = Location(*local_uri.split()) if location_updated else None
1327 log.debug(
1328 "Reading%s from location %s %s with formatter %s",
1329 f" component {getInfo.component}" if isComponent else "",
1330 uri,
1331 msg,
1332 formatter.name(),
1333 )
1334 try:
1335 with formatter._updateLocation(newLocation):
1336 with time_this(
1337 log,
1338 msg="Reading%s from location %s %s with formatter %s",
1339 args=(
1340 f" component {getInfo.component}" if isComponent else "",
1341 uri,
1342 msg,
1343 formatter.name(),
1344 ),
1345 ):
1346 result = formatter.read(component=getInfo.component if isComponent else None)
1347 except Exception as e:
1348 raise ValueError(
1349 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1350 f" ({ref.datasetType.name} from {uri}): {e}"
1351 ) from e
1353 # File was read successfully so can move to cache
1354 if can_be_cached:
1355 self.cacheManager.move_to_cache(local_uri, cache_ref)
1357 return self._post_process_get(
1358 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
1359 )
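# Sketch of the read-path selection above: small artifacts whose formatter can
# deserialize from bytes are read directly (possibly from the cache); anything
# else goes through a local file. The 10 MB threshold mirrors the hand-off
# size used in the method.
_NBYTES_MAX = 10_000_000

def _choose_read_path(resource_size: int, can_read_bytes: bool) -> str:
    if resource_size <= _NBYTES_MAX and can_read_bytes:
        return "read-bytes"
    return "read-via-local-file"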
1361 def knows(self, ref: DatasetRef) -> bool:
1362 """Check if the dataset is known to the datastore.
1364 Does not check for existence of any artifact.
1366 Parameters
1367 ----------
1368 ref : `DatasetRef`
1369 Reference to the required dataset.
1371 Returns
1372 -------
1373 exists : `bool`
1374 `True` if the dataset is known to the datastore.
1375 """
1376 fileLocations = self._get_dataset_locations_info(ref)
1377 if fileLocations:
1378 return True
1379 return False
1381 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1382 # Docstring inherited from the base class.
1384 # The records themselves. Could be missing some entries.
1385 records = self._get_stored_records_associated_with_refs(refs)
1387 return {ref: ref.id in records for ref in refs}
1389 def _process_mexists_records(
1390 self,
1391 id_to_ref: dict[DatasetId, DatasetRef],
1392 records: dict[DatasetId, list[StoredFileInfo]],
1393 all_required: bool,
1394 artifact_existence: dict[ResourcePath, bool] | None = None,
1395 ) -> dict[DatasetRef, bool]:
1396 """Check given records for existence.
1398 Helper function for `mexists()`.
1400 Parameters
1401 ----------
1402 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1403 Mapping of the dataset ID to the dataset ref itself.
1404 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1405 Records as generally returned by
1406 ``_get_stored_records_associated_with_refs``.
1407 all_required : `bool`
1408 Flag to indicate whether existence requires all artifacts
1409 associated with a dataset ID to exist or not for existence.
1410 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1411 Optional mapping of datastore artifact to existence. Updated by
1412 this method with details of all artifacts tested. Can be `None`
1413 if the caller is not interested.
1415 Returns
1416 -------
1417 existence : `dict` of [`DatasetRef`, `bool`]
1418 Mapping from dataset to boolean indicating existence.
1419 """
1420 # The URIs to be checked and a mapping of those URIs to
1421 # the dataset ID.
1422 uris_to_check: list[ResourcePath] = []
1423 location_map: dict[ResourcePath, DatasetId] = {}
1425 location_factory = self.locationFactory
1427 uri_existence: dict[ResourcePath, bool] = {}
1428 for ref_id, infos in records.items():
1429 # Key is the dataset Id, value is list of StoredItemInfo
1430 uris = [info.file_location(location_factory).uri for info in infos]
1431 location_map.update({uri: ref_id for uri in uris})
1433 # Check the local cache directly for a dataset corresponding
1434 # to the remote URI.
1435 if self.cacheManager.file_count > 0:
1436 ref = id_to_ref[ref_id]
1437 for uri, storedFileInfo in zip(uris, infos):
1438 check_ref = ref
1439 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1440 check_ref = ref.makeComponentRef(component)
1441 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1442 # Proxy for URI existence.
1443 uri_existence[uri] = True
1444 else:
1445 uris_to_check.append(uri)
1446 else:
1447 # Check all of them.
1448 uris_to_check.extend(uris)
1450 if artifact_existence is not None:
1451 # If a URI has already been checked remove it from the list
1452 # and immediately add the status to the output dict.
1453 filtered_uris_to_check = []
1454 for uri in uris_to_check:
1455 if uri in artifact_existence:
1456 uri_existence[uri] = artifact_existence[uri]
1457 else:
1458 filtered_uris_to_check.append(uri)
1459 uris_to_check = filtered_uris_to_check
1461 # Results.
1462 dataset_existence: dict[DatasetRef, bool] = {}
1464 uri_existence.update(ResourcePath.mexists(uris_to_check))
1465 for uri, exists in uri_existence.items():
1466 dataset_id = location_map[uri]
1467 ref = id_to_ref[dataset_id]
1469 # Disassembled composite needs to check all locations.
1470 # all_required indicates whether all need to exist or not.
1471 if ref in dataset_existence:
1472 if all_required:
1473 exists = dataset_existence[ref] and exists
1474 else:
1475 exists = dataset_existence[ref] or exists
1476 dataset_existence[ref] = exists
1478 if artifact_existence is not None:
1479 artifact_existence.update(uri_existence)
1481 return dataset_existence
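# Sketch of how per-artifact results are folded into per-dataset existence:
# with all_required=True every artifact of a disassembled composite must
# exist; otherwise a single surviving artifact is enough.
def _fold_existence(per_artifact: list[bool], all_required: bool) -> bool:
    return all(per_artifact) if all_required else any(per_artifact)

assert _fold_existence([True, False], all_required=True) is False
assert _fold_existence([True, False], all_required=False) is True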
1483 def mexists(
1484 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1485 ) -> dict[DatasetRef, bool]:
1486 """Check the existence of multiple datasets at once.
1488 Parameters
1489 ----------
1490 refs : iterable of `DatasetRef`
1491 The datasets to be checked.
1492 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1493 Optional mapping of datastore artifact to existence. Updated by
1494 this method with details of all artifacts tested. Can be `None`
1495 if the caller is not interested.
1497 Returns
1498 -------
1499 existence : `dict` of [`DatasetRef`, `bool`]
1500 Mapping from dataset to boolean indicating existence.
1502 Notes
1503 -----
1504 To minimize potentially costly remote existence checks, the local
1505 cache is checked as a proxy for existence. If a file for this
1506 `DatasetRef` does exist no check is done for the actual URI. This
1507 could result in possibly unexpected behavior if the dataset itself
1508 has been removed from the datastore by another process whilst it is
1509 still in the cache.
1510 """
1511 chunk_size = 10_000
1512 dataset_existence: dict[DatasetRef, bool] = {}
1513 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1514 n_found_total = 0
1515 n_checked = 0
1516 n_chunks = 0
1517 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1518 chunk_result = self._mexists(chunk, artifact_existence)
1520 # The log message level and content depend on how many
1521 # datasets we are processing.
1522 n_results = len(chunk_result)
1524 # Use verbose logging to ensure that messages can be seen
1525 # easily if many refs are being checked.
1526 log_threshold = VERBOSE
1527 n_checked += n_results
1529 # This sum can take some time so only do it if we know the
1530 # result is going to be used.
1531 n_found = 0
1532 if log.isEnabledFor(log_threshold):
1533 # Can treat the booleans as 0, 1 integers and sum them.
1534 n_found = sum(chunk_result.values())
1535 n_found_total += n_found
1537 # We are deliberately not trying to count the number of refs
1538 # provided in case it's in the millions. This means there is a
1539 # situation where the number of refs exactly matches the chunk
1540 # size and we will switch to the multi-chunk path even though
1541 # we only have a single chunk.
1542 if n_results < chunk_size and n_chunks == 0:
1543 # Single chunk will be processed so we can provide more detail.
1544 if n_results == 1:
1545 ref = list(chunk_result)[0]
1546 # Use debug logging to be consistent with `exists()`.
1547 log.debug(
1548 "Calling mexists() with single ref that does%s exist (%s).",
1549 "" if chunk_result[ref] else " not",
1550 ref,
1551 )
1552 else:
1553 # Single chunk but multiple files. Summarize.
1554 log.log(
1555 log_threshold,
1556 "Number of datasets found in datastore: %d out of %d datasets checked.",
1557 n_found,
1558 n_checked,
1559 )
1561 else:
1562 # Use incremental verbose logging when we have multiple chunks.
1563 log.log(
1564 log_threshold,
1565 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1566 "(running total from all chunks so far: %d found out of %d checked)",
1567 n_chunks,
1568 n_found,
1569 n_results,
1570 n_found_total,
1571 n_checked,
1572 )
1573 dataset_existence.update(chunk_result)
1574 n_chunks += 1
1576 return dataset_existence
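# Hedged usage sketch (variable names are placeholders): sharing one
# artifact_existence dict across calls lets repeated checks reuse earlier
# per-URI results instead of re-querying the object store.
#
#     artifact_existence: dict[ResourcePath, bool] = {}
#     existence = datastore.mexists(refs, artifact_existence=artifact_existence)
#     missing = [ref for ref, ok in existence.items() if not ok]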
1578 def _mexists(
1579 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1580 ) -> dict[DatasetRef, bool]:
1581 """Check the existence of multiple datasets at once.
1583 Parameters
1584 ----------
1585 refs : iterable of `DatasetRef`
1586 The datasets to be checked.
1587 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1588 Optional mapping of datastore artifact to existence. Updated by
1589 this method with details of all artifacts tested. Can be `None`
1590 if the caller is not interested.
1592 Returns
1593 -------
1594 existence : `dict` of [`DatasetRef`, `bool`]
1595 Mapping from dataset to boolean indicating existence.
1596 """
1597 # Make a mapping from refs with the internal storage class to the given
1598 # refs that may have a different one. We'll use the internal refs
1599 # throughout this method and convert back at the very end.
1600 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1602 # Need a mapping of dataset_id to (internal) dataset ref since some
1603 # internal APIs work with dataset_id.
1604 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1606 # Set of all IDs we are checking for.
1607 requested_ids = set(id_to_ref.keys())
1609 # The records themselves. Could be missing some entries.
1610 records = self._get_stored_records_associated_with_refs(id_to_ref.values())
1612 dataset_existence = self._process_mexists_records(
1613 id_to_ref, records, True, artifact_existence=artifact_existence
1614 )
1616 # Set of IDs that have been handled.
1617 handled_ids = {ref.id for ref in dataset_existence.keys()}
1619 missing_ids = requested_ids - handled_ids
1620 if missing_ids:
1621 dataset_existence.update(
1622 self._mexists_check_expected(
1623 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1624 )
1625 )
1627 return {
1628 internal_ref_to_input_ref[internal_ref]: existence
1629 for internal_ref, existence in dataset_existence.items()
1630 }
1632 def _mexists_check_expected(
1633 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1634 ) -> dict[DatasetRef, bool]:
1635 """Check existence of refs that are not known to datastore.
1637 Parameters
1638 ----------
1639 refs : iterable of `DatasetRef`
1640 The datasets to be checked. These are assumed not to be known
1641 to datastore.
1642 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1643 Optional mapping of datastore artifact to existence. Updated by
1644 this method with details of all artifacts tested. Can be `None`
1645 if the caller is not interested.
1647 Returns
1648 -------
1649 existence : `dict` [`DatasetRef`, `bool`]
1650 Mapping from dataset to boolean indicating existence.
1651 """
1652 dataset_existence: dict[DatasetRef, bool] = {}
1653 if not self.trustGetRequest:
1654 # Must assume these do not exist
1655 for ref in refs:
1656 dataset_existence[ref] = False
1657 else:
1658 log.debug(
1659 "%d datasets were not known to datastore during initial existence check.",
1660 len(refs),
1661 )
1663 # Construct data structure identical to that returned
1664 # by _get_stored_records_associated_with_refs() but using
1665 # guessed names.
1666 records = {}
1667 id_to_ref = {}
1668 for missing_ref in refs:
1669 expected = self._get_expected_dataset_locations_info(missing_ref)
1670 dataset_id = missing_ref.id
1671 records[dataset_id] = [info for _, info in expected]
1672 id_to_ref[dataset_id] = missing_ref
1674 dataset_existence.update(
1675 self._process_mexists_records(
1676 id_to_ref,
1677 records,
1678 False,
1679 artifact_existence=artifact_existence,
1680 )
1681 )
1683 return dataset_existence
1685 def exists(self, ref: DatasetRef) -> bool:
1686 """Check if the dataset exists in the datastore.
1688 Parameters
1689 ----------
1690 ref : `DatasetRef`
1691 Reference to the required dataset.
1693 Returns
1694 -------
1695 exists : `bool`
1696 `True` if the entity exists in the `Datastore`.
1698 Notes
1699 -----
1700 The local cache is checked as a proxy for existence in the remote
1701 object store. It is possible that another process on a different
1702 compute node could remove the file from the object store even
1703 though it is present in the local cache.
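Examples
--------
A minimal sketch, assuming ``datastore`` is a configured
`FileDatastore` and ``ref`` a resolved `DatasetRef` (illustrative
names only):

>>> if datastore.exists(ref):
...     dataset = datastore.get(ref)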
1704 """
1705 ref = self._cast_storage_class(ref)
1706 fileLocations = self._get_dataset_locations_info(ref)
1708 # If we are being asked to trust that registry might not be correct,
1709 # we ask for the expected locations and check them explicitly.
1710 if not fileLocations:
1711 if not self.trustGetRequest:
1712 return False
1714 # First check the cache. If it is not found we must check
1715 # the datastore itself. Assume that any component in the cache
1716 # means that the dataset does exist somewhere.
1717 if self.cacheManager.known_to_cache(ref):
1718 return True
1720 # When we are guessing a dataset location we can not check
1721 # for the existence of every component since we can not
1722 # know if every component was written. Instead we check
1723 # for the existence of any of the expected locations.
1724 for location, _ in self._get_expected_dataset_locations_info(ref):
1725 if self._artifact_exists(location):
1726 return True
1727 return False
1729 # All listed artifacts must exist.
1730 for location, storedFileInfo in fileLocations:
1731 # Checking in cache needs the component ref.
1732 check_ref = ref
1733 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1734 check_ref = ref.makeComponentRef(component)
1735 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1736 continue
1738 if not self._artifact_exists(location):
1739 return False
1741 return True
1743 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1744 """Return URIs associated with dataset.
1746 Parameters
1747 ----------
1748 ref : `DatasetRef`
1749 Reference to the required dataset.
1750 predict : `bool`, optional
1751 If the datastore does not know about the dataset, should it
1752 return a predicted URI or not?
1754 Returns
1755 -------
1756 uris : `DatasetRefURIs`
1757 The URI to the primary artifact associated with this dataset (if
1758 the dataset was disassembled within the datastore this may be
1759 `None`), and the URIs to any components associated with the dataset
1760 artifact (can be empty if there are no components).
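Examples
--------
A minimal sketch, assuming ``datastore`` and ``ref`` are a configured
`FileDatastore` and a resolved `DatasetRef` (illustrative names only):

>>> uris = datastore.getURIs(ref, predict=True)
>>> if uris.primaryURI is not None:
...     print(uris.primaryURI)
>>> for component, uri in uris.componentURIs.items():
...     print(component, uri)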
1761 """
1762 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1763 return many[ref]
1765 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1766 """URI to the Dataset.
1768 Parameters
1769 ----------
1770 ref : `DatasetRef`
1771 Reference to the required Dataset.
1772 predict : `bool`
1773 If `True`, allow URIs to be returned for datasets that have not
1774 been written.
1776 Returns
1777 -------
1778 uri : `lsst.resources.ResourcePath`
1779 URI pointing to the dataset within the datastore. If the
1780 dataset does not exist in the datastore, and if ``predict`` is
1781 `True`, the URI will be a prediction and will include a URI
1782 fragment "#predicted".
1783 If the datastore does not have entities that relate well
1784 to the concept of a URI the returned URI will be
1785 descriptive. The returned URI is not guaranteed to be obtainable.
1787 Raises
1788 ------
1789 FileNotFoundError
1790 Raised if a URI has been requested for a dataset that does not
1791 exist and guessing is not allowed.
1792 RuntimeError
1793 Raised if a request is made for a single URI but multiple URIs
1794 are associated with this dataset.
1796 Notes
1797 -----
1798 When a predicted URI is requested an attempt will be made to form
1799 a reasonable URI based on file templates and the expected formatter.
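Examples
--------
A minimal sketch, assuming ``datastore`` and ``ref`` are a configured
`FileDatastore` and a resolved `DatasetRef` (illustrative names only):

>>> uri = datastore.getURI(ref, predict=True)
>>> print(uri)  # ends with "#predicted" if the dataset is not stored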
1800 """
1801 primary, components = self.getURIs(ref, predict)
1802 if primary is None or components:
1803 raise RuntimeError(
1804 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1805 )
1806 return primary
1808 def _predict_URIs(
1809 self,
1810 ref: DatasetRef,
1811 ) -> DatasetRefURIs:
1812 """Predict the URIs of a dataset ref.
1814 Parameters
1815 ----------
1816 ref : `DatasetRef`
1817 Reference to the required Dataset.
1819 Returns
1820 -------
1821 uris : `DatasetRefURIs`
1822 Primary and component URIs. URIs will contain a URI fragment
1823 "#predicted".
1824 """
1825 uris = DatasetRefURIs()
1827 if self.composites.shouldBeDisassembled(ref):
1828 for component, _ in ref.datasetType.storageClass.components.items():
1829 comp_ref = ref.makeComponentRef(component)
1830 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1832 # Add the "#predicted" URI fragment to indicate this is a
1833 # guess
1834 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1836 else:
1837 location, _ = self._determine_put_formatter_location(ref)
1839 # Add the "#predicted" URI fragment to indicate this is a guess
1840 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1842 return uris
1844 def getManyURIs(
1845 self,
1846 refs: Iterable[DatasetRef],
1847 predict: bool = False,
1848 allow_missing: bool = False,
1849 ) -> dict[DatasetRef, DatasetRefURIs]:
1850 # Docstring inherited
1852 uris: dict[DatasetRef, DatasetRefURIs] = {}
1854 records = self._get_stored_records_associated_with_refs(refs)
1855 records_keys = records.keys()
1857 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1858 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1860 # Have to handle trustGetRequest mode by checking for the existence
1861 # of the missing refs on disk.
1862 if missing_refs:
1863 dataset_existence = self._mexists_check_expected(missing_refs, None)
1864 really_missing = set()
1865 not_missing = set()
1866 for ref, exists in dataset_existence.items():
1867 if exists:
1868 not_missing.add(ref)
1869 else:
1870 really_missing.add(ref)
1872 if not_missing:
1873 # Need to recalculate the missing/existing split.
1874 existing_refs = existing_refs + tuple(not_missing)
1875 missing_refs = tuple(really_missing)
1877 for ref in missing_refs:
1878 # If this has never been written then we have to guess.
1879 if not predict:
1880 if not allow_missing:
1881 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1882 else:
1883 uris[ref] = self._predict_URIs(ref)
1885 for ref in existing_refs:
1886 file_infos = records[ref.id]
1887 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1888 uris[ref] = self._locations_to_URI(ref, file_locations)
1890 return uris
1892 def _locations_to_URI(
1893 self,
1894 ref: DatasetRef,
1895 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1896 ) -> DatasetRefURIs:
1897 """Convert one or more file locations associated with a DatasetRef
1898 to a DatasetRefURIs.
1900 Parameters
1901 ----------
1902 ref : `DatasetRef`
1903 Reference to the dataset.
1904 file_locations : `~collections.abc.Sequence` of `tuple` [`Location`, `StoredFileInfo`]
1905 Each item in the sequence is the location of the dataset within the
1906 datastore and stored information about the file and its formatter.
1907 If there is only one item in the sequence then it is treated as the
1908 primary URI. If there is more than one item then they are treated
1909 as component URIs. If there are no items then an error is raised
1910 unless ``self.trustGetRequest`` is `True`.
1912 Returns
1913 -------
1914 uris : `DatasetRefURIs`
1915 Represents the primary URI or component URIs described by the
1916 inputs.
1918 Raises
1919 ------
1920 RuntimeError
1921 If no file locations are passed in and ``self.trustGetRequest`` is
1922 `False`.
1923 FileNotFoundError
1924 If a passed-in URI does not exist, and ``self.trustGetRequest``
1925 is `False`.
1926 RuntimeError
1927 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is
1928 unexpected).
1929 """
1930 guessing = False
1931 uris = DatasetRefURIs()
1933 if not file_locations:
1934 if not self.trustGetRequest:
1935 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1936 file_locations = self._get_expected_dataset_locations_info(ref)
1937 guessing = True
1939 if len(file_locations) == 1:
1940 # No disassembly so this is the primary URI
1941 uris.primaryURI = file_locations[0][0].uri
1942 if guessing and not uris.primaryURI.exists():
1943 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1944 else:
1945 for location, file_info in file_locations:
1946 if file_info.component is None:
1947 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1948 if guessing and not location.uri.exists():
1949 # If we are trusting then it is entirely possible for
1950 # some components to be missing. In that case we skip
1951 # to the next component.
1952 if self.trustGetRequest:
1953 continue
1954 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1955 uris.componentURIs[file_info.component] = location.uri
1957 return uris
1959 def retrieveArtifacts(
1960 self,
1961 refs: Iterable[DatasetRef],
1962 destination: ResourcePath,
1963 transfer: str = "auto",
1964 preserve_path: bool = True,
1965 overwrite: bool = False,
1966 ) -> list[ResourcePath]:
1967 """Retrieve the file artifacts associated with the supplied refs.
1969 Parameters
1970 ----------
1971 refs : iterable of `DatasetRef`
1972 The datasets for which file artifacts are to be retrieved.
1973 A single ref can result in multiple files. The refs must
1974 be resolved.
1975 destination : `lsst.resources.ResourcePath`
1976 Location to write the file artifacts.
1977 transfer : `str`, optional
1978 Method to use to transfer the artifacts. Must be one of the options
1979 supported by `lsst.resources.ResourcePath.transfer_from()`.
1980 "move" is not allowed.
1981 preserve_path : `bool`, optional
1982 If `True` the full path of the file artifact within the datastore
1983 is preserved. If `False` the final file component of the path
1984 is used.
1985 overwrite : `bool`, optional
1986 If `True` allow transfers to overwrite existing files at the
1987 destination.
1989 Returns
1990 -------
1991 targets : `list` of `lsst.resources.ResourcePath`
1992 URIs of file artifacts in destination location. Order is not
1993 preserved.
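Examples
--------
A minimal sketch, assuming ``datastore`` is a configured
`FileDatastore` and ``refs`` a list of resolved `DatasetRef`
instances; the destination path is illustrative only.

>>> from lsst.resources import ResourcePath
>>> destination = ResourcePath("transfer/", forceDirectory=True)
>>> targets = datastore.retrieveArtifacts(
...     refs, destination, transfer="copy", preserve_path=True
... )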
1994 """
1995 if not destination.isdir():
1996 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1998 if transfer == "move":
1999 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
2001 # Source -> Destination
2002 # This also helps filter out duplicate DatasetRef in the request
2003 # that will map to the same underlying file transfer.
2004 to_transfer: dict[ResourcePath, ResourcePath] = {}
2006 for ref in refs:
2007 locations = self._get_dataset_locations_info(ref)
2008 for location, _ in locations:
2009 source_uri = location.uri
2010 target_path: ResourcePathExpression
2011 if preserve_path:
2012 target_path = location.pathInStore
2013 if target_path.isabs():
2014 # This is an absolute path to an external file.
2015 # Use the full path.
2016 target_path = target_path.relativeToPathRoot
2017 else:
2018 target_path = source_uri.basename()
2019 target_uri = destination.join(target_path)
2020 to_transfer[source_uri] = target_uri
2022 # In theory can now parallelize the transfer
2023 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
2024 for source_uri, target_uri in to_transfer.items():
2025 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
2027 return list(to_transfer.values())
2029 def get(
2030 self,
2031 ref: DatasetRef,
2032 parameters: Mapping[str, Any] | None = None,
2033 storageClass: StorageClass | str | None = None,
2034 ) -> Any:
2035 """Load an InMemoryDataset from the store.
2037 Parameters
2038 ----------
2039 ref : `DatasetRef`
2040 Reference to the required Dataset.
2041 parameters : `dict`
2042 `StorageClass`-specific parameters that specify, for example,
2043 a slice of the dataset to be loaded.
2044 storageClass : `StorageClass` or `str`, optional
2045 The storage class to be used to override the Python type
2046 returned by this method. By default the returned type matches
2047 the dataset type definition for this dataset. Specifying a
2048 read `StorageClass` can force a different type to be returned.
2049 This type must be compatible with the original type.
2051 Returns
2052 -------
2053 inMemoryDataset : `object`
2054 Requested dataset or slice thereof as an InMemoryDataset.
2056 Raises
2057 ------
2058 FileNotFoundError
2059 Requested dataset can not be retrieved.
2060 TypeError
2061 Return value from formatter has unexpected type.
2062 ValueError
2063 Formatter failed to process the dataset.
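Examples
--------
A minimal sketch, assuming ``datastore`` and ``ref`` are a configured
`FileDatastore` and a resolved `DatasetRef`; the ``parameters`` value
is hypothetical and depends on the storage class of the dataset.

>>> dataset = datastore.get(ref)
>>> subset = datastore.get(ref, parameters={"bbox": bbox})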
2064 """
2065 # The supplied storage class for the component being read is either
2066 # from the ref itself or from an override if we want to force
2067 # type conversion.
2068 if storageClass is not None:
2069 ref = ref.overrideStorageClass(storageClass)
2070 refStorageClass = ref.datasetType.storageClass
2072 allGetInfo = self._prepare_for_get(ref, parameters)
2073 refComponent = ref.datasetType.component()
2075 # Create mapping from component name to related info
2076 allComponents = {i.component: i for i in allGetInfo}
2078 # By definition the dataset is disassembled if we have more
2079 # than one record for it.
2080 isDisassembled = len(allGetInfo) > 1
2082 # Look for the special case where we are disassembled but the
2083 # component is a derived component that was not written during
2084 # disassembly. For this scenario we need to check that the
2085 # component requested is listed as a derived component for the
2086 # composite storage class
2087 isDisassembledReadOnlyComponent = False
2088 if isDisassembled and refComponent:
2089 # The composite storage class should be accessible through
2090 # the component dataset type
2091 compositeStorageClass = ref.datasetType.parentStorageClass
2093 # In the unlikely scenario where the composite storage
2094 # class is not known, we can only assume that this is a
2095 # normal component. If that assumption is wrong then the
2096 # branch below that reads a persisted component will fail
2097 # so there is no need to complain here.
2098 if compositeStorageClass is not None:
2099 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2101 if isDisassembled and not refComponent:
2102 # This was a disassembled dataset spread over multiple files
2103 # and we need to put them all back together again.
2104 # Read into memory and then assemble
2106 # Check that the supplied parameters are suitable for the type read
2107 refStorageClass.validateParameters(parameters)
2109 # We want to keep track of all the parameters that were not used
2110 # by formatters. We assume that if any of the component formatters
2111 # use a parameter then we do not need to apply it again in the
2112 # assembler.
2113 usedParams = set()
2115 components: dict[str, Any] = {}
2116 for getInfo in allGetInfo:
2117 # assemblerParams are parameters not understood by the
2118 # associated formatter.
2119 usedParams.update(set(getInfo.formatterParams))
2121 component = getInfo.component
2123 if component is None:
2124 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2126 # We do not want the formatter to think it's reading
2127 # a component though because it is really reading a
2128 # standalone dataset -- always tell reader it is not a
2129 # component.
2130 components[component] = self._read_artifact_into_memory(
2131 getInfo, ref.makeComponentRef(component), isComponent=False
2132 )
2134 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2136 # Any unused parameters will have to be passed to the assembler
2137 if parameters:
2138 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2139 else:
2140 unusedParams = {}
2142 # Process parameters
2143 return ref.datasetType.storageClass.delegate().handleParameters(
2144 inMemoryDataset, parameters=unusedParams
2145 )
2147 elif isDisassembledReadOnlyComponent:
2148 compositeStorageClass = ref.datasetType.parentStorageClass
2149 if compositeStorageClass is None:
2150 raise RuntimeError(
2151 f"Unable to retrieve derived component '{refComponent}' since"
2152 "no composite storage class is available."
2153 )
2155 if refComponent is None:
2156 # Mainly for mypy
2157 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2159 # Assume that every derived component can be calculated by
2160 # forwarding the request to a single read/write component.
2161 # Rather than guessing which rw component is the right one by
2162 # scanning each for a derived component of the same name,
2163 # we ask the storage class delegate directly which one is best to
2164 # use.
2165 compositeDelegate = compositeStorageClass.delegate()
2166 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2167 refComponent, set(allComponents)
2168 )
2170 # Select the relevant component
2171 rwInfo = allComponents[forwardedComponent]
2173 # For now assume that read parameters are validated against
2174 # the real component and not the requested component
2175 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2176 forwardedStorageClass.validateParameters(parameters)
2178 # The reference to use for the caching must refer to the forwarded
2179 # component and not the derived component.
2180 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2182 # Unfortunately the FileDescriptor inside the formatter will have
2183 # the wrong write storage class so we need to create a new one
2184 # given the immutability constraint.
2185 writeStorageClass = rwInfo.info.storageClass
2187 # We may need to put some thought into parameters for read
2188 # components but for now forward them on as is
2189 readFormatter = type(rwInfo.formatter)(
2190 FileDescriptor(
2191 rwInfo.location,
2192 readStorageClass=refStorageClass,
2193 storageClass=writeStorageClass,
2194 parameters=parameters,
2195 ),
2196 ref.dataId,
2197 )
2199 # The assembler can not receive any parameter requests for a
2200 # derived component at this time since the assembler will
2201 # see the storage class of the derived component and those
2202 # parameters will have to be handled by the formatter on the
2203 # forwarded storage class.
2204 assemblerParams: dict[str, Any] = {}
2206 # Need to create a new info that specifies the derived
2207 # component and associated storage class.
2208 readInfo = DatastoreFileGetInformation(
2209 rwInfo.location,
2210 readFormatter,
2211 rwInfo.info,
2212 assemblerParams,
2213 {},
2214 refComponent,
2215 refStorageClass,
2216 )
2218 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2220 else:
2221 # Single file request or component from that composite file
2222 for lookup in (refComponent, None):
2223 if lookup in allComponents:
2224 getInfo = allComponents[lookup]
2225 break
2226 else:
2227 raise FileNotFoundError(
2228 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2229 )
2231 # Do not need the component itself if already disassembled
2232 if isDisassembled:
2233 isComponent = False
2234 else:
2235 isComponent = getInfo.component is not None
2237 # For a component read of a composite we want the cache to
2238 # be looking at the composite ref itself.
2239 cache_ref = ref.makeCompositeRef() if isComponent else ref
2241 # For a disassembled component we can validate parameters against
2242 # the component storage class directly.
2243 if isDisassembled:
2244 refStorageClass.validateParameters(parameters)
2245 else:
2246 # For an assembled composite this could be a derived
2247 # component derived from a real component. The validity
2248 # of the parameters is not clear. For now validate against
2249 # the composite storage class
2250 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2252 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
2254 @transactional
2255 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2256 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2258 Parameters
2259 ----------
2260 inMemoryDataset : `object`
2261 The dataset to store.
2262 ref : `DatasetRef`
2263 Reference to the associated Dataset.
2265 Raises
2266 ------
2267 TypeError
2268 Supplied object and storage class are inconsistent.
2269 DatasetTypeNotSupportedError
2270 The associated `DatasetType` is not handled by this datastore.
2272 Notes
2273 -----
2274 If the datastore is configured to reject certain dataset types it
2275 is possible that the put will fail and raise a
2276 `DatasetTypeNotSupportedError`. The main use case for this is to
2277 allow `ChainedDatastore` to put to multiple datastores without
2278 requiring that every datastore accepts the dataset.
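Examples
--------
A minimal sketch, assuming ``datastore`` is a configured
`FileDatastore`, ``ref`` a resolved `DatasetRef`, and
``inMemoryDataset`` an object matching the ref's storage class
(illustrative names only):

>>> datastore.put(inMemoryDataset, ref)
>>> assert datastore.exists(ref)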
2279 """
2280 doDisassembly = self.composites.shouldBeDisassembled(ref)
2283 artifacts = []
2284 if doDisassembly:
2285 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2286 if components is None:
2287 raise RuntimeError(
2288 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2289 f"with storage class {ref.datasetType.storageClass.name} "
2290 "is configured to be disassembled, but cannot be."
2291 )
2292 for component, componentInfo in components.items():
2293 # Don't recurse because we want to take advantage of
2294 # bulk insert -- need a new DatasetRef that refers to the
2295 # same dataset_id but has the component DatasetType.
2296 # DatasetType does not refer to the types of components
2297 # so we construct one ourselves.
2298 compRef = ref.makeComponentRef(component)
2299 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2300 artifacts.append((compRef, storedInfo))
2301 else:
2302 # Write the entire thing out
2303 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2304 artifacts.append((ref, storedInfo))
2306 self._register_datasets(artifacts)
2308 @transactional
2309 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2310 # At this point can safely remove these datasets from the cache
2311 # to avoid confusion later on. If they are not trashed later
2312 # the cache will simply be refilled.
2313 self.cacheManager.remove_from_cache(ref)
2315 # If we are in trust mode there will be nothing to move to
2316 # the trash table and we will have to try to delete the file
2317 # immediately.
2318 if self.trustGetRequest:
2319 # Try to keep the logic below for a single file trash.
2320 if isinstance(ref, DatasetRef):
2321 refs = {ref}
2322 else:
2323 # Will recreate ref at the end of this branch.
2324 refs = set(ref)
2326 # Determine which datasets are known to datastore directly.
2327 id_to_ref = {ref.id: ref for ref in refs}
2328 existing_ids = self._get_stored_records_associated_with_refs(refs)
2329 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2331 missing = refs - existing_refs
2332 if missing:
2333 # Do an explicit existence check on these refs.
2334 # We only care about the artifacts at this point and not
2335 # the dataset existence.
2336 artifact_existence: dict[ResourcePath, bool] = {}
2337 _ = self.mexists(missing, artifact_existence)
2338 uris = [uri for uri, exists in artifact_existence.items() if exists]
2340 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2341 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2342 for uri in uris:
2343 try:
2344 uri.remove()
2345 except Exception as e:
2346 if ignore_errors:
2347 log.debug("Artifact %s could not be removed: %s", uri, e)
2348 continue
2349 raise
2351 # There is no point asking the code below to remove refs we
2352 # know are missing so update it with the list of existing
2353 # records. Try to retain one vs many logic.
2354 if not existing_refs:
2355 # Nothing more to do since none of the datasets were
2356 # known to the datastore record table.
2357 return
2358 ref = list(existing_refs)
2359 if len(ref) == 1:
2360 ref = ref[0]
2362 # Get file metadata and internal metadata
2363 if not isinstance(ref, DatasetRef):
2364 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2365 # Assumed to be an iterable of refs so bulk mode enabled.
2366 try:
2367 self.bridge.moveToTrash(ref, transaction=self._transaction)
2368 except Exception as e:
2369 if ignore_errors:
2370 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2371 else:
2372 raise
2373 return
2375 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2377 fileLocations = self._get_dataset_locations_info(ref)
2379 if not fileLocations:
2380 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2381 if ignore_errors:
2382 log.warning(err_msg)
2383 return
2384 else:
2385 raise FileNotFoundError(err_msg)
2387 for location, storedFileInfo in fileLocations:
2388 if not self._artifact_exists(location):
2389 err_msg = (
2390 f"Dataset is known to datastore {self.name} but "
2391 f"associated artifact ({location.uri}) is missing"
2392 )
2393 if ignore_errors:
2394 log.warning(err_msg)
2395 return
2396 else:
2397 raise FileNotFoundError(err_msg)
2399 # Mark dataset as trashed
2400 try:
2401 self.bridge.moveToTrash([ref], transaction=self._transaction)
2402 except Exception as e:
2403 if ignore_errors:
2404 log.warning(
2405 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2406 "but encountered an error: %s",
2407 ref,
2408 self.name,
2409 e,
2410 )
2411 pass
2412 else:
2413 raise
2415 @transactional
2416 def emptyTrash(self, ignore_errors: bool = True) -> None:
2417 """Remove all datasets from the trash.
2419 Parameters
2420 ----------
2421 ignore_errors : `bool`
2422 If `True` return without error even if something went wrong.
2423 Problems could occur if another process is simultaneously trying
2424 to delete.
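Examples
--------
A minimal sketch, assuming ``datastore`` is a configured
`FileDatastore` and ``refs`` a list of resolved `DatasetRef`
instances (illustrative names only):

>>> datastore.trash(refs)
>>> datastore.emptyTrash(ignore_errors=True)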
2425 """
2426 log.debug("Emptying trash in datastore %s", self.name)
2428 # Context manager will empty trash iff we finish it without raising.
2429 # It will also automatically delete the relevant rows from the
2430 # trash table and the records table.
2431 with self.bridge.emptyTrash(
2432 self._table, record_class=StoredFileInfo, record_column="path"
2433 ) as trash_data:
2434 # Removing the artifacts themselves requires that the files are
2435 # not also associated with refs that are not to be trashed.
2436 # Therefore need to do a query with the file paths themselves
2437 # and return all the refs associated with them. Can only delete
2438 # a file if the refs to be trashed are the only refs associated
2439 # with the file.
2440 # This requires multiple copies of the trashed items
2441 trashed, artifacts_to_keep = trash_data
2443 if artifacts_to_keep is None:
2444 # The bridge is not helping us so have to work it out
2445 # ourselves. This is not going to be as efficient.
2446 trashed = list(trashed)
2448 # The instance check is for mypy since up to this point it
2449 # does not know the type of info.
2450 path_map = self._refs_associated_with_artifacts(
2451 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2452 )
2454 for ref, info in trashed:
2455 # Mypy needs to know this is not the base class
2456 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2458 path_map[info.path].remove(ref.id)
2459 if not path_map[info.path]:
2460 del path_map[info.path]
2462 artifacts_to_keep = set(path_map)
2464 for ref, info in trashed:
2465 # Should not happen for this implementation but need
2466 # to keep mypy happy.
2467 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2469 # Mypy needs to know this is not the base class
2470 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2472 if info.path in artifacts_to_keep:
2473 # This is a multi-dataset artifact and we are not
2474 # removing all associated refs.
2475 continue
2477 # Only trashed refs still known to datastore will be returned.
2478 location = info.file_location(self.locationFactory)
2480 # Point of no return for this artifact
2481 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2482 try:
2483 self._delete_artifact(location)
2484 except FileNotFoundError:
2485 # If the file itself has been deleted there is nothing
2486 # we can do about it. It is possible that trash has
2487 # been run in parallel in another process or someone
2488 # decided to delete the file. It is unlikely to come
2489 # back and so we should still continue with the removal
2490 # of the entry from the trash table. It is also possible
2491 # we removed it in a previous iteration if it was
2492 # a multi-dataset artifact. The delete artifact method
2493 # will log a debug message in this scenario.
2494 # Distinguishing a file that was missing before trash
2495 # started from one already removed earlier in this trash
2496 # operation is not worth the potential memory cost of
2497 # tracking it.
2498 pass
2499 except Exception as e:
2500 if ignore_errors:
2501 # Use a debug message here even though it's not
2502 # a good situation. In some cases this can be
2503 # caused by a race between user A and user B
2504 # and neither of them has permissions for the
2505 # other's files. Butler does not know about users
2506 # and trash has no idea what collections these
2507 # files were in (without guessing from a path).
2508 log.debug(
2509 "Encountered error removing artifact %s from datastore %s: %s",
2510 location.uri,
2511 self.name,
2512 e,
2513 )
2514 else:
2515 raise
2517 @transactional
2518 def transfer_from(
2519 self,
2520 source_datastore: Datastore,
2521 refs: Iterable[DatasetRef],
2522 transfer: str = "auto",
2523 artifact_existence: dict[ResourcePath, bool] | None = None,
2524 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2525 # Docstring inherited
2526 if type(self) is not type(source_datastore):
2527 raise TypeError(
2528 f"Datastore mismatch between this datastore ({type(self)}) and the "
2529 f"source datastore ({type(source_datastore)})."
2530 )
2532 # Be explicit for mypy
2533 if not isinstance(source_datastore, FileDatastore):
2534 raise TypeError(
2535 "Can only transfer to a FileDatastore from another FileDatastore, not"
2536 f" {type(source_datastore)}"
2537 )
2539 # Stop early if "direct" transfer mode is requested. That would
2540 # require that the URI inside the source datastore should be stored
2541 # directly in the target datastore, which seems unlikely to be useful
2542 # since at any moment the source datastore could delete the file.
2543 if transfer in ("direct", "split"):
2544 raise ValueError(
2545 f"Can not transfer from a source datastore using {transfer} mode since"
2546 " those files are controlled by the other datastore."
2547 )
2549 # Empty existence lookup if none given.
2550 if artifact_existence is None:
2551 artifact_existence = {}
2553 # We will go through the list multiple times so must convert
2554 # generators to lists.
2555 refs = list(refs)
2557 # In order to handle disassembled composites the code works
2558 # at the records level since it can assume that internal APIs
2559 # can be used.
2560 # - If the record already exists in the destination this is assumed
2561 # to be okay.
2562 # - If there is no record but the source and destination URIs are
2563 # identical no transfer is done but the record is added.
2564 # - If the source record refers to an absolute URI currently assume
2565 # that that URI should remain absolute and will be visible to the
2566 # destination butler. May need to have a flag to indicate whether
2567 # the dataset should be transferred. This will only happen if
2568 # the detached Butler has had a local ingest.
2570 # What we really want is all the records in the source datastore
2571 # associated with these refs. Or derived ones if they don't exist
2572 # in the source.
2573 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2575 # The source dataset_ids are the keys in these records
2576 source_ids = set(source_records)
2577 log.debug("Number of datastore records found in source: %d", len(source_ids))
2579 requested_ids = {ref.id for ref in refs}
2580 missing_ids = requested_ids - source_ids
2582 # Missing IDs can be okay if that datastore has allowed
2583 # gets based on file existence. Should we transfer what we can
2584 # or complain about it and warn?
2585 if missing_ids and not source_datastore.trustGetRequest:
2586 raise ValueError(
2587 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2588 )
2590 # Need to map these missing IDs to a DatasetRef so we can guess
2591 # the details.
2592 if missing_ids:
2593 log.info(
2594 "Number of expected datasets missing from source datastore records: %d out of %d",
2595 len(missing_ids),
2596 len(requested_ids),
2597 )
2598 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2600 # This should be chunked in case we end up having to check
2601 # the file store since we need some log output to show
2602 # progress.
2603 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2604 records = {}
2605 for missing in missing_ids_chunk:
2606 # Ask the source datastore where the missing artifacts
2607 # should be. An execution butler might not know about the
2608 # artifacts even if they are there.
2609 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2610 records[missing] = [info for _, info in expected]
2612 # Call the mexists helper method in case we have not already
2613 # checked these artifacts such that artifact_existence is
2614 # empty. This allows us to benefit from parallelism.
2615 # datastore.mexists() itself does not give us access to the
2616 # derived datastore record.
2617 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2618 ref_exists = source_datastore._process_mexists_records(
2619 id_to_ref, records, False, artifact_existence=artifact_existence
2620 )
2622 # Now go through the records and propagate the ones that exist.
2623 location_factory = source_datastore.locationFactory
2624 for missing, record_list in records.items():
2625 # Skip completely if the ref does not exist.
2626 ref = id_to_ref[missing]
2627 if not ref_exists[ref]:
2628 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2629 continue
2630 # Check for file artifact to decide which parts of a
2631 # disassembled composite do exist. If there is only a
2632 # single record we don't even need to look because it can't
2633 # be a composite and must exist.
2634 if len(record_list) == 1:
2635 dataset_records = record_list
2636 else:
2637 dataset_records = [
2638 record
2639 for record in record_list
2640 if artifact_existence[record.file_location(location_factory).uri]
2641 ]
2642 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2644 # Rely on source_records being a defaultdict.
2645 source_records[missing].extend(dataset_records)
2647 # See if we already have these records
2648 target_records = self._get_stored_records_associated_with_refs(refs)
2650 # The artifacts to register
2651 artifacts = []
2653 # Refs that already exist
2654 already_present = []
2656 # Refs that were rejected by this datastore.
2657 rejected = set()
2659 # Refs that were transferred successfully.
2660 accepted = set()
2662 # Record each time we have done a "direct" transfer.
2663 direct_transfers = []
2665 # Now can transfer the artifacts
2666 for ref in refs:
2667 if not self.constraints.isAcceptable(ref):
2668 # This datastore should not be accepting this dataset.
2669 rejected.add(ref)
2670 continue
2672 accepted.add(ref)
2674 if ref.id in target_records:
2675 # Already have an artifact for this.
2676 already_present.append(ref)
2677 continue
2679 # mypy needs to know these are always resolved refs
2680 for info in source_records[ref.id]:
2681 source_location = info.file_location(source_datastore.locationFactory)
2682 target_location = info.file_location(self.locationFactory)
2683 if source_location == target_location and not source_location.pathInStore.isabs():
2684 # Artifact is already in the target location.
2685 # (which is how execution butler currently runs)
2686 pass
2687 else:
2688 if target_location.pathInStore.isabs():
2689 # Just because we can see the artifact when running
2690 # the transfer doesn't mean it will be generally
2691 # accessible to a user of this butler. Need to decide
2692 # what to do about an absolute path.
2693 if transfer == "auto":
2694 # For "auto" transfers we allow the absolute URI
2695 # to be recorded in the target datastore.
2696 direct_transfers.append(source_location)
2697 else:
2698 # The user is explicitly requesting a transfer
2699 # even for an absolute URI. This requires us to
2700 # calculate the target path.
2701 template_ref = ref
2702 if info.component:
2703 template_ref = ref.makeComponentRef(info.component)
2704 target_location = self._calculate_ingested_datastore_name(
2705 source_location.uri,
2706 template_ref,
2707 )
2709 info = info.update(path=target_location.pathInStore.path)
2711 # Need to transfer it to the new location.
2712 # Assume we should always overwrite. If the artifact
2713 # is there this might indicate that a previous transfer
2714 # was interrupted but was not able to be rolled back
2715 # completely (eg pre-emption) so follow Datastore default
2716 # and overwrite.
2717 target_location.uri.transfer_from(
2718 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2719 )
2721 artifacts.append((ref, info))
2723 if direct_transfers:
2724 log.info(
2725 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2726 len(direct_transfers),
2727 "" if len(direct_transfers) == 1 else "s",
2728 )
2730 self._register_datasets(artifacts)
2732 if already_present:
2733 n_skipped = len(already_present)
2734 log.info(
2735 "Skipped transfer of %d dataset%s already present in datastore",
2736 n_skipped,
2737 "" if n_skipped == 1 else "s",
2738 )
2740 return accepted, rejected
2742 @transactional
2743 def forget(self, refs: Iterable[DatasetRef]) -> None:
2744 # Docstring inherited.
2745 refs = list(refs)
2746 self.bridge.forget(refs)
2747 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2749 def validateConfiguration(
2750 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2751 ) -> None:
2752 """Validate some of the configuration for this datastore.
2754 Parameters
2755 ----------
2756 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2757 Entities to test against this configuration. Can be differing
2758 types.
2759 logFailures : `bool`, optional
2760 If `True`, output a log message for every validation error
2761 detected.
2763 Raises
2764 ------
2765 DatastoreValidationError
2766 Raised if there is a validation problem with a configuration.
2767 All the problems are reported in a single exception.
2769 Notes
2770 -----
2771 This method checks that all the supplied entities have valid file
2772 templates and also have formatters defined.
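Examples
--------
A minimal sketch, assuming ``datastore`` is a configured
`FileDatastore` and ``dataset_type`` a `DatasetType` known to the
repository (illustrative names only):

>>> from lsst.daf.butler import DatastoreValidationError
>>> try:
...     datastore.validateConfiguration([dataset_type], logFailures=True)
... except DatastoreValidationError as err:
...     print(f"Configuration problem: {err}")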
2773 """
2774 templateFailed = None
2775 try:
2776 self.templates.validateTemplates(entities, logFailures=logFailures)
2777 except FileTemplateValidationError as e:
2778 templateFailed = str(e)
2780 formatterFailed = []
2781 for entity in entities:
2782 try:
2783 self.formatterFactory.getFormatterClass(entity)
2784 except KeyError as e:
2785 formatterFailed.append(str(e))
2786 if logFailures:
2787 log.critical("Formatter failure: %s", e)
2789 if templateFailed or formatterFailed:
2790 messages = []
2791 if templateFailed:
2792 messages.append(templateFailed)
2793 if formatterFailed:
2794 messages.append(",".join(formatterFailed))
2795 msg = ";\n".join(messages)
2796 raise DatastoreValidationError(msg)
2798 def getLookupKeys(self) -> set[LookupKey]:
2799 # Docstring is inherited from base class
2800 return (
2801 self.templates.getLookupKeys()
2802 | self.formatterFactory.getLookupKeys()
2803 | self.constraints.getLookupKeys()
2804 )
2806 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2807 # Docstring is inherited from base class
2808 # The key can be valid in either formatters or templates so we can
2809 # only check the template if it exists
2810 if lookupKey in self.templates:
2811 try:
2812 self.templates[lookupKey].validateTemplate(entity)
2813 except FileTemplateValidationError as e:
2814 raise DatastoreValidationError(e) from e
2816 def export(
2817 self,
2818 refs: Iterable[DatasetRef],
2819 *,
2820 directory: ResourcePathExpression | None = None,
2821 transfer: str | None = "auto",
2822 ) -> Iterable[FileDataset]:
2823 # Docstring inherited from Datastore.export.
2824 if transfer == "auto" and directory is None:
2825 transfer = None
2827 if transfer is not None and directory is None:
2828 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2830 if transfer == "move":
2831 raise TypeError("Can not export by moving files out of datastore.")
2832 elif transfer == "direct":
2833 # For an export, treat this as equivalent to None. We do not
2834 # want an import to risk using absolute URIs to datasets owned
2835 # by another datastore.
2836 log.info("Treating 'direct' transfer mode as in-place export.")
2837 transfer = None
2839 # Force the directory to be a URI object
2840 directoryUri: ResourcePath | None = None
2841 if directory is not None:
2842 directoryUri = ResourcePath(directory, forceDirectory=True)
2844 if transfer is not None and directoryUri is not None:
2845 # mypy needs the second test
2846 if not directoryUri.exists():
2847 raise FileNotFoundError(f"Export location {directory} does not exist")
2849 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2850 for ref in progress.wrap(refs, "Exporting dataset files"):
2851 fileLocations = self._get_dataset_locations_info(ref)
2852 if not fileLocations:
2853 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2854 # For now we can not export disassembled datasets
2855 if len(fileLocations) > 1:
2856 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2857 location, storedFileInfo = fileLocations[0]
2859 pathInStore = location.pathInStore.path
2860 if transfer is None:
2861 # TODO: do we also need to return the readStorageClass somehow?
2862 # We will use the path in store directly. If this is an
2863 # absolute URI, preserve it.
2864 if location.pathInStore.isabs():
2865 pathInStore = str(location.uri)
2866 elif transfer == "direct":
2867 # Use full URIs to the remote store in the export
2868 pathInStore = str(location.uri)
2869 else:
2870 # mypy needs help
2871 assert directoryUri is not None, "directoryUri must be defined to get here"
2872 storeUri = ResourcePath(location.uri)
2874 # if the datastore has an absolute URI to a resource, we
2875 # have two options:
2876 # 1. Keep the absolute URI in the exported YAML
2877 # 2. Allocate a new name in the local datastore and transfer
2878 # it.
2879 # For now go with option 2
2880 if location.pathInStore.isabs():
2881 template = self.templates.getTemplate(ref)
2882 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2883 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2885 exportUri = directoryUri.join(pathInStore)
2886 exportUri.transfer_from(storeUri, transfer=transfer)
2888 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2890 @staticmethod
2891 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2892 """Compute the checksum of the supplied file.
2894 Parameters
2895 ----------
2896 uri : `lsst.resources.ResourcePath`
2897 Name of resource to calculate checksum from.
2898 algorithm : `str`, optional
2899 Name of algorithm to use. Must be one of the algorithms supported
2900 by :py:mod:`hashlib`.
2901 block_size : `int`
2902 Number of bytes to read from file at one time.
2904 Returns
2905 -------
2906 hexdigest : `str`
2907 Hex digest of the file.
2909 Notes
2910 -----
2911 Currently returns None if the URI is for a remote resource.
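Examples
--------
A minimal sketch; the file path is illustrative only and must refer
to an existing local file.

>>> from lsst.resources import ResourcePath
>>> checksum = FileDatastore.computeChecksum(
...     ResourcePath("data/example.fits"), algorithm="blake2b"
... )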
2912 """
2913 if algorithm not in hashlib.algorithms_guaranteed:
2914 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2916 if not uri.isLocal:
2917 return None
2919 hasher = hashlib.new(algorithm)
2921 with uri.as_local() as local_uri:
2922 with open(local_uri.ospath, "rb") as f:
2923 for chunk in iter(lambda: f.read(block_size), b""):
2924 hasher.update(chunk)
2926 return hasher.hexdigest()
2928 def needs_expanded_data_ids(
2929 self,
2930 transfer: str | None,
2931 entity: DatasetRef | DatasetType | StorageClass | None = None,
2932 ) -> bool:
2933 # Docstring inherited.
2934 # This _could_ also use entity to inspect whether the filename template
2935 # involves placeholders other than the required dimensions for its
2936 # dataset type, but that's not necessary for correctness; it just
2937 # enables more optimizations (perhaps only in theory).
2938 return transfer not in ("direct", None)
2940 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2941 # Docstring inherited from the base class.
2942 record_data = data.get(self.name)
2943 if not record_data:
2944 return
2946 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys())
2948 # TODO: Verify that there are no unexpected table names in the dict?
2949 unpacked_records = []
2950 for dataset_data in record_data.records.values():
2951 records = dataset_data.get(self._table.name)
2952 if records:
2953 for info in records:
2954 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2955 unpacked_records.append(info.to_record())
2956 if unpacked_records:
2957 self._table.insert(*unpacked_records, transaction=self._transaction)
2959 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2960 # Docstring inherited from the base class.
2961 exported_refs = list(self._bridge.check(refs))
2962 ids = {ref.id for ref in exported_refs}
2963 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
2964 for row in self._table.fetch(dataset_id=ids):
2965 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2966 dataset_records = records.setdefault(info.dataset_id, {})
2967 dataset_records.setdefault(self._table.name, []).append(info)
2969 record_data = DatastoreRecordData(records=records)
2970 return {self.name: record_data}
2972 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
2973 # Docstring inherited from the base class.
2974 self._retrieve_dataset_method = method
2976 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
2977 """Update dataset reference to use the storage class from registry."""
2978 if self._retrieve_dataset_method is None:
2979 # We could raise an exception here but unit tests do not define
2980 # this method.
2981 return ref
2982 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
2983 if dataset_type is not None:
2984 ref = ref.overrideStorageClass(dataset_type.storageClass)
2985 return ref