Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 8%
974 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-28 10:10 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-28 10:10 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from collections.abc import Callable, Iterable, Mapping, Sequence
31from dataclasses import dataclass
32from typing import TYPE_CHECKING, Any, ClassVar
34from lsst.daf.butler import (
35 CompositesMap,
36 Config,
37 DatasetId,
38 DatasetRef,
39 DatasetRefURIs,
40 DatasetType,
41 DatasetTypeNotSupportedError,
42 Datastore,
43 DatastoreCacheManager,
44 DatastoreConfig,
45 DatastoreDisabledCacheManager,
46 DatastoreRecordData,
47 DatastoreValidationError,
48 FileDataset,
49 FileDescriptor,
50 FileTemplates,
51 FileTemplateValidationError,
52 Formatter,
53 FormatterFactory,
54 Location,
55 LocationFactory,
56 Progress,
57 StorageClass,
58 StoredDatastoreItemInfo,
59 StoredFileInfo,
60 ddl,
61)
62from lsst.daf.butler.core.repoRelocation import replaceRoot
63from lsst.daf.butler.core.utils import transactional
64from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
65from lsst.resources import ResourcePath, ResourcePathExpression
66from lsst.utils.introspection import get_class_of, get_instance_of
67from lsst.utils.iteration import chunk_iterable
69# For VERBOSE logging usage.
70from lsst.utils.logging import VERBOSE, getLogger
71from lsst.utils.timer import time_this
72from sqlalchemy import BigInteger, String
74from ..registry.interfaces import FakeDatasetRef
75from .genericDatastore import GenericBaseDatastore
77if TYPE_CHECKING:
78 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
79 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
81log = getLogger(__name__)
84class _IngestPrepData(Datastore.IngestPrepData):
85 """Helper class for FileDatastore ingest implementation.
87 Parameters
88 ----------
89 datasets : `~collections.abc.Iterable` of `FileDataset`
90 Files to be ingested by this datastore.
91 """
93 def __init__(self, datasets: Iterable[FileDataset]):
94 super().__init__(ref for dataset in datasets for ref in dataset.refs)
95 self.datasets = datasets
98@dataclass(frozen=True)
99class DatastoreFileGetInformation:
100 """Collection of useful parameters needed to retrieve a file from
101 a Datastore.
102 """
104 location: Location
105 """The location from which to read the dataset."""
107 formatter: Formatter
108 """The `Formatter` to use to deserialize the dataset."""
110 info: StoredFileInfo
111 """Stored information about this file and its formatter."""
113 assemblerParams: Mapping[str, Any]
114 """Parameters to use for post-processing the retrieved dataset."""
116 formatterParams: Mapping[str, Any]
117 """Parameters that were understood by the associated formatter."""
119 component: str | None
120 """The component to be retrieved (can be `None`)."""
122 readStorageClass: StorageClass
123 """The `StorageClass` of the dataset being read."""
126class FileDatastore(GenericBaseDatastore):
127 """Generic Datastore for file-based implementations.
129 Should always be sub-classed since key abstract methods are missing.
131 Parameters
132 ----------
133 config : `DatastoreConfig` or `str`
134 Configuration as either a `Config` object or URI to file.
135 bridgeManager : `DatastoreRegistryBridgeManager`
136 Object that manages the interface between `Registry` and datastores.
137 butlerRoot : `str`, optional
138 New datastore root to use to override the configuration value.
140 Raises
141 ------
142 ValueError
143 If root location does not exist and ``create`` is `False` in the
144 configuration.
145 """
147 defaultConfigFile: ClassVar[str | None] = None
148 """Path to configuration defaults. Accessed within the ``config`` resource
149 or relative to a search path. Can be None if no defaults specified.
150 """
152 root: ResourcePath
153 """Root directory URI of this `Datastore`."""
155 locationFactory: LocationFactory
156 """Factory for creating locations relative to the datastore root."""
158 formatterFactory: FormatterFactory
159 """Factory for creating instances of formatters."""
161 templates: FileTemplates
162 """File templates that can be used by this `Datastore`."""
164 composites: CompositesMap
165 """Determines whether a dataset should be disassembled on put."""
167 defaultConfigFile = "datastores/fileDatastore.yaml"
168 """Path to configuration defaults. Accessed within the ``config`` resource
169 or relative to a search path. Can be None if no defaults specified.
170 """
172 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
173 """Callable that is used in trusted mode to retrieve registry definition
174 of a named dataset type.
175 """
177 @classmethod
178 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
179 """Set any filesystem-dependent config options for this Datastore to
180 be appropriate for a new empty repository with the given root.
182 Parameters
183 ----------
184 root : `str`
185 URI to the root of the data repository.
186 config : `Config`
187 A `Config` to update. Only the subset understood by
188 this component will be updated. Will not expand
189 defaults.
190 full : `Config`
191 A complete config with all defaults expanded that can be
192 converted to a `DatastoreConfig`. Read-only and will not be
193 modified by this method.
194 Repository-specific options that should not be obtained
195 from defaults when Butler instances are constructed
196 should be copied from ``full`` to ``config``.
197 overwrite : `bool`, optional
198 If `False`, do not modify a value in ``config`` if the value
199 already exists. Default is always to overwrite with the provided
200 ``root``.
202 Notes
203 -----
204 If a keyword is explicitly defined in the supplied ``config`` it
205 will not be overridden by this method if ``overwrite`` is `False`.
206 This allows explicit values set in external configs to be retained.
207 """
208 Config.updateParameters(
209 DatastoreConfig,
210 config,
211 full,
212 toUpdate={"root": root},
213 toCopy=("cls", ("records", "table")),
214 overwrite=overwrite,
215 )
217 @classmethod
218 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
219 return ddl.TableSpec(
220 fields=[
221 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
222 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
223 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
224 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
225 # Use empty string to indicate no component
226 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
227 # TODO: should checksum be Base64Bytes instead?
228 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
229 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
230 ],
231 unique=frozenset(),
232 indexes=[ddl.IndexSpec("path")],
233 )
235 def __init__(
236 self,
237 config: DatastoreConfig | ResourcePathExpression,
238 bridgeManager: DatastoreRegistryBridgeManager,
239 butlerRoot: str | None = None,
240 ):
241 super().__init__(config, bridgeManager)
242 if "root" not in self.config:
243 raise ValueError("No root directory specified in configuration")
245 self._bridgeManager = bridgeManager
247 # Name ourselves either using an explicit name or a name
248 # derived from the (unexpanded) root
249 if "name" in self.config:
250 self.name = self.config["name"]
251 else:
252 # We use the unexpanded root in the name to indicate that this
253 # datastore can be moved without having to update registry.
254 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
256 # Support repository relocation in config
257 # Existence of self.root is checked in subclass
258 self.root = ResourcePath(
259 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
260 )
262 self.locationFactory = LocationFactory(self.root)
263 self.formatterFactory = FormatterFactory()
265 # Now associate formatters with storage classes
266 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
268 # Read the file naming templates
269 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
271 # See if composites should be disassembled
272 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
274 tableName = self.config["records", "table"]
275 try:
276 # Storage of paths and formatters, keyed by dataset_id
277 self._table = bridgeManager.opaque.register(
278 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
279 )
280 # Interface to Registry.
281 self._bridge = bridgeManager.register(self.name)
282 except ReadOnlyDatabaseError:
283 # If the database is read only and we just tried and failed to
284 # create a table, it means someone is trying to create a read-only
285 # butler client for an empty repo. That should be okay, as long
286 # as they then try to get any datasets before some other client
287 # creates the table. Chances are they'rejust validating
288 # configuration.
289 pass
291 # Determine whether checksums should be used - default to False
292 self.useChecksum = self.config.get("checksum", False)
294 # Determine whether we can fall back to configuration if a
295 # requested dataset is not known to registry
296 self.trustGetRequest = self.config.get("trust_get_request", False)
298 # Create a cache manager
299 self.cacheManager: AbstractDatastoreCacheManager
300 if "cached" in self.config:
301 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
302 else:
303 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
305 # Check existence and create directory structure if necessary
306 if not self.root.exists():
307 if "create" not in self.config or not self.config["create"]:
308 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
309 try:
310 self.root.mkdir()
311 except Exception as e:
312 raise ValueError(
313 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
314 ) from e
316 def __str__(self) -> str:
317 return str(self.root)
319 @property
320 def bridge(self) -> DatastoreRegistryBridge:
321 return self._bridge
323 def _artifact_exists(self, location: Location) -> bool:
324 """Check that an artifact exists in this datastore at the specified
325 location.
327 Parameters
328 ----------
329 location : `Location`
330 Expected location of the artifact associated with this datastore.
332 Returns
333 -------
334 exists : `bool`
335 True if the location can be found, false otherwise.
336 """
337 log.debug("Checking if resource exists: %s", location.uri)
338 return location.uri.exists()
340 def _delete_artifact(self, location: Location) -> None:
341 """Delete the artifact from the datastore.
343 Parameters
344 ----------
345 location : `Location`
346 Location of the artifact associated with this datastore.
347 """
348 if location.pathInStore.isabs():
349 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
351 try:
352 location.uri.remove()
353 except FileNotFoundError:
354 log.debug("File %s did not exist and so could not be deleted.", location.uri)
355 raise
356 except Exception as e:
357 log.critical("Failed to delete file: %s (%s)", location.uri, e)
358 raise
359 log.debug("Successfully deleted file: %s", location.uri)
361 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
362 # Docstring inherited from GenericBaseDatastore
363 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)]
364 self._table.insert(*records, transaction=self._transaction)
366 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]:
367 # Docstring inherited from GenericBaseDatastore
369 # Look for the dataset_id -- there might be multiple matches
370 # if we have disassembled the dataset.
371 records = self._table.fetch(dataset_id=ref.id)
372 return [StoredFileInfo.from_record(record) for record in records]
374 def _get_stored_records_associated_with_refs(
375 self, refs: Iterable[DatasetIdRef]
376 ) -> dict[DatasetId, list[StoredFileInfo]]:
377 """Retrieve all records associated with the provided refs.
379 Parameters
380 ----------
381 refs : iterable of `DatasetIdRef`
382 The refs for which records are to be retrieved.
384 Returns
385 -------
386 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
387 The matching records indexed by the ref ID. The number of entries
388 in the dict can be smaller than the number of requested refs.
389 """
390 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
392 # Uniqueness is dataset_id + component so can have multiple records
393 # per ref.
394 records_by_ref = defaultdict(list)
395 for record in records:
396 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
397 return records_by_ref
399 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
400 """Return paths and associated dataset refs.
402 Parameters
403 ----------
404 paths : `list` of `str` or `lsst.resources.ResourcePath`
405 All the paths to include in search.
407 Returns
408 -------
409 mapping : `dict` of [`str`, `set` [`DatasetId`]]
410 Mapping of each path to a set of associated database IDs.
411 """
412 records = self._table.fetch(path=[str(path) for path in paths])
413 result = defaultdict(set)
414 for row in records:
415 result[row["path"]].add(row["dataset_id"])
416 return result
418 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
419 """Return all dataset refs associated with the supplied path.
421 Parameters
422 ----------
423 pathInStore : `lsst.resources.ResourcePath`
424 Path of interest in the data store.
426 Returns
427 -------
428 ids : `set` of `int`
429 All `DatasetRef` IDs associated with this path.
430 """
431 records = list(self._table.fetch(path=str(pathInStore)))
432 ids = {r["dataset_id"] for r in records}
433 return ids
435 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
436 # Docstring inherited from GenericBaseDatastore
437 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
439 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]:
440 r"""Find all the `Location`\ s of the requested dataset in the
441 `Datastore` and the associated stored file information.
443 Parameters
444 ----------
445 ref : `DatasetRef`
446 Reference to the required `Dataset`.
448 Returns
449 -------
450 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
451 Location of the dataset within the datastore and
452 stored information about each file and its formatter.
453 """
454 # Get the file information (this will fail if no file)
455 records = self.getStoredItemsInfo(ref)
457 # Use the path to determine the location -- we need to take
458 # into account absolute URIs in the datastore record
459 return [(r.file_location(self.locationFactory), r) for r in records]
461 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
462 """Check that there is only one dataset associated with the
463 specified artifact.
465 Parameters
466 ----------
467 ref : `DatasetRef` or `FakeDatasetRef`
468 Dataset to be removed.
469 location : `Location`
470 The location of the artifact to be removed.
472 Returns
473 -------
474 can_remove : `Bool`
475 True if the artifact can be safely removed.
476 """
477 # Can't ever delete absolute URIs.
478 if location.pathInStore.isabs():
479 return False
481 # Get all entries associated with this path
482 allRefs = self._registered_refs_per_artifact(location.pathInStore)
483 if not allRefs:
484 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
486 # Remove these refs from all the refs and if there is nothing left
487 # then we can delete
488 remainingRefs = allRefs - {ref.id}
490 if remainingRefs:
491 return False
492 return True
494 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
495 """Predict the location and related file information of the requested
496 dataset in this datastore.
498 Parameters
499 ----------
500 ref : `DatasetRef`
501 Reference to the required `Dataset`.
503 Returns
504 -------
505 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
506 Expected Location of the dataset within the datastore and
507 placeholder information about each file and its formatter.
509 Notes
510 -----
511 Uses the current configuration to determine how we would expect the
512 datastore files to have been written if we couldn't ask registry.
513 This is safe so long as there has been no change to datastore
514 configuration between writing the dataset and wanting to read it.
515 Will not work for files that have been ingested without using the
516 standard file template or default formatter.
517 """
518 # If we have a component ref we always need to ask the questions
519 # of the composite. If the composite is disassembled this routine
520 # should return all components. If the composite was not
521 # disassembled the composite is what is stored regardless of
522 # component request. Note that if the caller has disassembled
523 # a composite there is no way for this guess to know that
524 # without trying both the composite and component ref and seeing
525 # if there is something at the component Location even without
526 # disassembly being enabled.
527 if ref.datasetType.isComponent():
528 ref = ref.makeCompositeRef()
530 # See if the ref is a composite that should be disassembled
531 doDisassembly = self.composites.shouldBeDisassembled(ref)
533 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
535 if doDisassembly:
536 for component, componentStorage in ref.datasetType.storageClass.components.items():
537 compRef = ref.makeComponentRef(component)
538 location, formatter = self._determine_put_formatter_location(compRef)
539 all_info.append((location, formatter, componentStorage, component))
541 else:
542 # Always use the composite ref if no disassembly
543 location, formatter = self._determine_put_formatter_location(ref)
544 all_info.append((location, formatter, ref.datasetType.storageClass, None))
546 # Convert the list of tuples to have StoredFileInfo as second element
547 return [
548 (
549 location,
550 StoredFileInfo(
551 formatter=formatter,
552 path=location.pathInStore.path,
553 storageClass=storageClass,
554 component=component,
555 checksum=None,
556 file_size=-1,
557 dataset_id=ref.id,
558 ),
559 )
560 for location, formatter, storageClass, component in all_info
561 ]
563 def _prepare_for_get(
564 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
565 ) -> list[DatastoreFileGetInformation]:
566 """Check parameters for ``get`` and obtain formatter and
567 location.
569 Parameters
570 ----------
571 ref : `DatasetRef`
572 Reference to the required Dataset.
573 parameters : `dict`
574 `StorageClass`-specific parameters that specify, for example,
575 a slice of the dataset to be loaded.
577 Returns
578 -------
579 getInfo : `list` [`DatastoreFileGetInformation`]
580 Parameters needed to retrieve each file.
581 """
582 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
584 # The storage class we want to use eventually
585 refStorageClass = ref.datasetType.storageClass
587 # For trusted mode need to reset storage class.
588 ref = self._cast_storage_class(ref)
590 # Get file metadata and internal metadata
591 fileLocations = self._get_dataset_locations_info(ref)
592 if not fileLocations:
593 if not self.trustGetRequest:
594 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
595 # Assume the dataset is where we think it should be
596 fileLocations = self._get_expected_dataset_locations_info(ref)
598 if len(fileLocations) > 1:
599 disassembled = True
601 # If trust is involved it is possible that there will be
602 # components listed here that do not exist in the datastore.
603 # Explicitly check for file artifact existence and filter out any
604 # that are missing.
605 if self.trustGetRequest:
606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
608 # For now complain only if we have no components at all. One
609 # component is probably a problem but we can punt that to the
610 # assembler.
611 if not fileLocations:
612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
614 else:
615 disassembled = False
617 # Is this a component request?
618 refComponent = ref.datasetType.component()
620 fileGetInfo = []
621 for location, storedFileInfo in fileLocations:
622 # The storage class used to write the file
623 writeStorageClass = storedFileInfo.storageClass
625 # If this has been disassembled we need read to match the write
626 if disassembled:
627 readStorageClass = writeStorageClass
628 else:
629 readStorageClass = refStorageClass
631 formatter = get_instance_of(
632 storedFileInfo.formatter,
633 FileDescriptor(
634 location,
635 readStorageClass=readStorageClass,
636 storageClass=writeStorageClass,
637 parameters=parameters,
638 ),
639 ref.dataId,
640 )
642 formatterParams, notFormatterParams = formatter.segregateParameters()
644 # Of the remaining parameters, extract the ones supported by
645 # this StorageClass (for components not all will be handled)
646 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
648 # The ref itself could be a component if the dataset was
649 # disassembled by butler, or we disassembled in datastore and
650 # components came from the datastore records
651 component = storedFileInfo.component if storedFileInfo.component else refComponent
653 fileGetInfo.append(
654 DatastoreFileGetInformation(
655 location,
656 formatter,
657 storedFileInfo,
658 assemblerParams,
659 formatterParams,
660 component,
661 readStorageClass,
662 )
663 )
665 return fileGetInfo
667 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
668 """Check the arguments for ``put`` and obtain formatter and
669 location.
671 Parameters
672 ----------
673 inMemoryDataset : `object`
674 The dataset to store.
675 ref : `DatasetRef`
676 Reference to the associated Dataset.
678 Returns
679 -------
680 location : `Location`
681 The location to write the dataset.
682 formatter : `Formatter`
683 The `Formatter` to use to write the dataset.
685 Raises
686 ------
687 TypeError
688 Supplied object and storage class are inconsistent.
689 DatasetTypeNotSupportedError
690 The associated `DatasetType` is not handled by this datastore.
691 """
692 self._validate_put_parameters(inMemoryDataset, ref)
693 return self._determine_put_formatter_location(ref)
695 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
696 """Calculate the formatter and output location to use for put.
698 Parameters
699 ----------
700 ref : `DatasetRef`
701 Reference to the associated Dataset.
703 Returns
704 -------
705 location : `Location`
706 The location to write the dataset.
707 formatter : `Formatter`
708 The `Formatter` to use to write the dataset.
709 """
710 # Work out output file name
711 try:
712 template = self.templates.getTemplate(ref)
713 except KeyError as e:
714 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
716 # Validate the template to protect against filenames from different
717 # dataIds returning the same and causing overwrite confusion.
718 template.validateTemplate(ref)
720 location = self.locationFactory.fromPath(template.format(ref))
722 # Get the formatter based on the storage class
723 storageClass = ref.datasetType.storageClass
724 try:
725 formatter = self.formatterFactory.getFormatter(
726 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
727 )
728 except KeyError as e:
729 raise DatasetTypeNotSupportedError(
730 f"Unable to find formatter for {ref} in datastore {self.name}"
731 ) from e
733 # Now that we know the formatter, update the location
734 location = formatter.makeUpdatedLocation(location)
736 return location, formatter
738 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
739 # Docstring inherited from base class
740 if transfer != "auto":
741 return transfer
743 # See if the paths are within the datastore or not
744 inside = [self._pathInStore(d.path) is not None for d in datasets]
746 if all(inside):
747 transfer = None
748 elif not any(inside):
749 # Allow ResourcePath to use its own knowledge
750 transfer = "auto"
751 else:
752 # This can happen when importing from a datastore that
753 # has had some datasets ingested using "direct" mode.
754 # Also allow ResourcePath to sort it out but warn about it.
755 # This can happen if you are importing from a datastore
756 # that had some direct transfer datasets.
757 log.warning(
758 "Some datasets are inside the datastore and some are outside. Using 'split' "
759 "transfer mode. This assumes that the files outside the datastore are "
760 "still accessible to the new butler since they will not be copied into "
761 "the target datastore."
762 )
763 transfer = "split"
765 return transfer
767 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
768 """Return path relative to datastore root.
770 Parameters
771 ----------
772 path : `lsst.resources.ResourcePathExpression`
773 Path to dataset. Can be absolute URI. If relative assumed to
774 be relative to the datastore. Returns path in datastore
775 or raises an exception if the path it outside.
777 Returns
778 -------
779 inStore : `str`
780 Path relative to datastore root. Returns `None` if the file is
781 outside the root.
782 """
783 # Relative path will always be relative to datastore
784 pathUri = ResourcePath(path, forceAbsolute=False)
785 return pathUri.relative_to(self.root)
787 def _standardizeIngestPath(
788 self, path: str | ResourcePath, *, transfer: str | None = None
789 ) -> str | ResourcePath:
790 """Standardize the path of a to-be-ingested file.
792 Parameters
793 ----------
794 path : `str` or `lsst.resources.ResourcePath`
795 Path of a file to be ingested. This parameter is not expected
796 to be all the types that can be used to construct a
797 `~lsst.resources.ResourcePath`.
798 transfer : `str`, optional
799 How (and whether) the dataset should be added to the datastore.
800 See `ingest` for details of transfer modes.
801 This implementation is provided only so
802 `NotImplementedError` can be raised if the mode is not supported;
803 actual transfers are deferred to `_extractIngestInfo`.
805 Returns
806 -------
807 path : `str` or `lsst.resources.ResourcePath`
808 New path in what the datastore considers standard form. If an
809 absolute URI was given that will be returned unchanged.
811 Notes
812 -----
813 Subclasses of `FileDatastore` can implement this method instead
814 of `_prepIngest`. It should not modify the data repository or given
815 file in any way.
817 Raises
818 ------
819 NotImplementedError
820 Raised if the datastore does not support the given transfer mode
821 (including the case where ingest is not supported at all).
822 FileNotFoundError
823 Raised if one of the given files does not exist.
824 """
825 if transfer not in (None, "direct", "split") + self.root.transferModes:
826 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
828 # A relative URI indicates relative to datastore root
829 srcUri = ResourcePath(path, forceAbsolute=False)
830 if not srcUri.isabs():
831 srcUri = self.root.join(path)
833 if not srcUri.exists():
834 raise FileNotFoundError(
835 f"Resource at {srcUri} does not exist; note that paths to ingest "
836 f"are assumed to be relative to {self.root} unless they are absolute."
837 )
839 if transfer is None:
840 relpath = srcUri.relative_to(self.root)
841 if not relpath:
842 raise RuntimeError(
843 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
844 )
846 # Return the relative path within the datastore for internal
847 # transfer
848 path = relpath
850 return path
852 def _extractIngestInfo(
853 self,
854 path: ResourcePathExpression,
855 ref: DatasetRef,
856 *,
857 formatter: Formatter | type[Formatter],
858 transfer: str | None = None,
859 record_validation_info: bool = True,
860 ) -> StoredFileInfo:
861 """Relocate (if necessary) and extract `StoredFileInfo` from a
862 to-be-ingested file.
864 Parameters
865 ----------
866 path : `lsst.resources.ResourcePathExpression`
867 URI or path of a file to be ingested.
868 ref : `DatasetRef`
869 Reference for the dataset being ingested. Guaranteed to have
870 ``dataset_id not None`.
871 formatter : `type` or `Formatter`
872 `Formatter` subclass to use for this dataset or an instance.
873 transfer : `str`, optional
874 How (and whether) the dataset should be added to the datastore.
875 See `ingest` for details of transfer modes.
876 record_validation_info : `bool`, optional
877 If `True`, the default, the datastore can record validation
878 information associated with the file. If `False` the datastore
879 will not attempt to track any information such as checksums
880 or file sizes. This can be useful if such information is tracked
881 in an external system or if the file is to be compressed in place.
882 It is up to the datastore whether this parameter is relevant.
884 Returns
885 -------
886 info : `StoredFileInfo`
887 Internal datastore record for this file. This will be inserted by
888 the caller; the `_extractIngestInfo` is only responsible for
889 creating and populating the struct.
891 Raises
892 ------
893 FileNotFoundError
894 Raised if one of the given files does not exist.
895 FileExistsError
896 Raised if transfer is not `None` but the (internal) location the
897 file would be moved to is already occupied.
898 """
899 if self._transaction is None:
900 raise RuntimeError("Ingest called without transaction enabled")
902 # Create URI of the source path, do not need to force a relative
903 # path to absolute.
904 srcUri = ResourcePath(path, forceAbsolute=False)
906 # Track whether we have read the size of the source yet
907 have_sized = False
909 tgtLocation: Location | None
910 if transfer is None or transfer == "split":
911 # A relative path is assumed to be relative to the datastore
912 # in this context
913 if not srcUri.isabs():
914 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
915 else:
916 # Work out the path in the datastore from an absolute URI
917 # This is required to be within the datastore.
918 pathInStore = srcUri.relative_to(self.root)
919 if pathInStore is None and transfer is None:
920 raise RuntimeError(
921 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
922 )
923 if pathInStore:
924 tgtLocation = self.locationFactory.fromPath(pathInStore)
925 elif transfer == "split":
926 # Outside the datastore but treat that as a direct ingest
927 # instead.
928 tgtLocation = None
929 else:
930 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
931 elif transfer == "direct":
932 # Want to store the full URI to the resource directly in
933 # datastore. This is useful for referring to permanent archive
934 # storage for raw data.
935 # Trust that people know what they are doing.
936 tgtLocation = None
937 else:
938 # Work out the name we want this ingested file to have
939 # inside the datastore
940 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
941 if not tgtLocation.uri.dirname().exists():
942 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
943 tgtLocation.uri.dirname().mkdir()
945 # if we are transferring from a local file to a remote location
946 # it may be more efficient to get the size and checksum of the
947 # local file rather than the transferred one
948 if record_validation_info and srcUri.isLocal:
949 size = srcUri.size()
950 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
951 have_sized = True
953 # Transfer the resource to the destination.
954 # Allow overwrite of an existing file. This matches the behavior
955 # of datastore.put() in that it trusts that registry would not
956 # be asking to overwrite unless registry thought that the
957 # overwrite was allowed.
958 tgtLocation.uri.transfer_from(
959 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
960 )
962 if tgtLocation is None:
963 # This means we are using direct mode
964 targetUri = srcUri
965 targetPath = str(srcUri)
966 else:
967 targetUri = tgtLocation.uri
968 targetPath = tgtLocation.pathInStore.path
970 # the file should exist in the datastore now
971 if record_validation_info:
972 if not have_sized:
973 size = targetUri.size()
974 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
975 else:
976 # Not recording any file information.
977 size = -1
978 checksum = None
980 return StoredFileInfo(
981 formatter=formatter,
982 path=targetPath,
983 storageClass=ref.datasetType.storageClass,
984 component=ref.datasetType.component(),
985 file_size=size,
986 checksum=checksum,
987 dataset_id=ref.id,
988 )
990 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
991 # Docstring inherited from Datastore._prepIngest.
992 filtered = []
993 for dataset in datasets:
994 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
995 if not acceptable:
996 continue
997 else:
998 dataset.refs = acceptable
999 if dataset.formatter is None:
1000 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1001 else:
1002 assert isinstance(dataset.formatter, (type, str))
1003 formatter_class = get_class_of(dataset.formatter)
1004 if not issubclass(formatter_class, Formatter):
1005 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1006 dataset.formatter = formatter_class
1007 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1008 filtered.append(dataset)
1009 return _IngestPrepData(filtered)
1011 @transactional
1012 def _finishIngest(
1013 self,
1014 prepData: Datastore.IngestPrepData,
1015 *,
1016 transfer: str | None = None,
1017 record_validation_info: bool = True,
1018 ) -> None:
1019 # Docstring inherited from Datastore._finishIngest.
1020 refsAndInfos = []
1021 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1022 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1023 # Do ingest as if the first dataset ref is associated with the file
1024 info = self._extractIngestInfo(
1025 dataset.path,
1026 dataset.refs[0],
1027 formatter=dataset.formatter,
1028 transfer=transfer,
1029 record_validation_info=record_validation_info,
1030 )
1031 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1032 self._register_datasets(refsAndInfos)
1034 def _calculate_ingested_datastore_name(
1035 self,
1036 srcUri: ResourcePath,
1037 ref: DatasetRef,
1038 formatter: Formatter | type[Formatter] | None = None,
1039 ) -> Location:
1040 """Given a source URI and a DatasetRef, determine the name the
1041 dataset will have inside datastore.
1043 Parameters
1044 ----------
1045 srcUri : `lsst.resources.ResourcePath`
1046 URI to the source dataset file.
1047 ref : `DatasetRef`
1048 Ref associated with the newly-ingested dataset artifact. This
1049 is used to determine the name within the datastore.
1050 formatter : `Formatter` or Formatter class.
1051 Formatter to use for validation. Can be a class or an instance.
1052 No validation of the file extension is performed if the
1053 ``formatter`` is `None`. This can be used if the caller knows
1054 that the source URI and target URI will use the same formatter.
1056 Returns
1057 -------
1058 location : `Location`
1059 Target location for the newly-ingested dataset.
1060 """
1061 # Ingesting a file from outside the datastore.
1062 # This involves a new name.
1063 template = self.templates.getTemplate(ref)
1064 location = self.locationFactory.fromPath(template.format(ref))
1066 # Get the extension
1067 ext = srcUri.getExtension()
1069 # Update the destination to include that extension
1070 location.updateExtension(ext)
1072 # Ask the formatter to validate this extension
1073 if formatter is not None:
1074 formatter.validateExtension(location)
1076 return location
1078 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1079 """Write out in memory dataset to datastore.
1081 Parameters
1082 ----------
1083 inMemoryDataset : `object`
1084 Dataset to write to datastore.
1085 ref : `DatasetRef`
1086 Registry information associated with this dataset.
1088 Returns
1089 -------
1090 info : `StoredFileInfo`
1091 Information describing the artifact written to the datastore.
1092 """
1093 # May need to coerce the in memory dataset to the correct
1094 # python type, but first we need to make sure the storage class
1095 # reflects the one defined in the data repository.
1096 ref = self._cast_storage_class(ref)
1097 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1099 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1100 uri = location.uri
1102 if not uri.dirname().exists():
1103 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1104 uri.dirname().mkdir()
1106 if self._transaction is None:
1107 raise RuntimeError("Attempting to write artifact without transaction enabled")
1109 def _removeFileExists(uri: ResourcePath) -> None:
1110 """Remove a file and do not complain if it is not there.
1112 This is important since a formatter might fail before the file
1113 is written and we should not confuse people by writing spurious
1114 error messages to the log.
1115 """
1116 try:
1117 uri.remove()
1118 except FileNotFoundError:
1119 pass
1121 # Register a callback to try to delete the uploaded data if
1122 # something fails below
1123 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1125 data_written = False
1126 if not uri.isLocal:
1127 # This is a remote URI. Some datasets can be serialized directly
1128 # to bytes and sent to the remote datastore without writing a
1129 # file. If the dataset is intended to be saved to the cache
1130 # a file is always written and direct write to the remote
1131 # datastore is bypassed.
1132 if not self.cacheManager.should_be_cached(ref):
1133 try:
1134 serializedDataset = formatter.toBytes(inMemoryDataset)
1135 except NotImplementedError:
1136 # Fallback to the file writing option.
1137 pass
1138 except Exception as e:
1139 raise RuntimeError(
1140 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1141 ) from e
1142 else:
1143 log.debug("Writing bytes directly to %s", uri)
1144 uri.write(serializedDataset, overwrite=True)
1145 log.debug("Successfully wrote bytes directly to %s", uri)
1146 data_written = True
1148 if not data_written:
1149 # Did not write the bytes directly to object store so instead
1150 # write to temporary file. Always write to a temporary even if
1151 # using a local file system -- that gives us atomic writes.
1152 # If a process is killed as the file is being written we do not
1153 # want it to remain in the correct place but in corrupt state.
1154 # For local files write to the output directory not temporary dir.
1155 prefix = uri.dirname() if uri.isLocal else None
1156 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1157 # Need to configure the formatter to write to a different
1158 # location and that needs us to overwrite internals
1159 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1160 with formatter._updateLocation(Location(None, temporary_uri)):
1161 try:
1162 formatter.write(inMemoryDataset)
1163 except Exception as e:
1164 raise RuntimeError(
1165 f"Failed to serialize dataset {ref} of type"
1166 f" {type(inMemoryDataset)} to "
1167 f"temporary location {temporary_uri}"
1168 ) from e
1170 # Use move for a local file since that becomes an efficient
1171 # os.rename. For remote resources we use copy to allow the
1172 # file to be cached afterwards.
1173 transfer = "move" if uri.isLocal else "copy"
1175 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1177 if transfer == "copy":
1178 # Cache if required
1179 self.cacheManager.move_to_cache(temporary_uri, ref)
1181 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1183 # URI is needed to resolve what ingest case are we dealing with
1184 return self._extractIngestInfo(uri, ref, formatter=formatter)
1186 def _read_artifact_into_memory(
1187 self,
1188 getInfo: DatastoreFileGetInformation,
1189 ref: DatasetRef,
1190 isComponent: bool = False,
1191 cache_ref: DatasetRef | None = None,
1192 ) -> Any:
1193 """Read the artifact from datastore into in memory object.
1195 Parameters
1196 ----------
1197 getInfo : `DatastoreFileGetInformation`
1198 Information about the artifact within the datastore.
1199 ref : `DatasetRef`
1200 The registry information associated with this artifact.
1201 isComponent : `bool`
1202 Flag to indicate if a component is being read from this artifact.
1203 cache_ref : `DatasetRef`, optional
1204 The DatasetRef to use when looking up the file in the cache.
1205 This ref must have the same ID as the supplied ref but can
1206 be a parent ref or component ref to indicate to the cache whether
1207 a composite file is being requested from the cache or a component
1208 file. Without this the cache will default to the supplied ref but
1209 it can get confused with read-only derived components for
1210 disassembled composites.
1212 Returns
1213 -------
1214 inMemoryDataset : `object`
1215 The artifact as a python object.
1216 """
1217 location = getInfo.location
1218 uri = location.uri
1219 log.debug("Accessing data from %s", uri)
1221 if cache_ref is None:
1222 cache_ref = ref
1223 if cache_ref.id != ref.id:
1224 raise ValueError(
1225 "The supplied cache dataset ref refers to a different dataset than expected:"
1226 f" {ref.id} != {cache_ref.id}"
1227 )
1229 # Cannot recalculate checksum but can compare size as a quick check
1230 # Do not do this if the size is negative since that indicates
1231 # we do not know.
1232 recorded_size = getInfo.info.file_size
1233 resource_size = uri.size()
1234 if recorded_size >= 0 and resource_size != recorded_size:
1235 raise RuntimeError(
1236 "Integrity failure in Datastore. "
1237 f"Size of file {uri} ({resource_size}) "
1238 f"does not match size recorded in registry of {recorded_size}"
1239 )
1241 # For the general case we have choices for how to proceed.
1242 # 1. Always use a local file (downloading the remote resource to a
1243 # temporary file if needed).
1244 # 2. Use a threshold size and read into memory and use bytes.
1245 # Use both for now with an arbitrary hand off size.
1246 # This allows small datasets to be downloaded from remote object
1247 # stores without requiring a temporary file.
1249 formatter = getInfo.formatter
1250 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1251 if resource_size <= nbytes_max and formatter.can_read_bytes():
1252 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1253 if cached_file is not None:
1254 desired_uri = cached_file
1255 msg = f" (cached version of {uri})"
1256 else:
1257 desired_uri = uri
1258 msg = ""
1259 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1260 serializedDataset = desired_uri.read()
1261 log.debug(
1262 "Deserializing %s from %d bytes from location %s with formatter %s",
1263 f"component {getInfo.component}" if isComponent else "",
1264 len(serializedDataset),
1265 uri,
1266 formatter.name(),
1267 )
1268 try:
1269 result = formatter.fromBytes(
1270 serializedDataset, component=getInfo.component if isComponent else None
1271 )
1272 except Exception as e:
1273 raise ValueError(
1274 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1275 f" ({ref.datasetType.name} from {uri}): {e}"
1276 ) from e
1277 else:
1278 # Read from file.
1280 # Have to update the Location associated with the formatter
1281 # because formatter.read does not allow an override.
1282 # This could be improved.
1283 location_updated = False
1284 msg = ""
1286 # First check in cache for local version.
1287 # The cache will only be relevant for remote resources but
1288 # no harm in always asking. Context manager ensures that cache
1289 # file is not deleted during cache expiration.
1290 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1291 if cached_file is not None:
1292 msg = f"(via cache read of remote file {uri})"
1293 uri = cached_file
1294 location_updated = True
1296 with uri.as_local() as local_uri:
1297 can_be_cached = False
1298 if uri != local_uri:
1299 # URI was remote and file was downloaded
1300 cache_msg = ""
1301 location_updated = True
1303 if self.cacheManager.should_be_cached(cache_ref):
1304 # In this scenario we want to ask if the downloaded
1305 # file should be cached but we should not cache
1306 # it until after we've used it (to ensure it can't
1307 # be expired whilst we are using it).
1308 can_be_cached = True
1310 # Say that it is "likely" to be cached because
1311 # if the formatter read fails we will not be
1312 # caching this file.
1313 cache_msg = " and likely cached"
1315 msg = f"(via download to local file{cache_msg})"
1317 # Calculate the (possibly) new location for the formatter
1318 # to use.
1319 newLocation = Location(*local_uri.split()) if location_updated else None
1321 log.debug(
1322 "Reading%s from location %s %s with formatter %s",
1323 f" component {getInfo.component}" if isComponent else "",
1324 uri,
1325 msg,
1326 formatter.name(),
1327 )
1328 try:
1329 with formatter._updateLocation(newLocation):
1330 with time_this(
1331 log,
1332 msg="Reading%s from location %s %s with formatter %s",
1333 args=(
1334 f" component {getInfo.component}" if isComponent else "",
1335 uri,
1336 msg,
1337 formatter.name(),
1338 ),
1339 ):
1340 result = formatter.read(component=getInfo.component if isComponent else None)
1341 except Exception as e:
1342 raise ValueError(
1343 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1344 f" ({ref.datasetType.name} from {uri}): {e}"
1345 ) from e
1347 # File was read successfully so can move to cache
1348 if can_be_cached:
1349 self.cacheManager.move_to_cache(local_uri, cache_ref)
1351 return self._post_process_get(
1352 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
1353 )
1355 def knows(self, ref: DatasetRef) -> bool:
1356 """Check if the dataset is known to the datastore.
1358 Does not check for existence of any artifact.
1360 Parameters
1361 ----------
1362 ref : `DatasetRef`
1363 Reference to the required dataset.
1365 Returns
1366 -------
1367 exists : `bool`
1368 `True` if the dataset is known to the datastore.
1369 """
1370 fileLocations = self._get_dataset_locations_info(ref)
1371 if fileLocations:
1372 return True
1373 return False
1375 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1376 # Docstring inherited from the base class.
1378 # The records themselves. Could be missing some entries.
1379 records = self._get_stored_records_associated_with_refs(refs)
1381 return {ref: ref.id in records for ref in refs}
1383 def _process_mexists_records(
1384 self,
1385 id_to_ref: dict[DatasetId, DatasetRef],
1386 records: dict[DatasetId, list[StoredFileInfo]],
1387 all_required: bool,
1388 artifact_existence: dict[ResourcePath, bool] | None = None,
1389 ) -> dict[DatasetRef, bool]:
1390 """Check given records for existence.
1392 Helper function for `mexists()`.
1394 Parameters
1395 ----------
1396 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1397 Mapping of the dataset ID to the dataset ref itself.
1398 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1399 Records as generally returned by
1400 ``_get_stored_records_associated_with_refs``.
1401 all_required : `bool`
1402 Flag to indicate whether existence requires all artifacts
1403 associated with a dataset ID to exist or not for existence.
1404 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1405 Optional mapping of datastore artifact to existence. Updated by
1406 this method with details of all artifacts tested. Can be `None`
1407 if the caller is not interested.
1409 Returns
1410 -------
1411 existence : `dict` of [`DatasetRef`, `bool`]
1412 Mapping from dataset to boolean indicating existence.
1413 """
1414 # The URIs to be checked and a mapping of those URIs to
1415 # the dataset ID.
1416 uris_to_check: list[ResourcePath] = []
1417 location_map: dict[ResourcePath, DatasetId] = {}
1419 location_factory = self.locationFactory
1421 uri_existence: dict[ResourcePath, bool] = {}
1422 for ref_id, infos in records.items():
1423 # Key is the dataset Id, value is list of StoredItemInfo
1424 uris = [info.file_location(location_factory).uri for info in infos]
1425 location_map.update({uri: ref_id for uri in uris})
1427 # Check the local cache directly for a dataset corresponding
1428 # to the remote URI.
1429 if self.cacheManager.file_count > 0:
1430 ref = id_to_ref[ref_id]
1431 for uri, storedFileInfo in zip(uris, infos):
1432 check_ref = ref
1433 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1434 check_ref = ref.makeComponentRef(component)
1435 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1436 # Proxy for URI existence.
1437 uri_existence[uri] = True
1438 else:
1439 uris_to_check.append(uri)
1440 else:
1441 # Check all of them.
1442 uris_to_check.extend(uris)
1444 if artifact_existence is not None:
1445 # If a URI has already been checked remove it from the list
1446 # and immediately add the status to the output dict.
1447 filtered_uris_to_check = []
1448 for uri in uris_to_check:
1449 if uri in artifact_existence:
1450 uri_existence[uri] = artifact_existence[uri]
1451 else:
1452 filtered_uris_to_check.append(uri)
1453 uris_to_check = filtered_uris_to_check
1455 # Results.
1456 dataset_existence: dict[DatasetRef, bool] = {}
1458 uri_existence.update(ResourcePath.mexists(uris_to_check))
1459 for uri, exists in uri_existence.items():
1460 dataset_id = location_map[uri]
1461 ref = id_to_ref[dataset_id]
1463 # Disassembled composite needs to check all locations.
1464 # all_required indicates whether all need to exist or not.
1465 if ref in dataset_existence:
1466 if all_required:
1467 exists = dataset_existence[ref] and exists
1468 else:
1469 exists = dataset_existence[ref] or exists
1470 dataset_existence[ref] = exists
1472 if artifact_existence is not None:
1473 artifact_existence.update(uri_existence)
1475 return dataset_existence
1477 def mexists(
1478 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1479 ) -> dict[DatasetRef, bool]:
1480 """Check the existence of multiple datasets at once.
1482 Parameters
1483 ----------
1484 refs : iterable of `DatasetRef`
1485 The datasets to be checked.
1486 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1487 Optional mapping of datastore artifact to existence. Updated by
1488 this method with details of all artifacts tested. Can be `None`
1489 if the caller is not interested.
1491 Returns
1492 -------
1493 existence : `dict` of [`DatasetRef`, `bool`]
1494 Mapping from dataset to boolean indicating existence.
1496 Notes
1497 -----
1498 To minimize potentially costly remote existence checks, the local
1499 cache is checked as a proxy for existence. If a file for this
1500 `DatasetRef` does exist no check is done for the actual URI. This
1501 could result in possibly unexpected behavior if the dataset itself
1502 has been removed from the datastore by another process whilst it is
1503 still in the cache.
1504 """
1505 chunk_size = 10_000
1506 dataset_existence: dict[DatasetRef, bool] = {}
1507 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1508 n_found_total = 0
1509 n_checked = 0
1510 n_chunks = 0
1511 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1512 chunk_result = self._mexists(chunk, artifact_existence)
1514 # The log message level and content depend on how many
1515 # datasets we are processing.
1516 n_results = len(chunk_result)
1518 # Use verbose logging to ensure that messages can be seen
1519 # easily if many refs are being checked.
1520 log_threshold = VERBOSE
1521 n_checked += n_results
1523 # This sum can take some time so only do it if we know the
1524 # result is going to be used.
1525 n_found = 0
1526 if log.isEnabledFor(log_threshold):
1527 # Can treat the booleans as 0, 1 integers and sum them.
1528 n_found = sum(chunk_result.values())
1529 n_found_total += n_found
1531 # We are deliberately not trying to count the number of refs
1532 # provided in case it's in the millions. This means there is a
1533 # situation where the number of refs exactly matches the chunk
1534 # size and we will switch to the multi-chunk path even though
1535 # we only have a single chunk.
1536 if n_results < chunk_size and n_chunks == 0:
1537 # Single chunk will be processed so we can provide more detail.
1538 if n_results == 1:
1539 ref = list(chunk_result)[0]
1540 # Use debug logging to be consistent with `exists()`.
1541 log.debug(
1542 "Calling mexists() with single ref that does%s exist (%s).",
1543 "" if chunk_result[ref] else " not",
1544 ref,
1545 )
1546 else:
1547 # Single chunk but multiple files. Summarize.
1548 log.log(
1549 log_threshold,
1550 "Number of datasets found in datastore: %d out of %d datasets checked.",
1551 n_found,
1552 n_checked,
1553 )
1555 else:
1556 # Use incremental verbose logging when we have multiple chunks.
1557 log.log(
1558 log_threshold,
1559 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1560 "(running total from all chunks so far: %d found out of %d checked)",
1561 n_chunks,
1562 n_found,
1563 n_results,
1564 n_found_total,
1565 n_checked,
1566 )
1567 dataset_existence.update(chunk_result)
1568 n_chunks += 1
1570 return dataset_existence
1572 def _mexists(
1573 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1574 ) -> dict[DatasetRef, bool]:
1575 """Check the existence of multiple datasets at once.
1577 Parameters
1578 ----------
1579 refs : iterable of `DatasetRef`
1580 The datasets to be checked.
1581 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1582 Optional mapping of datastore artifact to existence. Updated by
1583 this method with details of all artifacts tested. Can be `None`
1584 if the caller is not interested.
1586 Returns
1587 -------
1588 existence : `dict` of [`DatasetRef`, `bool`]
1589 Mapping from dataset to boolean indicating existence.
1590 """
1591 # Make a mapping from refs with the internal storage class to the given
1592 # refs that may have a different one. We'll use the internal refs
1593 # throughout this method and convert back at the very end.
1594 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1596 # Need a mapping of dataset_id to (internal) dataset ref since some
1597 # internal APIs work with dataset_id.
1598 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1600 # Set of all IDs we are checking for.
1601 requested_ids = set(id_to_ref.keys())
1603 # The records themselves. Could be missing some entries.
1604 records = self._get_stored_records_associated_with_refs(id_to_ref.values())
1606 dataset_existence = self._process_mexists_records(
1607 id_to_ref, records, True, artifact_existence=artifact_existence
1608 )
1610 # Set of IDs that have been handled.
1611 handled_ids = {ref.id for ref in dataset_existence.keys()}
1613 missing_ids = requested_ids - handled_ids
1614 if missing_ids:
1615 dataset_existence.update(
1616 self._mexists_check_expected(
1617 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1618 )
1619 )
1621 return {
1622 internal_ref_to_input_ref[internal_ref]: existence
1623 for internal_ref, existence in dataset_existence.items()
1624 }
1626 def _mexists_check_expected(
1627 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1628 ) -> dict[DatasetRef, bool]:
1629 """Check existence of refs that are not known to datastore.
1631 Parameters
1632 ----------
1633 refs : iterable of `DatasetRef`
1634 The datasets to be checked. These are assumed not to be known
1635 to datastore.
1636 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1637 Optional mapping of datastore artifact to existence. Updated by
1638 this method with details of all artifacts tested. Can be `None`
1639 if the caller is not interested.
1641 Returns
1642 -------
1643 existence : `dict` of [`DatasetRef`, `bool`]
1644 Mapping from dataset to boolean indicating existence.
1645 """
1646 dataset_existence: dict[DatasetRef, bool] = {}
1647 if not self.trustGetRequest:
1648 # Must assume these do not exist
1649 for ref in refs:
1650 dataset_existence[ref] = False
1651 else:
1652 log.debug(
1653 "%d datasets were not known to datastore during initial existence check.",
1654 len(refs),
1655 )
1657 # Construct data structure identical to that returned
1658 # by _get_stored_records_associated_with_refs() but using
1659 # guessed names.
1660 records = {}
1661 id_to_ref = {}
1662 for missing_ref in refs:
1663 expected = self._get_expected_dataset_locations_info(missing_ref)
1664 dataset_id = missing_ref.id
1665 records[dataset_id] = [info for _, info in expected]
1666 id_to_ref[dataset_id] = missing_ref
1668 dataset_existence.update(
1669 self._process_mexists_records(
1670 id_to_ref,
1671 records,
1672 False,
1673 artifact_existence=artifact_existence,
1674 )
1675 )
1677 return dataset_existence
1679 def exists(self, ref: DatasetRef) -> bool:
1680 """Check if the dataset exists in the datastore.
1682 Parameters
1683 ----------
1684 ref : `DatasetRef`
1685 Reference to the required dataset.
1687 Returns
1688 -------
1689 exists : `bool`
1690 `True` if the entity exists in the `Datastore`.
1692 Notes
1693 -----
1694 The local cache is checked as a proxy for existence in the remote
1695 object store. It is possible that another process on a different
1696 compute node could remove the file from the object store even
1697 though it is present in the local cache.
1698 """
1699 ref = self._cast_storage_class(ref)
1700 fileLocations = self._get_dataset_locations_info(ref)
1702 # if we are being asked to trust that registry might not be correct
1703 # we ask for the expected locations and check them explicitly
1704 if not fileLocations:
1705 if not self.trustGetRequest:
1706 return False
1708 # First check the cache. If it is not found we must check
1709 # the datastore itself. Assume that any component in the cache
1710 # means that the dataset does exist somewhere.
1711 if self.cacheManager.known_to_cache(ref):
1712 return True
1714 # When we are guessing a dataset location we can not check
1715 # for the existence of every component since we can not
1716 # know if every component was written. Instead we check
1717 # for the existence of any of the expected locations.
1718 for location, _ in self._get_expected_dataset_locations_info(ref):
1719 if self._artifact_exists(location):
1720 return True
1721 return False
1723 # All listed artifacts must exist.
1724 for location, storedFileInfo in fileLocations:
1725 # Checking in cache needs the component ref.
1726 check_ref = ref
1727 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1728 check_ref = ref.makeComponentRef(component)
1729 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1730 continue
1732 if not self._artifact_exists(location):
1733 return False
1735 return True
1737 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1738 """Return URIs associated with dataset.
1740 Parameters
1741 ----------
1742 ref : `DatasetRef`
1743 Reference to the required dataset.
1744 predict : `bool`, optional
1745 If the datastore does not know about the dataset, should it
1746 return a predicted URI or not?
1748 Returns
1749 -------
1750 uris : `DatasetRefURIs`
1751 The URI to the primary artifact associated with this dataset (if
1752 the dataset was disassembled within the datastore this may be
1753 `None`), and the URIs to any components associated with the dataset
1754 artifact. (can be empty if there are no components).
1755 """
1756 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1757 return many[ref]
1759 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1760 """URI to the Dataset.
1762 Parameters
1763 ----------
1764 ref : `DatasetRef`
1765 Reference to the required Dataset.
1766 predict : `bool`
1767 If `True`, allow URIs to be returned of datasets that have not
1768 been written.
1770 Returns
1771 -------
1772 uri : `str`
1773 URI pointing to the dataset within the datastore. If the
1774 dataset does not exist in the datastore, and if ``predict`` is
1775 `True`, the URI will be a prediction and will include a URI
1776 fragment "#predicted".
1777 If the datastore does not have entities that relate well
1778 to the concept of a URI the returned URI will be
1779 descriptive. The returned URI is not guaranteed to be obtainable.
1781 Raises
1782 ------
1783 FileNotFoundError
1784 Raised if a URI has been requested for a dataset that does not
1785 exist and guessing is not allowed.
1786 RuntimeError
1787 Raised if a request is made for a single URI but multiple URIs
1788 are associated with this dataset.
1790 Notes
1791 -----
1792 When a predicted URI is requested an attempt will be made to form
1793 a reasonable URI based on file templates and the expected formatter.
1794 """
1795 primary, components = self.getURIs(ref, predict)
1796 if primary is None or components:
1797 raise RuntimeError(
1798 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1799 )
1800 return primary
1802 def _predict_URIs(
1803 self,
1804 ref: DatasetRef,
1805 ) -> DatasetRefURIs:
1806 """Predict the URIs of a dataset ref.
1808 Parameters
1809 ----------
1810 ref : `DatasetRef`
1811 Reference to the required Dataset.
1813 Returns
1814 -------
1815 URI : DatasetRefUris
1816 Primary and component URIs. URIs will contain a URI fragment
1817 "#predicted".
1818 """
1819 uris = DatasetRefURIs()
1821 if self.composites.shouldBeDisassembled(ref):
1822 for component, _ in ref.datasetType.storageClass.components.items():
1823 comp_ref = ref.makeComponentRef(component)
1824 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1826 # Add the "#predicted" URI fragment to indicate this is a
1827 # guess
1828 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1830 else:
1831 location, _ = self._determine_put_formatter_location(ref)
1833 # Add the "#predicted" URI fragment to indicate this is a guess
1834 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1836 return uris
1838 def getManyURIs(
1839 self,
1840 refs: Iterable[DatasetRef],
1841 predict: bool = False,
1842 allow_missing: bool = False,
1843 ) -> dict[DatasetRef, DatasetRefURIs]:
1844 # Docstring inherited
1846 uris: dict[DatasetRef, DatasetRefURIs] = {}
1848 records = self._get_stored_records_associated_with_refs(refs)
1849 records_keys = records.keys()
1851 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1852 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1854 # Have to handle trustGetRequest mode by checking for the existence
1855 # of the missing refs on disk.
1856 if missing_refs:
1857 dataset_existence = self._mexists_check_expected(missing_refs, None)
1858 really_missing = set()
1859 not_missing = set()
1860 for ref, exists in dataset_existence.items():
1861 if exists:
1862 not_missing.add(ref)
1863 else:
1864 really_missing.add(ref)
1866 if not_missing:
1867 # Need to recalculate the missing/existing split.
1868 existing_refs = existing_refs + tuple(not_missing)
1869 missing_refs = tuple(really_missing)
1871 for ref in missing_refs:
1872 # if this has never been written then we have to guess
1873 if not predict:
1874 if not allow_missing:
1875 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1876 else:
1877 uris[ref] = self._predict_URIs(ref)
1879 for ref in existing_refs:
1880 file_infos = records[ref.id]
1881 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1882 uris[ref] = self._locations_to_URI(ref, file_locations)
1884 return uris
1886 def _locations_to_URI(
1887 self,
1888 ref: DatasetRef,
1889 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1890 ) -> DatasetRefURIs:
1891 """Convert one or more file locations associated with a DatasetRef
1892 to a DatasetRefURIs.
1894 Parameters
1895 ----------
1896 ref : `DatasetRef`
1897 Reference to the dataset.
1898 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
1899 Each item in the sequence is the location of the dataset within the
1900 datastore and stored information about the file and its formatter.
1901 If there is only one item in the sequence then it is treated as the
1902 primary URI. If there is more than one item then they are treated
1903 as component URIs. If there are no items then an error is raised
1904 unless ``self.trustGetRequest`` is `True`.
1906 Returns
1907 -------
1908 uris: DatasetRefURIs
1909 Represents the primary URI or component URIs described by the
1910 inputs.
1912 Raises
1913 ------
1914 RuntimeError
1915 If no file locations are passed in and ``self.trustGetRequest`` is
1916 `False`.
1917 FileNotFoundError
1918 If the a passed-in URI does not exist, and ``self.trustGetRequest``
1919 is `False`.
1920 RuntimeError
1921 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is
1922 unexpected).
1923 """
1924 guessing = False
1925 uris = DatasetRefURIs()
1927 if not file_locations:
1928 if not self.trustGetRequest:
1929 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1930 file_locations = self._get_expected_dataset_locations_info(ref)
1931 guessing = True
1933 if len(file_locations) == 1:
1934 # No disassembly so this is the primary URI
1935 uris.primaryURI = file_locations[0][0].uri
1936 if guessing and not uris.primaryURI.exists():
1937 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1938 else:
1939 for location, file_info in file_locations:
1940 if file_info.component is None:
1941 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1942 if guessing and not location.uri.exists():
1943 # If we are trusting then it is entirely possible for
1944 # some components to be missing. In that case we skip
1945 # to the next component.
1946 if self.trustGetRequest:
1947 continue
1948 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1949 uris.componentURIs[file_info.component] = location.uri
1951 return uris
1953 def retrieveArtifacts(
1954 self,
1955 refs: Iterable[DatasetRef],
1956 destination: ResourcePath,
1957 transfer: str = "auto",
1958 preserve_path: bool = True,
1959 overwrite: bool = False,
1960 ) -> list[ResourcePath]:
1961 """Retrieve the file artifacts associated with the supplied refs.
1963 Parameters
1964 ----------
1965 refs : iterable of `DatasetRef`
1966 The datasets for which file artifacts are to be retrieved.
1967 A single ref can result in multiple files. The refs must
1968 be resolved.
1969 destination : `lsst.resources.ResourcePath`
1970 Location to write the file artifacts.
1971 transfer : `str`, optional
1972 Method to use to transfer the artifacts. Must be one of the options
1973 supported by `lsst.resources.ResourcePath.transfer_from()`.
1974 "move" is not allowed.
1975 preserve_path : `bool`, optional
1976 If `True` the full path of the file artifact within the datastore
1977 is preserved. If `False` the final file component of the path
1978 is used.
1979 overwrite : `bool`, optional
1980 If `True` allow transfers to overwrite existing files at the
1981 destination.
1983 Returns
1984 -------
1985 targets : `list` of `lsst.resources.ResourcePath`
1986 URIs of file artifacts in destination location. Order is not
1987 preserved.
1988 """
1989 if not destination.isdir():
1990 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1992 if transfer == "move":
1993 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1995 # Source -> Destination
1996 # This also helps filter out duplicate DatasetRef in the request
1997 # that will map to the same underlying file transfer.
1998 to_transfer: dict[ResourcePath, ResourcePath] = {}
2000 for ref in refs:
2001 locations = self._get_dataset_locations_info(ref)
2002 for location, _ in locations:
2003 source_uri = location.uri
2004 target_path: ResourcePathExpression
2005 if preserve_path:
2006 target_path = location.pathInStore
2007 if target_path.isabs():
2008 # This is an absolute path to an external file.
2009 # Use the full path.
2010 target_path = target_path.relativeToPathRoot
2011 else:
2012 target_path = source_uri.basename()
2013 target_uri = destination.join(target_path)
2014 to_transfer[source_uri] = target_uri
2016 # In theory can now parallelize the transfer
2017 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
2018 for source_uri, target_uri in to_transfer.items():
2019 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
2021 return list(to_transfer.values())
2023 def get(
2024 self,
2025 ref: DatasetRef,
2026 parameters: Mapping[str, Any] | None = None,
2027 storageClass: StorageClass | str | None = None,
2028 ) -> Any:
2029 """Load an InMemoryDataset from the store.
2031 Parameters
2032 ----------
2033 ref : `DatasetRef`
2034 Reference to the required Dataset.
2035 parameters : `dict`
2036 `StorageClass`-specific parameters that specify, for example,
2037 a slice of the dataset to be loaded.
2038 storageClass : `StorageClass` or `str`, optional
2039 The storage class to be used to override the Python type
2040 returned by this method. By default the returned type matches
2041 the dataset type definition for this dataset. Specifying a
2042 read `StorageClass` can force a different type to be returned.
2043 This type must be compatible with the original type.
2045 Returns
2046 -------
2047 inMemoryDataset : `object`
2048 Requested dataset or slice thereof as an InMemoryDataset.
2050 Raises
2051 ------
2052 FileNotFoundError
2053 Requested dataset can not be retrieved.
2054 TypeError
2055 Return value from formatter has unexpected type.
2056 ValueError
2057 Formatter failed to process the dataset.
2058 """
2059 # Supplied storage class for the component being read is either
2060 # from the ref itself or some an override if we want to force
2061 # type conversion.
2062 if storageClass is not None:
2063 ref = ref.overrideStorageClass(storageClass)
2064 refStorageClass = ref.datasetType.storageClass
2066 allGetInfo = self._prepare_for_get(ref, parameters)
2067 refComponent = ref.datasetType.component()
2069 # Create mapping from component name to related info
2070 allComponents = {i.component: i for i in allGetInfo}
2072 # By definition the dataset is disassembled if we have more
2073 # than one record for it.
2074 isDisassembled = len(allGetInfo) > 1
2076 # Look for the special case where we are disassembled but the
2077 # component is a derived component that was not written during
2078 # disassembly. For this scenario we need to check that the
2079 # component requested is listed as a derived component for the
2080 # composite storage class
2081 isDisassembledReadOnlyComponent = False
2082 if isDisassembled and refComponent:
2083 # The composite storage class should be accessible through
2084 # the component dataset type
2085 compositeStorageClass = ref.datasetType.parentStorageClass
2087 # In the unlikely scenario where the composite storage
2088 # class is not known, we can only assume that this is a
2089 # normal component. If that assumption is wrong then the
2090 # branch below that reads a persisted component will fail
2091 # so there is no need to complain here.
2092 if compositeStorageClass is not None:
2093 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2095 if isDisassembled and not refComponent:
2096 # This was a disassembled dataset spread over multiple files
2097 # and we need to put them all back together again.
2098 # Read into memory and then assemble
2100 # Check that the supplied parameters are suitable for the type read
2101 refStorageClass.validateParameters(parameters)
2103 # We want to keep track of all the parameters that were not used
2104 # by formatters. We assume that if any of the component formatters
2105 # use a parameter that we do not need to apply it again in the
2106 # assembler.
2107 usedParams = set()
2109 components: dict[str, Any] = {}
2110 for getInfo in allGetInfo:
2111 # assemblerParams are parameters not understood by the
2112 # associated formatter.
2113 usedParams.update(set(getInfo.formatterParams))
2115 component = getInfo.component
2117 if component is None:
2118 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2120 # We do not want the formatter to think it's reading
2121 # a component though because it is really reading a
2122 # standalone dataset -- always tell reader it is not a
2123 # component.
2124 components[component] = self._read_artifact_into_memory(
2125 getInfo, ref.makeComponentRef(component), isComponent=False
2126 )
2128 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2130 # Any unused parameters will have to be passed to the assembler
2131 if parameters:
2132 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2133 else:
2134 unusedParams = {}
2136 # Process parameters
2137 return ref.datasetType.storageClass.delegate().handleParameters(
2138 inMemoryDataset, parameters=unusedParams
2139 )
2141 elif isDisassembledReadOnlyComponent:
2142 compositeStorageClass = ref.datasetType.parentStorageClass
2143 if compositeStorageClass is None:
2144 raise RuntimeError(
2145 f"Unable to retrieve derived component '{refComponent}' since"
2146 "no composite storage class is available."
2147 )
2149 if refComponent is None:
2150 # Mainly for mypy
2151 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2153 # Assume that every derived component can be calculated by
2154 # forwarding the request to a single read/write component.
2155 # Rather than guessing which rw component is the right one by
2156 # scanning each for a derived component of the same name,
2157 # we ask the storage class delegate directly which one is best to
2158 # use.
2159 compositeDelegate = compositeStorageClass.delegate()
2160 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2161 refComponent, set(allComponents)
2162 )
2164 # Select the relevant component
2165 rwInfo = allComponents[forwardedComponent]
2167 # For now assume that read parameters are validated against
2168 # the real component and not the requested component
2169 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2170 forwardedStorageClass.validateParameters(parameters)
2172 # The reference to use for the caching must refer to the forwarded
2173 # component and not the derived component.
2174 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2176 # Unfortunately the FileDescriptor inside the formatter will have
2177 # the wrong write storage class so we need to create a new one
2178 # given the immutability constraint.
2179 writeStorageClass = rwInfo.info.storageClass
2181 # We may need to put some thought into parameters for read
2182 # components but for now forward them on as is
2183 readFormatter = type(rwInfo.formatter)(
2184 FileDescriptor(
2185 rwInfo.location,
2186 readStorageClass=refStorageClass,
2187 storageClass=writeStorageClass,
2188 parameters=parameters,
2189 ),
2190 ref.dataId,
2191 )
2193 # The assembler can not receive any parameter requests for a
2194 # derived component at this time since the assembler will
2195 # see the storage class of the derived component and those
2196 # parameters will have to be handled by the formatter on the
2197 # forwarded storage class.
2198 assemblerParams: dict[str, Any] = {}
2200 # Need to created a new info that specifies the derived
2201 # component and associated storage class
2202 readInfo = DatastoreFileGetInformation(
2203 rwInfo.location,
2204 readFormatter,
2205 rwInfo.info,
2206 assemblerParams,
2207 {},
2208 refComponent,
2209 refStorageClass,
2210 )
2212 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2214 else:
2215 # Single file request or component from that composite file
2216 for lookup in (refComponent, None):
2217 if lookup in allComponents:
2218 getInfo = allComponents[lookup]
2219 break
2220 else:
2221 raise FileNotFoundError(
2222 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2223 )
2225 # Do not need the component itself if already disassembled
2226 if isDisassembled:
2227 isComponent = False
2228 else:
2229 isComponent = getInfo.component is not None
2231 # For a component read of a composite we want the cache to
2232 # be looking at the composite ref itself.
2233 cache_ref = ref.makeCompositeRef() if isComponent else ref
2235 # For a disassembled component we can validate parametersagainst
2236 # the component storage class directly
2237 if isDisassembled:
2238 refStorageClass.validateParameters(parameters)
2239 else:
2240 # For an assembled composite this could be a derived
2241 # component derived from a real component. The validity
2242 # of the parameters is not clear. For now validate against
2243 # the composite storage class
2244 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2246 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
2248 @transactional
2249 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2250 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2252 Parameters
2253 ----------
2254 inMemoryDataset : `object`
2255 The dataset to store.
2256 ref : `DatasetRef`
2257 Reference to the associated Dataset.
2259 Raises
2260 ------
2261 TypeError
2262 Supplied object and storage class are inconsistent.
2263 DatasetTypeNotSupportedError
2264 The associated `DatasetType` is not handled by this datastore.
2266 Notes
2267 -----
2268 If the datastore is configured to reject certain dataset types it
2269 is possible that the put will fail and raise a
2270 `DatasetTypeNotSupportedError`. The main use case for this is to
2271 allow `ChainedDatastore` to put to multiple datastores without
2272 requiring that every datastore accepts the dataset.
2273 """
2274 doDisassembly = self.composites.shouldBeDisassembled(ref)
2275 # doDisassembly = True
2277 artifacts = []
2278 if doDisassembly:
2279 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2280 if components is None:
2281 raise RuntimeError(
2282 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2283 f"with storage class {ref.datasetType.storageClass.name} "
2284 "is configured to be disassembled, but cannot be."
2285 )
2286 for component, componentInfo in components.items():
2287 # Don't recurse because we want to take advantage of
2288 # bulk insert -- need a new DatasetRef that refers to the
2289 # same dataset_id but has the component DatasetType
2290 # DatasetType does not refer to the types of components
2291 # So we construct one ourselves.
2292 compRef = ref.makeComponentRef(component)
2293 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2294 artifacts.append((compRef, storedInfo))
2295 else:
2296 # Write the entire thing out
2297 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2298 artifacts.append((ref, storedInfo))
2300 self._register_datasets(artifacts)
2302 @transactional
2303 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2304 # At this point can safely remove these datasets from the cache
2305 # to avoid confusion later on. If they are not trashed later
2306 # the cache will simply be refilled.
2307 self.cacheManager.remove_from_cache(ref)
2309 # If we are in trust mode there will be nothing to move to
2310 # the trash table and we will have to try to delete the file
2311 # immediately.
2312 if self.trustGetRequest:
2313 # Try to keep the logic below for a single file trash.
2314 if isinstance(ref, DatasetRef):
2315 refs = {ref}
2316 else:
2317 # Will recreate ref at the end of this branch.
2318 refs = set(ref)
2320 # Determine which datasets are known to datastore directly.
2321 id_to_ref = {ref.id: ref for ref in refs}
2322 existing_ids = self._get_stored_records_associated_with_refs(refs)
2323 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2325 missing = refs - existing_refs
2326 if missing:
2327 # Do an explicit existence check on these refs.
2328 # We only care about the artifacts at this point and not
2329 # the dataset existence.
2330 artifact_existence: dict[ResourcePath, bool] = {}
2331 _ = self.mexists(missing, artifact_existence)
2332 uris = [uri for uri, exists in artifact_existence.items() if exists]
2334 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2335 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2336 for uri in uris:
2337 try:
2338 uri.remove()
2339 except Exception as e:
2340 if ignore_errors:
2341 log.debug("Artifact %s could not be removed: %s", uri, e)
2342 continue
2343 raise
2345 # There is no point asking the code below to remove refs we
2346 # know are missing so update it with the list of existing
2347 # records. Try to retain one vs many logic.
2348 if not existing_refs:
2349 # Nothing more to do since none of the datasets were
2350 # known to the datastore record table.
2351 return
2352 ref = list(existing_refs)
2353 if len(ref) == 1:
2354 ref = ref[0]
2356 # Get file metadata and internal metadata
2357 if not isinstance(ref, DatasetRef):
2358 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2359 # Assumed to be an iterable of refs so bulk mode enabled.
2360 try:
2361 self.bridge.moveToTrash(ref, transaction=self._transaction)
2362 except Exception as e:
2363 if ignore_errors:
2364 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2365 else:
2366 raise
2367 return
2369 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2371 fileLocations = self._get_dataset_locations_info(ref)
2373 if not fileLocations:
2374 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2375 if ignore_errors:
2376 log.warning(err_msg)
2377 return
2378 else:
2379 raise FileNotFoundError(err_msg)
2381 for location, storedFileInfo in fileLocations:
2382 if not self._artifact_exists(location):
2383 err_msg = (
2384 f"Dataset is known to datastore {self.name} but "
2385 f"associated artifact ({location.uri}) is missing"
2386 )
2387 if ignore_errors:
2388 log.warning(err_msg)
2389 return
2390 else:
2391 raise FileNotFoundError(err_msg)
2393 # Mark dataset as trashed
2394 try:
2395 self.bridge.moveToTrash([ref], transaction=self._transaction)
2396 except Exception as e:
2397 if ignore_errors:
2398 log.warning(
2399 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2400 "but encountered an error: %s",
2401 ref,
2402 self.name,
2403 e,
2404 )
2405 pass
2406 else:
2407 raise
2409 @transactional
2410 def emptyTrash(self, ignore_errors: bool = True) -> None:
2411 """Remove all datasets from the trash.
2413 Parameters
2414 ----------
2415 ignore_errors : `bool`
2416 If `True` return without error even if something went wrong.
2417 Problems could occur if another process is simultaneously trying
2418 to delete.
2419 """
2420 log.debug("Emptying trash in datastore %s", self.name)
2422 # Context manager will empty trash iff we finish it without raising.
2423 # It will also automatically delete the relevant rows from the
2424 # trash table and the records table.
2425 with self.bridge.emptyTrash(
2426 self._table, record_class=StoredFileInfo, record_column="path"
2427 ) as trash_data:
2428 # Removing the artifacts themselves requires that the files are
2429 # not also associated with refs that are not to be trashed.
2430 # Therefore need to do a query with the file paths themselves
2431 # and return all the refs associated with them. Can only delete
2432 # a file if the refs to be trashed are the only refs associated
2433 # with the file.
2434 # This requires multiple copies of the trashed items
2435 trashed, artifacts_to_keep = trash_data
2437 if artifacts_to_keep is None:
2438 # The bridge is not helping us so have to work it out
2439 # ourselves. This is not going to be as efficient.
2440 trashed = list(trashed)
2442 # The instance check is for mypy since up to this point it
2443 # does not know the type of info.
2444 path_map = self._refs_associated_with_artifacts(
2445 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2446 )
2448 for ref, info in trashed:
2449 # Mypy needs to know this is not the base class
2450 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2452 path_map[info.path].remove(ref.id)
2453 if not path_map[info.path]:
2454 del path_map[info.path]
2456 artifacts_to_keep = set(path_map)
2458 for ref, info in trashed:
2459 # Should not happen for this implementation but need
2460 # to keep mypy happy.
2461 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2463 # Mypy needs to know this is not the base class
2464 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2466 if info.path in artifacts_to_keep:
2467 # This is a multi-dataset artifact and we are not
2468 # removing all associated refs.
2469 continue
2471 # Only trashed refs still known to datastore will be returned.
2472 location = info.file_location(self.locationFactory)
2474 # Point of no return for this artifact
2475 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2476 try:
2477 self._delete_artifact(location)
2478 except FileNotFoundError:
2479 # If the file itself has been deleted there is nothing
2480 # we can do about it. It is possible that trash has
2481 # been run in parallel in another process or someone
2482 # decided to delete the file. It is unlikely to come
2483 # back and so we should still continue with the removal
2484 # of the entry from the trash table. It is also possible
2485 # we removed it in a previous iteration if it was
2486 # a multi-dataset artifact. The delete artifact method
2487 # will log a debug message in this scenario.
2488 # Distinguishing file missing before trash started and
2489 # file already removed previously as part of this trash
2490 # is not worth the distinction with regards to potential
2491 # memory cost.
2492 pass
2493 except Exception as e:
2494 if ignore_errors:
2495 # Use a debug message here even though it's not
2496 # a good situation. In some cases this can be
2497 # caused by a race between user A and user B
2498 # and neither of them has permissions for the
2499 # other's files. Butler does not know about users
2500 # and trash has no idea what collections these
2501 # files were in (without guessing from a path).
2502 log.debug(
2503 "Encountered error removing artifact %s from datastore %s: %s",
2504 location.uri,
2505 self.name,
2506 e,
2507 )
2508 else:
2509 raise
2511 @transactional
2512 def transfer_from(
2513 self,
2514 source_datastore: Datastore,
2515 refs: Iterable[DatasetRef],
2516 transfer: str = "auto",
2517 artifact_existence: dict[ResourcePath, bool] | None = None,
2518 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2519 # Docstring inherited
2520 if type(self) is not type(source_datastore):
2521 raise TypeError(
2522 f"Datastore mismatch between this datastore ({type(self)}) and the "
2523 f"source datastore ({type(source_datastore)})."
2524 )
2526 # Be explicit for mypy
2527 if not isinstance(source_datastore, FileDatastore):
2528 raise TypeError(
2529 "Can only transfer to a FileDatastore from another FileDatastore, not"
2530 f" {type(source_datastore)}"
2531 )
2533 # Stop early if "direct" transfer mode is requested. That would
2534 # require that the URI inside the source datastore should be stored
2535 # directly in the target datastore, which seems unlikely to be useful
2536 # since at any moment the source datastore could delete the file.
2537 if transfer in ("direct", "split"):
2538 raise ValueError(
2539 f"Can not transfer from a source datastore using {transfer} mode since"
2540 " those files are controlled by the other datastore."
2541 )
2543 # Empty existence lookup if none given.
2544 if artifact_existence is None:
2545 artifact_existence = {}
2547 # We will go through the list multiple times so must convert
2548 # generators to lists.
2549 refs = list(refs)
2551 # In order to handle disassembled composites the code works
2552 # at the records level since it can assume that internal APIs
2553 # can be used.
2554 # - If the record already exists in the destination this is assumed
2555 # to be okay.
2556 # - If there is no record but the source and destination URIs are
2557 # identical no transfer is done but the record is added.
2558 # - If the source record refers to an absolute URI currently assume
2559 # that that URI should remain absolute and will be visible to the
2560 # destination butler. May need to have a flag to indicate whether
2561 # the dataset should be transferred. This will only happen if
2562 # the detached Butler has had a local ingest.
2564 # What we really want is all the records in the source datastore
2565 # associated with these refs. Or derived ones if they don't exist
2566 # in the source.
2567 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2569 # The source dataset_ids are the keys in these records
2570 source_ids = set(source_records)
2571 log.debug("Number of datastore records found in source: %d", len(source_ids))
2573 requested_ids = {ref.id for ref in refs}
2574 missing_ids = requested_ids - source_ids
2576 # Missing IDs can be okay if that datastore has allowed
2577 # gets based on file existence. Should we transfer what we can
2578 # or complain about it and warn?
2579 if missing_ids and not source_datastore.trustGetRequest:
2580 raise ValueError(
2581 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2582 )
2584 # Need to map these missing IDs to a DatasetRef so we can guess
2585 # the details.
2586 if missing_ids:
2587 log.info(
2588 "Number of expected datasets missing from source datastore records: %d out of %d",
2589 len(missing_ids),
2590 len(requested_ids),
2591 )
2592 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2594 # This should be chunked in case we end up having to check
2595 # the file store since we need some log output to show
2596 # progress.
2597 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2598 records = {}
2599 for missing in missing_ids_chunk:
2600 # Ask the source datastore where the missing artifacts
2601 # should be. An execution butler might not know about the
2602 # artifacts even if they are there.
2603 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2604 records[missing] = [info for _, info in expected]
2606 # Call the mexist helper method in case we have not already
2607 # checked these artifacts such that artifact_existence is
2608 # empty. This allows us to benefit from parallelism.
2609 # datastore.mexists() itself does not give us access to the
2610 # derived datastore record.
2611 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2612 ref_exists = source_datastore._process_mexists_records(
2613 id_to_ref, records, False, artifact_existence=artifact_existence
2614 )
2616 # Now go through the records and propagate the ones that exist.
2617 location_factory = source_datastore.locationFactory
2618 for missing, record_list in records.items():
2619 # Skip completely if the ref does not exist.
2620 ref = id_to_ref[missing]
2621 if not ref_exists[ref]:
2622 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2623 continue
2624 # Check for file artifact to decide which parts of a
2625 # disassembled composite do exist. If there is only a
2626 # single record we don't even need to look because it can't
2627 # be a composite and must exist.
2628 if len(record_list) == 1:
2629 dataset_records = record_list
2630 else:
2631 dataset_records = [
2632 record
2633 for record in record_list
2634 if artifact_existence[record.file_location(location_factory).uri]
2635 ]
2636 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2638 # Rely on source_records being a defaultdict.
2639 source_records[missing].extend(dataset_records)
2641 # See if we already have these records
2642 target_records = self._get_stored_records_associated_with_refs(refs)
2644 # The artifacts to register
2645 artifacts = []
2647 # Refs that already exist
2648 already_present = []
2650 # Refs that were rejected by this datastore.
2651 rejected = set()
2653 # Refs that were transferred successfully.
2654 accepted = set()
2656 # Record each time we have done a "direct" transfer.
2657 direct_transfers = []
2659 # Now can transfer the artifacts
2660 for ref in refs:
2661 if not self.constraints.isAcceptable(ref):
2662 # This datastore should not be accepting this dataset.
2663 rejected.add(ref)
2664 continue
2666 accepted.add(ref)
2668 if ref.id in target_records:
2669 # Already have an artifact for this.
2670 already_present.append(ref)
2671 continue
2673 # mypy needs to know these are always resolved refs
2674 for info in source_records[ref.id]:
2675 source_location = info.file_location(source_datastore.locationFactory)
2676 target_location = info.file_location(self.locationFactory)
2677 if source_location == target_location and not source_location.pathInStore.isabs():
2678 # Artifact is already in the target location.
2679 # (which is how execution butler currently runs)
2680 pass
2681 else:
2682 if target_location.pathInStore.isabs():
2683 # Just because we can see the artifact when running
2684 # the transfer doesn't mean it will be generally
2685 # accessible to a user of this butler. Need to decide
2686 # what to do about an absolute path.
2687 if transfer == "auto":
2688 # For "auto" transfers we allow the absolute URI
2689 # to be recorded in the target datastore.
2690 direct_transfers.append(source_location)
2691 else:
2692 # The user is explicitly requesting a transfer
2693 # even for an absolute URI. This requires us to
2694 # calculate the target path.
2695 template_ref = ref
2696 if info.component:
2697 template_ref = ref.makeComponentRef(info.component)
2698 target_location = self._calculate_ingested_datastore_name(
2699 source_location.uri,
2700 template_ref,
2701 )
2703 info = info.update(path=target_location.pathInStore.path)
2705 # Need to transfer it to the new location.
2706 # Assume we should always overwrite. If the artifact
2707 # is there this might indicate that a previous transfer
2708 # was interrupted but was not able to be rolled back
2709 # completely (eg pre-emption) so follow Datastore default
2710 # and overwrite.
2711 target_location.uri.transfer_from(
2712 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2713 )
2715 artifacts.append((ref, info))
2717 if direct_transfers:
2718 log.info(
2719 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2720 len(direct_transfers),
2721 "" if len(direct_transfers) == 1 else "s",
2722 )
2724 self._register_datasets(artifacts)
2726 if already_present:
2727 n_skipped = len(already_present)
2728 log.info(
2729 "Skipped transfer of %d dataset%s already present in datastore",
2730 n_skipped,
2731 "" if n_skipped == 1 else "s",
2732 )
2734 return accepted, rejected
2736 @transactional
2737 def forget(self, refs: Iterable[DatasetRef]) -> None:
2738 # Docstring inherited.
2739 refs = list(refs)
2740 self.bridge.forget(refs)
2741 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2743 def validateConfiguration(
2744 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2745 ) -> None:
2746 """Validate some of the configuration for this datastore.
2748 Parameters
2749 ----------
2750 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2751 Entities to test against this configuration. Can be differing
2752 types.
2753 logFailures : `bool`, optional
2754 If `True`, output a log message for every validation error
2755 detected.
2757 Raises
2758 ------
2759 DatastoreValidationError
2760 Raised if there is a validation problem with a configuration.
2761 All the problems are reported in a single exception.
2763 Notes
2764 -----
2765 This method checks that all the supplied entities have valid file
2766 templates and also have formatters defined.
2767 """
2768 templateFailed = None
2769 try:
2770 self.templates.validateTemplates(entities, logFailures=logFailures)
2771 except FileTemplateValidationError as e:
2772 templateFailed = str(e)
2774 formatterFailed = []
2775 for entity in entities:
2776 try:
2777 self.formatterFactory.getFormatterClass(entity)
2778 except KeyError as e:
2779 formatterFailed.append(str(e))
2780 if logFailures:
2781 log.critical("Formatter failure: %s", e)
2783 if templateFailed or formatterFailed:
2784 messages = []
2785 if templateFailed:
2786 messages.append(templateFailed)
2787 if formatterFailed:
2788 messages.append(",".join(formatterFailed))
2789 msg = ";\n".join(messages)
2790 raise DatastoreValidationError(msg)
2792 def getLookupKeys(self) -> set[LookupKey]:
2793 # Docstring is inherited from base class
2794 return (
2795 self.templates.getLookupKeys()
2796 | self.formatterFactory.getLookupKeys()
2797 | self.constraints.getLookupKeys()
2798 )
2800 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2801 # Docstring is inherited from base class
2802 # The key can be valid in either formatters or templates so we can
2803 # only check the template if it exists
2804 if lookupKey in self.templates:
2805 try:
2806 self.templates[lookupKey].validateTemplate(entity)
2807 except FileTemplateValidationError as e:
2808 raise DatastoreValidationError(e) from e
2810 def export(
2811 self,
2812 refs: Iterable[DatasetRef],
2813 *,
2814 directory: ResourcePathExpression | None = None,
2815 transfer: str | None = "auto",
2816 ) -> Iterable[FileDataset]:
2817 # Docstring inherited from Datastore.export.
2818 if transfer == "auto" and directory is None:
2819 transfer = None
2821 if transfer is not None and directory is None:
2822 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2824 if transfer == "move":
2825 raise TypeError("Can not export by moving files out of datastore.")
2826 elif transfer == "direct":
2827 # For an export, treat this as equivalent to None. We do not
2828 # want an import to risk using absolute URIs to datasets owned
2829 # by another datastore.
2830 log.info("Treating 'direct' transfer mode as in-place export.")
2831 transfer = None
2833 # Force the directory to be a URI object
2834 directoryUri: ResourcePath | None = None
2835 if directory is not None:
2836 directoryUri = ResourcePath(directory, forceDirectory=True)
2838 if transfer is not None and directoryUri is not None:
2839 # mypy needs the second test
2840 if not directoryUri.exists():
2841 raise FileNotFoundError(f"Export location {directory} does not exist")
2843 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2844 for ref in progress.wrap(refs, "Exporting dataset files"):
2845 fileLocations = self._get_dataset_locations_info(ref)
2846 if not fileLocations:
2847 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2848 # For now we can not export disassembled datasets
2849 if len(fileLocations) > 1:
2850 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2851 location, storedFileInfo = fileLocations[0]
2853 pathInStore = location.pathInStore.path
2854 if transfer is None:
2855 # TODO: do we also need to return the readStorageClass somehow?
2856 # We will use the path in store directly. If this is an
2857 # absolute URI, preserve it.
2858 if location.pathInStore.isabs():
2859 pathInStore = str(location.uri)
2860 elif transfer == "direct":
2861 # Use full URIs to the remote store in the export
2862 pathInStore = str(location.uri)
2863 else:
2864 # mypy needs help
2865 assert directoryUri is not None, "directoryUri must be defined to get here"
2866 storeUri = ResourcePath(location.uri)
2868 # if the datastore has an absolute URI to a resource, we
2869 # have two options:
2870 # 1. Keep the absolute URI in the exported YAML
2871 # 2. Allocate a new name in the local datastore and transfer
2872 # it.
2873 # For now go with option 2
2874 if location.pathInStore.isabs():
2875 template = self.templates.getTemplate(ref)
2876 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2877 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2879 exportUri = directoryUri.join(pathInStore)
2880 exportUri.transfer_from(storeUri, transfer=transfer)
2882 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2884 @staticmethod
2885 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2886 """Compute the checksum of the supplied file.
2888 Parameters
2889 ----------
2890 uri : `lsst.resources.ResourcePath`
2891 Name of resource to calculate checksum from.
2892 algorithm : `str`, optional
2893 Name of algorithm to use. Must be one of the algorithms supported
2894 by :py:class`hashlib`.
2895 block_size : `int`
2896 Number of bytes to read from file at one time.
2898 Returns
2899 -------
2900 hexdigest : `str`
2901 Hex digest of the file.
2903 Notes
2904 -----
2905 Currently returns None if the URI is for a remote resource.
2906 """
2907 if algorithm not in hashlib.algorithms_guaranteed:
2908 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2910 if not uri.isLocal:
2911 return None
2913 hasher = hashlib.new(algorithm)
2915 with uri.as_local() as local_uri:
2916 with open(local_uri.ospath, "rb") as f:
2917 for chunk in iter(lambda: f.read(block_size), b""):
2918 hasher.update(chunk)
2920 return hasher.hexdigest()
2922 def needs_expanded_data_ids(
2923 self,
2924 transfer: str | None,
2925 entity: DatasetRef | DatasetType | StorageClass | None = None,
2926 ) -> bool:
2927 # Docstring inherited.
2928 # This _could_ also use entity to inspect whether the filename template
2929 # involves placeholders other than the required dimensions for its
2930 # dataset type, but that's not necessary for correctness; it just
2931 # enables more optimizations (perhaps only in theory).
2932 return transfer not in ("direct", None)
2934 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2935 # Docstring inherited from the base class.
2936 record_data = data.get(self.name)
2937 if not record_data:
2938 return
2940 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys())
2942 # TODO: Verify that there are no unexpected table names in the dict?
2943 unpacked_records = []
2944 for dataset_data in record_data.records.values():
2945 records = dataset_data.get(self._table.name)
2946 if records:
2947 for info in records:
2948 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2949 unpacked_records.append(info.to_record())
2950 if unpacked_records:
2951 self._table.insert(*unpacked_records, transaction=self._transaction)
2953 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2954 # Docstring inherited from the base class.
2955 exported_refs = list(self._bridge.check(refs))
2956 ids = {ref.id for ref in exported_refs}
2957 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
2958 for row in self._table.fetch(dataset_id=ids):
2959 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2960 dataset_records = records.setdefault(info.dataset_id, {})
2961 dataset_records.setdefault(self._table.name, []).append(info)
2963 record_data = DatastoreRecordData(records=records)
2964 return {self.name: record_data}
2966 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
2967 # Docstring inherited from the base class.
2968 self._retrieve_dataset_method = method
2970 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
2971 """Update dataset reference to use the storage class from registry."""
2972 if self._retrieve_dataset_method is None:
2973 # We could raise an exception here but unit tests do not define
2974 # this method.
2975 return ref
2976 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
2977 if dataset_type is not None:
2978 ref = ref.overrideStorageClass(dataset_type.storageClass)
2979 return ref