Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 8%
974 statements
coverage.py v7.2.7, created at 2023-06-23 09:30 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from collections.abc import Callable, Iterable, Mapping, Sequence
31from dataclasses import dataclass
32from typing import TYPE_CHECKING, Any, ClassVar
34from lsst.daf.butler import (
35 CompositesMap,
36 Config,
37 DatasetId,
38 DatasetRef,
39 DatasetRefURIs,
40 DatasetType,
41 DatasetTypeNotSupportedError,
42 Datastore,
43 DatastoreCacheManager,
44 DatastoreConfig,
45 DatastoreDisabledCacheManager,
46 DatastoreRecordData,
47 DatastoreValidationError,
48 FileDataset,
49 FileDescriptor,
50 FileTemplates,
51 FileTemplateValidationError,
52 Formatter,
53 FormatterFactory,
54 Location,
55 LocationFactory,
56 Progress,
57 StorageClass,
58 StoredDatastoreItemInfo,
59 StoredFileInfo,
60 ddl,
61)
62from lsst.daf.butler.core.repoRelocation import replaceRoot
63from lsst.daf.butler.core.utils import transactional
64from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
65from lsst.resources import ResourcePath, ResourcePathExpression
66from lsst.utils.introspection import get_class_of, get_instance_of
67from lsst.utils.iteration import chunk_iterable
69# For VERBOSE logging usage.
70from lsst.utils.logging import VERBOSE, getLogger
71from lsst.utils.timer import time_this
72from sqlalchemy import BigInteger, String
74from ..registry.interfaces import FakeDatasetRef
75from .genericDatastore import GenericBaseDatastore
77if TYPE_CHECKING:
78 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
79 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
81log = getLogger(__name__)
84class _IngestPrepData(Datastore.IngestPrepData):
85 """Helper class for FileDatastore ingest implementation.
87 Parameters
88 ----------
89 datasets : `~collections.abc.Iterable` of `FileDataset`
90 Files to be ingested by this datastore.
91 """
93 def __init__(self, datasets: Iterable[FileDataset]):
94 super().__init__(ref for dataset in datasets for ref in dataset.refs)
95 self.datasets = datasets
98@dataclass(frozen=True)
99class DatastoreFileGetInformation:
100 """Collection of useful parameters needed to retrieve a file from
101 a Datastore.
102 """
104 location: Location
105 """The location from which to read the dataset."""
107 formatter: Formatter
108 """The `Formatter` to use to deserialize the dataset."""
110 info: StoredFileInfo
111 """Stored information about this file and its formatter."""
113 assemblerParams: Mapping[str, Any]
114 """Parameters to use for post-processing the retrieved dataset."""
116 formatterParams: Mapping[str, Any]
117 """Parameters that were understood by the associated formatter."""
119 component: str | None
120 """The component to be retrieved (can be `None`)."""
122 readStorageClass: StorageClass
123 """The `StorageClass` of the dataset being read."""
126class FileDatastore(GenericBaseDatastore):
127 """Generic Datastore for file-based implementations.
129 Should always be sub-classed since key abstract methods are missing.
131 Parameters
132 ----------
133 config : `DatastoreConfig` or `str`
134 Configuration as either a `Config` object or URI to file.
135 bridgeManager : `DatastoreRegistryBridgeManager`
136 Object that manages the interface between `Registry` and datastores.
137 butlerRoot : `str`, optional
138 New datastore root to use to override the configuration value.
140 Raises
141 ------
142 ValueError
143 If root location does not exist and ``create`` is `False` in the
144 configuration.
145 """
147 defaultConfigFile: ClassVar[str | None] = None
148 """Path to configuration defaults. Accessed within the ``config`` resource
149 or relative to a search path. Can be None if no defaults specified.
150 """
152 root: ResourcePath
153 """Root directory URI of this `Datastore`."""
155 locationFactory: LocationFactory
156 """Factory for creating locations relative to the datastore root."""
158 formatterFactory: FormatterFactory
159 """Factory for creating instances of formatters."""
161 templates: FileTemplates
162 """File templates that can be used by this `Datastore`."""
164 composites: CompositesMap
165 """Determines whether a dataset should be disassembled on put."""
167 defaultConfigFile = "datastores/fileDatastore.yaml"
168 """Path to configuration defaults. Accessed within the ``config`` resource
169 or relative to a search path. Can be None if no defaults specified.
170 """
172 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
173 """Callable that is used in trusted mode to retrieve registry definition
174 of a named dataset type.
175 """
177 @classmethod
178 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
179 """Set any filesystem-dependent config options for this Datastore to
180 be appropriate for a new empty repository with the given root.
182 Parameters
183 ----------
184 root : `str`
185 URI to the root of the data repository.
186 config : `Config`
187 A `Config` to update. Only the subset understood by
188 this component will be updated. Will not expand
189 defaults.
190 full : `Config`
191 A complete config with all defaults expanded that can be
192 converted to a `DatastoreConfig`. Read-only and will not be
193 modified by this method.
194 Repository-specific options that should not be obtained
195 from defaults when Butler instances are constructed
196 should be copied from ``full`` to ``config``.
197 overwrite : `bool`, optional
198 If `False`, do not modify a value in ``config`` if the value
199 already exists. Default is always to overwrite with the provided
200 ``root``.
202 Notes
203 -----
204 If a keyword is explicitly defined in the supplied ``config`` it
205 will not be overridden by this method if ``overwrite`` is `False`.
206 This allows explicit values set in external configs to be retained.
207 """
208 Config.updateParameters(
209 DatastoreConfig,
210 config,
211 full,
212 toUpdate={"root": root},
213 toCopy=("cls", ("records", "table")),
214 overwrite=overwrite,
215 )
217 @classmethod
218 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
219 return ddl.TableSpec(
220 fields=[
221 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
222 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
223 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
224 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
225 # Use empty string to indicate no component
226 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
227 # TODO: should checksum be Base64Bytes instead?
228 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
229 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
230 ],
231 unique=frozenset(),
232 indexes=[ddl.IndexSpec("path")],
233 )
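# Illustrative sketch of the kind of row this table spec describes; the field
# names come from the spec above, but the values here are hypothetical and not
# taken from any real repository.
example_record = {
    "dataset_id": "6f6fcbc4-5f69-4b2c-9d1a-000000000000",  # primary key, together with component
    "path": "raw/r/flat_r_20230623.fits",                  # relative to the datastore root
    "formatter": "some.module.SomeFormatter",              # hypothetical formatter class path
    "storage_class": "ExposureF",
    "component": "",        # empty string means "no component"
    "checksum": None,       # only populated when checksums are enabled
    "file_size": 16773120,  # bytes; -1 when size is not recorded
}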
235 def __init__(
236 self,
237 config: DatastoreConfig | ResourcePathExpression,
238 bridgeManager: DatastoreRegistryBridgeManager,
239 butlerRoot: str | None = None,
240 ):
241 super().__init__(config, bridgeManager)
242 if "root" not in self.config:
243 raise ValueError("No root directory specified in configuration")
245 self._bridgeManager = bridgeManager
247 # Name ourselves either using an explicit name or a name
248 # derived from the (unexpanded) root
249 if "name" in self.config:
250 self.name = self.config["name"]
251 else:
252 # We use the unexpanded root in the name to indicate that this
253 # datastore can be moved without having to update registry.
254 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
256 # Support repository relocation in config
257 # Existence of self.root is checked in subclass
258 self.root = ResourcePath(
259 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
260 )
262 self.locationFactory = LocationFactory(self.root)
263 self.formatterFactory = FormatterFactory()
265 # Now associate formatters with storage classes
266 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
268 # Read the file naming templates
269 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
271 # See if composites should be disassembled
272 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
274 tableName = self.config["records", "table"]
275 try:
276 # Storage of paths and formatters, keyed by dataset_id
277 self._table = bridgeManager.opaque.register(
278 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
279 )
280 # Interface to Registry.
281 self._bridge = bridgeManager.register(self.name)
282 except ReadOnlyDatabaseError:
283 # If the database is read only and we just tried and failed to
284 # create a table, it means someone is trying to create a read-only
285 # butler client for an empty repo. That should be okay, as long
286 # as they then try to get any datasets before some other client
287 # creates the table. Chances are they're just validating
288 # configuration.
289 pass
291 # Determine whether checksums should be used - default to False
292 self.useChecksum = self.config.get("checksum", False)
294 # Determine whether we can fall back to configuration if a
295 # requested dataset is not known to registry
296 self.trustGetRequest = self.config.get("trust_get_request", False)
298 # Create a cache manager
299 self.cacheManager: AbstractDatastoreCacheManager
300 if "cached" in self.config:
301 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
302 else:
303 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
305 # Check existence and create directory structure if necessary
306 if not self.root.exists():
307 if "create" not in self.config or not self.config["create"]:
308 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
309 try:
310 self.root.mkdir()
311 except Exception as e:
312 raise ValueError(
313 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
314 ) from e
316 def __str__(self) -> str:
317 return str(self.root)
319 @property
320 def bridge(self) -> DatastoreRegistryBridge:
321 return self._bridge
323 def _artifact_exists(self, location: Location) -> bool:
324 """Check that an artifact exists in this datastore at the specified
325 location.
327 Parameters
328 ----------
329 location : `Location`
330 Expected location of the artifact associated with this datastore.
332 Returns
333 -------
334 exists : `bool`
335 `True` if the location can be found, `False` otherwise.
336 """
337 log.debug("Checking if resource exists: %s", location.uri)
338 return location.uri.exists()
340 def _delete_artifact(self, location: Location) -> None:
341 """Delete the artifact from the datastore.
343 Parameters
344 ----------
345 location : `Location`
346 Location of the artifact associated with this datastore.
347 """
348 if location.pathInStore.isabs():
349 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
351 try:
352 location.uri.remove()
353 except FileNotFoundError:
354 log.debug("File %s did not exist and so could not be deleted.", location.uri)
355 raise
356 except Exception as e:
357 log.critical("Failed to delete file: %s (%s)", location.uri, e)
358 raise
359 log.debug("Successfully deleted file: %s", location.uri)
361 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
362 # Docstring inherited from GenericBaseDatastore
363 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)]
364 self._table.insert(*records, transaction=self._transaction)
366 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]:
367 # Docstring inherited from GenericBaseDatastore
369 # Look for the dataset_id -- there might be multiple matches
370 # if we have disassembled the dataset.
371 records = self._table.fetch(dataset_id=ref.id)
372 return [StoredFileInfo.from_record(record) for record in records]
374 def _get_stored_records_associated_with_refs(
375 self, refs: Iterable[DatasetIdRef]
376 ) -> dict[DatasetId, list[StoredFileInfo]]:
377 """Retrieve all records associated with the provided refs.
379 Parameters
380 ----------
381 refs : iterable of `DatasetIdRef`
382 The refs for which records are to be retrieved.
384 Returns
385 -------
386 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
387 The matching records indexed by the ref ID. The number of entries
388 in the dict can be smaller than the number of requested refs.
389 """
390 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
392 # Uniqueness is dataset_id + component so can have multiple records
393 # per ref.
394 records_by_ref = defaultdict(list)
395 for record in records:
396 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
397 return records_by_ref
399 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
400 """Return paths and associated dataset refs.
402 Parameters
403 ----------
404 paths : `list` of `str` or `lsst.resources.ResourcePath`
405 All the paths to include in search.
407 Returns
408 -------
409 mapping : `dict` of [`str`, `set` [`DatasetId`]]
410 Mapping of each path to a set of associated database IDs.
411 """
412 records = self._table.fetch(path=[str(path) for path in paths])
413 result = defaultdict(set)
414 for row in records:
415 result[row["path"]].add(row["dataset_id"])
416 return result
418 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
419 """Return all dataset refs associated with the supplied path.
421 Parameters
422 ----------
423 pathInStore : `lsst.resources.ResourcePath`
424 Path of interest in the data store.
426 Returns
427 -------
428 ids : `set` of `DatasetId`
429 All `DatasetRef` IDs associated with this path.
430 """
431 records = list(self._table.fetch(path=str(pathInStore)))
432 ids = {r["dataset_id"] for r in records}
433 return ids
435 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
436 # Docstring inherited from GenericBaseDatastore
437 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
439 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]:
440 r"""Find all the `Location`\ s of the requested dataset in the
441 `Datastore` and the associated stored file information.
443 Parameters
444 ----------
445 ref : `DatasetRef`
446 Reference to the required `Dataset`.
448 Returns
449 -------
450 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
451 Location of the dataset within the datastore and
452 stored information about each file and its formatter.
453 """
454 # Get the file information (this will fail if no file)
455 records = self.getStoredItemsInfo(ref)
457 # Use the path to determine the location -- we need to take
458 # into account absolute URIs in the datastore record
459 return [(r.file_location(self.locationFactory), r) for r in records]
461 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
462 """Check that there is only one dataset associated with the
463 specified artifact.
465 Parameters
466 ----------
467 ref : `DatasetRef` or `FakeDatasetRef`
468 Dataset to be removed.
469 location : `Location`
470 The location of the artifact to be removed.
472 Returns
473 -------
474 can_remove : `bool`
475 True if the artifact can be safely removed.
476 """
477 # Can't ever delete absolute URIs.
478 if location.pathInStore.isabs():
479 return False
481 # Get all entries associated with this path
482 allRefs = self._registered_refs_per_artifact(location.pathInStore)
483 if not allRefs:
484 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
486 # Remove these refs from all the refs and if there is nothing left
487 # then we can delete
488 remainingRefs = allRefs - {ref.id}
490 if remainingRefs:
491 return False
492 return True
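# Standalone sketch of the reference-counting rule used above, with plain
# Python sets and a hypothetical path-to-IDs mapping (not the real table API):
# an artifact may only be deleted when no other dataset still references it.
def can_remove_artifact(path_to_ids: dict[str, set[str]], path: str, dataset_id: str) -> bool:
    """Return True only when no other dataset still references ``path``."""
    all_ids = path_to_ids.get(path)
    if not all_ids:
        raise RuntimeError(f"Inconsistency: {path} has no registered datasets")
    return not (all_ids - {dataset_id})

# e.g. an artifact shared by two datasets cannot be removed for just one of them:
assert can_remove_artifact({"a/b.fits": {"id1"}}, "a/b.fits", "id1") is True
assert can_remove_artifact({"a/b.fits": {"id1", "id2"}}, "a/b.fits", "id1") is False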
494 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
495 """Predict the location and related file information of the requested
496 dataset in this datastore.
498 Parameters
499 ----------
500 ref : `DatasetRef`
501 Reference to the required `Dataset`.
503 Returns
504 -------
505 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
506 Expected Location of the dataset within the datastore and
507 placeholder information about each file and its formatter.
509 Notes
510 -----
511 Uses the current configuration to determine how we would expect the
512 datastore files to have been written if we couldn't ask registry.
513 This is safe so long as there has been no change to datastore
514 configuration between writing the dataset and wanting to read it.
515 Will not work for files that have been ingested without using the
516 standard file template or default formatter.
517 """
519 # If we have a component ref we always need to ask the questions
520 # of the composite. If the composite is disassembled this routine
521 # should return all components. If the composite was not
522 # disassembled the composite is what is stored regardless of
523 # component request. Note that if the caller has disassembled
524 # a composite there is no way for this guess to know that
525 # without trying both the composite and component ref and seeing
526 # if there is something at the component Location even without
527 # disassembly being enabled.
528 if ref.datasetType.isComponent():
529 ref = ref.makeCompositeRef()
531 # See if the ref is a composite that should be disassembled
532 doDisassembly = self.composites.shouldBeDisassembled(ref)
534 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
536 if doDisassembly:
537 for component, componentStorage in ref.datasetType.storageClass.components.items():
538 compRef = ref.makeComponentRef(component)
539 location, formatter = self._determine_put_formatter_location(compRef)
540 all_info.append((location, formatter, componentStorage, component))
542 else:
543 # Always use the composite ref if no disassembly
544 location, formatter = self._determine_put_formatter_location(ref)
545 all_info.append((location, formatter, ref.datasetType.storageClass, None))
547 # Convert the list of tuples to have StoredFileInfo as second element
548 return [
549 (
550 location,
551 StoredFileInfo(
552 formatter=formatter,
553 path=location.pathInStore.path,
554 storageClass=storageClass,
555 component=component,
556 checksum=None,
557 file_size=-1,
558 dataset_id=ref.id,
559 ),
560 )
561 for location, formatter, storageClass, component in all_info
562 ]
564 def _prepare_for_get(
565 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
566 ) -> list[DatastoreFileGetInformation]:
567 """Check parameters for ``get`` and obtain formatter and
568 location.
570 Parameters
571 ----------
572 ref : `DatasetRef`
573 Reference to the required Dataset.
574 parameters : `dict`
575 `StorageClass`-specific parameters that specify, for example,
576 a slice of the dataset to be loaded.
578 Returns
579 -------
580 getInfo : `list` [`DatastoreFileGetInformation`]
581 Parameters needed to retrieve each file.
582 """
583 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
585 # The storage class we want to use eventually
586 refStorageClass = ref.datasetType.storageClass
588 # For trusted mode need to reset storage class.
589 ref = self._cast_storage_class(ref)
591 # Get file metadata and internal metadata
592 fileLocations = self._get_dataset_locations_info(ref)
593 if not fileLocations:
594 if not self.trustGetRequest:
595 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
596 # Assume the dataset is where we think it should be
597 fileLocations = self._get_expected_dataset_locations_info(ref)
599 if len(fileLocations) > 1:
600 disassembled = True
602 # If trust is involved it is possible that there will be
603 # components listed here that do not exist in the datastore.
604 # Explicitly check for file artifact existence and filter out any
605 # that are missing.
606 if self.trustGetRequest:
607 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
609 # For now complain only if we have no components at all. One
610 # component is probably a problem but we can punt that to the
611 # assembler.
612 if not fileLocations:
613 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
615 else:
616 disassembled = False
618 # Is this a component request?
619 refComponent = ref.datasetType.component()
621 fileGetInfo = []
622 for location, storedFileInfo in fileLocations:
623 # The storage class used to write the file
624 writeStorageClass = storedFileInfo.storageClass
626 # If this has been disassembled we need read to match the write
627 if disassembled:
628 readStorageClass = writeStorageClass
629 else:
630 readStorageClass = refStorageClass
632 formatter = get_instance_of(
633 storedFileInfo.formatter,
634 FileDescriptor(
635 location,
636 readStorageClass=readStorageClass,
637 storageClass=writeStorageClass,
638 parameters=parameters,
639 ),
640 ref.dataId,
641 )
643 formatterParams, notFormatterParams = formatter.segregateParameters()
645 # Of the remaining parameters, extract the ones supported by
646 # this StorageClass (for components not all will be handled)
647 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
649 # The ref itself could be a component if the dataset was
650 # disassembled by butler, or we disassembled in datastore and
651 # components came from the datastore records
652 component = storedFileInfo.component if storedFileInfo.component else refComponent
654 fileGetInfo.append(
655 DatastoreFileGetInformation(
656 location,
657 formatter,
658 storedFileInfo,
659 assemblerParams,
660 formatterParams,
661 component,
662 readStorageClass,
663 )
664 )
666 return fileGetInfo
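# Minimal sketch of the parameter segregation step above: parameters the
# formatter understands are peeled off first and the remainder go to the
# assembler. ``FORMATTER_PARAMS`` and the helper are hypothetical, not butler API.
FORMATTER_PARAMS = {"bbox", "origin"}

def segregate_parameters(parameters: dict | None) -> tuple[dict, dict]:
    parameters = parameters or {}
    formatter_params = {k: v for k, v in parameters.items() if k in FORMATTER_PARAMS}
    assembler_params = {k: v for k, v in parameters.items() if k not in FORMATTER_PARAMS}
    return formatter_params, assembler_params

# segregate_parameters({"bbox": (0, 10), "slice": 3}) -> ({"bbox": (0, 10)}, {"slice": 3})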
668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
669 """Check the arguments for ``put`` and obtain formatter and
670 location.
672 Parameters
673 ----------
674 inMemoryDataset : `object`
675 The dataset to store.
676 ref : `DatasetRef`
677 Reference to the associated Dataset.
679 Returns
680 -------
681 location : `Location`
682 The location to write the dataset.
683 formatter : `Formatter`
684 The `Formatter` to use to write the dataset.
686 Raises
687 ------
688 TypeError
689 Supplied object and storage class are inconsistent.
690 DatasetTypeNotSupportedError
691 The associated `DatasetType` is not handled by this datastore.
692 """
693 self._validate_put_parameters(inMemoryDataset, ref)
694 return self._determine_put_formatter_location(ref)
696 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
697 """Calculate the formatter and output location to use for put.
699 Parameters
700 ----------
701 ref : `DatasetRef`
702 Reference to the associated Dataset.
704 Returns
705 -------
706 location : `Location`
707 The location to write the dataset.
708 formatter : `Formatter`
709 The `Formatter` to use to write the dataset.
710 """
711 # Work out output file name
712 try:
713 template = self.templates.getTemplate(ref)
714 except KeyError as e:
715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
717 # Validate the template to protect against filenames from different
718 # dataIds returning the same and causing overwrite confusion.
719 template.validateTemplate(ref)
721 location = self.locationFactory.fromPath(template.format(ref))
723 # Get the formatter based on the storage class
724 storageClass = ref.datasetType.storageClass
725 try:
726 formatter = self.formatterFactory.getFormatter(
727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
728 )
729 except KeyError as e:
730 raise DatasetTypeNotSupportedError(
731 f"Unable to find formatter for {ref} in datastore {self.name}"
732 ) from e
734 # Now that we know the formatter, update the location
735 location = formatter.makeUpdatedLocation(location)
737 return location, formatter
739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
740 # Docstring inherited from base class
741 if transfer != "auto":
742 return transfer
744 # See if the paths are within the datastore or not
745 inside = [self._pathInStore(d.path) is not None for d in datasets]
747 if all(inside):
748 transfer = None
749 elif not any(inside):
750 # Allow ResourcePath to use its own knowledge
751 transfer = "auto"
752 else:
753 # This can happen when importing from a datastore that
754 # has had some datasets ingested using "direct" mode.
755 # Allow ResourcePath to sort it out, but warn about it.
758 log.warning(
759 "Some datasets are inside the datastore and some are outside. Using 'split' "
760 "transfer mode. This assumes that the files outside the datastore are "
761 "still accessible to the new butler since they will not be copied into "
762 "the target datastore."
763 )
764 transfer = "split"
766 return transfer
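# Standalone sketch of the "auto" transfer-mode decision above: ``inside``
# holds one boolean per dataset path (True when the path is already inside
# the datastore root). The helper name is illustrative only.
def resolve_auto_transfer(inside: list[bool]) -> str | None:
    if all(inside):
        return None      # already in place, nothing to transfer
    if not any(inside):
        return "auto"    # let ResourcePath pick a concrete mode
    return "split"       # mixed: ingest in place where possible, reference the rest directly

# resolve_auto_transfer([True, True])   -> None
# resolve_auto_transfer([False, False]) -> "auto"
# resolve_auto_transfer([True, False])  -> "split"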
768 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
769 """Return path relative to datastore root
771 Parameters
772 ----------
773 path : `lsst.resources.ResourcePathExpression`
774 Path to dataset. Can be an absolute URI. If relative, it is
775 assumed to be relative to the datastore root.
778 Returns
779 -------
780 inStore : `str`
781 Path relative to datastore root. Returns `None` if the file is
782 outside the root.
783 """
784 # Relative path will always be relative to datastore
785 pathUri = ResourcePath(path, forceAbsolute=False)
786 return pathUri.relative_to(self.root)
788 def _standardizeIngestPath(
789 self, path: str | ResourcePath, *, transfer: str | None = None
790 ) -> str | ResourcePath:
791 """Standardize the path of a to-be-ingested file.
793 Parameters
794 ----------
795 path : `str` or `lsst.resources.ResourcePath`
796 Path of a file to be ingested. This parameter is not expected
797 to support all the types that can be used to construct a
798 `~lsst.resources.ResourcePath`.
799 transfer : `str`, optional
800 How (and whether) the dataset should be added to the datastore.
801 See `ingest` for details of transfer modes.
802 This implementation is provided only so
803 `NotImplementedError` can be raised if the mode is not supported;
804 actual transfers are deferred to `_extractIngestInfo`.
806 Returns
807 -------
808 path : `str` or `lsst.resources.ResourcePath`
809 New path in what the datastore considers standard form. If an
810 absolute URI was given that will be returned unchanged.
812 Notes
813 -----
814 Subclasses of `FileDatastore` can implement this method instead
815 of `_prepIngest`. It should not modify the data repository or given
816 file in any way.
818 Raises
819 ------
820 NotImplementedError
821 Raised if the datastore does not support the given transfer mode
822 (including the case where ingest is not supported at all).
823 FileNotFoundError
824 Raised if one of the given files does not exist.
825 """
826 if transfer not in (None, "direct", "split") + self.root.transferModes:
827 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
829 # A relative URI indicates relative to datastore root
830 srcUri = ResourcePath(path, forceAbsolute=False)
831 if not srcUri.isabs():
832 srcUri = self.root.join(path)
834 if not srcUri.exists():
835 raise FileNotFoundError(
836 f"Resource at {srcUri} does not exist; note that paths to ingest "
837 f"are assumed to be relative to {self.root} unless they are absolute."
838 )
840 if transfer is None:
841 relpath = srcUri.relative_to(self.root)
842 if not relpath:
843 raise RuntimeError(
844 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
845 )
847 # Return the relative path within the datastore for internal
848 # transfer
849 path = relpath
851 return path
853 def _extractIngestInfo(
854 self,
855 path: ResourcePathExpression,
856 ref: DatasetRef,
857 *,
858 formatter: Formatter | type[Formatter],
859 transfer: str | None = None,
860 record_validation_info: bool = True,
861 ) -> StoredFileInfo:
862 """Relocate (if necessary) and extract `StoredFileInfo` from a
863 to-be-ingested file.
865 Parameters
866 ----------
867 path : `lsst.resources.ResourcePathExpression`
868 URI or path of a file to be ingested.
869 ref : `DatasetRef`
870 Reference for the dataset being ingested. Guaranteed to have
871 ``dataset_id is not None``.
872 formatter : `type` or `Formatter`
873 `Formatter` subclass to use for this dataset or an instance.
874 transfer : `str`, optional
875 How (and whether) the dataset should be added to the datastore.
876 See `ingest` for details of transfer modes.
877 record_validation_info : `bool`, optional
878 If `True`, the default, the datastore can record validation
879 information associated with the file. If `False` the datastore
880 will not attempt to track any information such as checksums
881 or file sizes. This can be useful if such information is tracked
882 in an external system or if the file is to be compressed in place.
883 It is up to the datastore whether this parameter is relevant.
885 Returns
886 -------
887 info : `StoredFileInfo`
888 Internal datastore record for this file. This will be inserted by
889 the caller; the `_extractIngestInfo` is only responsible for
890 creating and populating the struct.
892 Raises
893 ------
894 FileNotFoundError
895 Raised if one of the given files does not exist.
896 FileExistsError
897 Raised if transfer is not `None` but the (internal) location the
898 file would be moved to is already occupied.
899 """
900 if self._transaction is None:
901 raise RuntimeError("Ingest called without transaction enabled")
903 # Create URI of the source path, do not need to force a relative
904 # path to absolute.
905 srcUri = ResourcePath(path, forceAbsolute=False)
907 # Track whether we have read the size of the source yet
908 have_sized = False
910 tgtLocation: Location | None
911 if transfer is None or transfer == "split":
912 # A relative path is assumed to be relative to the datastore
913 # in this context
914 if not srcUri.isabs():
915 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
916 else:
917 # Work out the path in the datastore from an absolute URI
918 # This is required to be within the datastore.
919 pathInStore = srcUri.relative_to(self.root)
920 if pathInStore is None and transfer is None:
921 raise RuntimeError(
922 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
923 )
924 if pathInStore:
925 tgtLocation = self.locationFactory.fromPath(pathInStore)
926 elif transfer == "split":
927 # Outside the datastore but treat that as a direct ingest
928 # instead.
929 tgtLocation = None
930 else:
931 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
932 elif transfer == "direct":
933 # Want to store the full URI to the resource directly in
934 # datastore. This is useful for referring to permanent archive
935 # storage for raw data.
936 # Trust that people know what they are doing.
937 tgtLocation = None
938 else:
939 # Work out the name we want this ingested file to have
940 # inside the datastore
941 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
942 if not tgtLocation.uri.dirname().exists():
943 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
944 tgtLocation.uri.dirname().mkdir()
946 # if we are transferring from a local file to a remote location
947 # it may be more efficient to get the size and checksum of the
948 # local file rather than the transferred one
949 if record_validation_info and srcUri.isLocal:
950 size = srcUri.size()
951 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
952 have_sized = True
954 # Transfer the resource to the destination.
955 # Allow overwrite of an existing file. This matches the behavior
956 # of datastore.put() in that it trusts that registry would not
957 # be asking to overwrite unless registry thought that the
958 # overwrite was allowed.
959 tgtLocation.uri.transfer_from(
960 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
961 )
963 if tgtLocation is None:
964 # This means we are using direct mode
965 targetUri = srcUri
966 targetPath = str(srcUri)
967 else:
968 targetUri = tgtLocation.uri
969 targetPath = tgtLocation.pathInStore.path
971 # the file should exist in the datastore now
972 if record_validation_info:
973 if not have_sized:
974 size = targetUri.size()
975 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
976 else:
977 # Not recording any file information.
978 size = -1
979 checksum = None
981 return StoredFileInfo(
982 formatter=formatter,
983 path=targetPath,
984 storageClass=ref.datasetType.storageClass,
985 component=ref.datasetType.component(),
986 file_size=size,
987 checksum=checksum,
988 dataset_id=ref.id,
989 )
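# ``computeChecksum`` is referenced above but not shown in this excerpt. The
# following is a minimal standalone sketch of a chunked file checksum; it is
# not necessarily the real implementation.
import hashlib  # already imported at the top of this module; repeated so the sketch stands alone

def sketch_compute_checksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    """Hash a file in fixed-size chunks so large files never load fully into memory."""
    hasher = hashlib.new(algorithm)
    with open(filename, "rb") as fd:
        for chunk in iter(lambda: fd.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()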
991 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
992 # Docstring inherited from Datastore._prepIngest.
993 filtered = []
994 for dataset in datasets:
995 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
996 if not acceptable:
997 continue
998 else:
999 dataset.refs = acceptable
1000 if dataset.formatter is None:
1001 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1002 else:
1003 assert isinstance(dataset.formatter, (type, str))
1004 formatter_class = get_class_of(dataset.formatter)
1005 if not issubclass(formatter_class, Formatter):
1006 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1007 dataset.formatter = formatter_class
1008 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1009 filtered.append(dataset)
1010 return _IngestPrepData(filtered)
1012 @transactional
1013 def _finishIngest(
1014 self,
1015 prepData: Datastore.IngestPrepData,
1016 *,
1017 transfer: str | None = None,
1018 record_validation_info: bool = True,
1019 ) -> None:
1020 # Docstring inherited from Datastore._finishIngest.
1021 refsAndInfos = []
1022 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1023 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1024 # Do ingest as if the first dataset ref is associated with the file
1025 info = self._extractIngestInfo(
1026 dataset.path,
1027 dataset.refs[0],
1028 formatter=dataset.formatter,
1029 transfer=transfer,
1030 record_validation_info=record_validation_info,
1031 )
1032 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1033 self._register_datasets(refsAndInfos)
1035 def _calculate_ingested_datastore_name(
1036 self,
1037 srcUri: ResourcePath,
1038 ref: DatasetRef,
1039 formatter: Formatter | type[Formatter] | None = None,
1040 ) -> Location:
1041 """Given a source URI and a DatasetRef, determine the name the
1042 dataset will have inside datastore.
1044 Parameters
1045 ----------
1046 srcUri : `lsst.resources.ResourcePath`
1047 URI to the source dataset file.
1048 ref : `DatasetRef`
1049 Ref associated with the newly-ingested dataset artifact. This
1050 is used to determine the name within the datastore.
1051 formatter : `Formatter` or Formatter class.
1052 Formatter to use for validation. Can be a class or an instance.
1053 No validation of the file extension is performed if the
1054 ``formatter`` is `None`. This can be used if the caller knows
1055 that the source URI and target URI will use the same formatter.
1057 Returns
1058 -------
1059 location : `Location`
1060 Target location for the newly-ingested dataset.
1061 """
1062 # Ingesting a file from outside the datastore.
1063 # This involves a new name.
1064 template = self.templates.getTemplate(ref)
1065 location = self.locationFactory.fromPath(template.format(ref))
1067 # Get the extension
1068 ext = srcUri.getExtension()
1070 # Update the destination to include that extension
1071 location.updateExtension(ext)
1073 # Ask the formatter to validate this extension
1074 if formatter is not None:
1075 formatter.validateExtension(location)
1077 return location
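# Sketch of the extension handling above: the templated name inside the
# datastore keeps the extension of the source file being ingested.
# ``template_path`` here is just a string, not a butler FileTemplate.
import posixpath

def sketch_ingested_name(template_path: str, src_path: str) -> str:
    # Note: the real code uses ResourcePath.getExtension(), which also handles
    # multi-part extensions such as ".fits.gz"; splitext() does not.
    ext = posixpath.splitext(src_path)[1]
    return posixpath.splitext(template_path)[0] + ext

# sketch_ingested_name("raw/r/flat_r_0001", "/staging/flat0001.fits") -> "raw/r/flat_r_0001.fits"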
1079 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1080 """Write out in memory dataset to datastore.
1082 Parameters
1083 ----------
1084 inMemoryDataset : `object`
1085 Dataset to write to datastore.
1086 ref : `DatasetRef`
1087 Registry information associated with this dataset.
1089 Returns
1090 -------
1091 info : `StoredFileInfo`
1092 Information describing the artifact written to the datastore.
1093 """
1094 # May need to coerce the in memory dataset to the correct
1095 # python type, but first we need to make sure the storage class
1096 # reflects the one defined in the data repository.
1097 ref = self._cast_storage_class(ref)
1098 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1100 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1101 uri = location.uri
1103 if not uri.dirname().exists():
1104 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1105 uri.dirname().mkdir()
1107 if self._transaction is None:
1108 raise RuntimeError("Attempting to write artifact without transaction enabled")
1110 def _removeFileExists(uri: ResourcePath) -> None:
1111 """Remove a file and do not complain if it is not there.
1113 This is important since a formatter might fail before the file
1114 is written and we should not confuse people by writing spurious
1115 error messages to the log.
1116 """
1117 try:
1118 uri.remove()
1119 except FileNotFoundError:
1120 pass
1122 # Register a callback to try to delete the uploaded data if
1123 # something fails below
1124 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1126 data_written = False
1127 if not uri.isLocal:
1128 # This is a remote URI. Some datasets can be serialized directly
1129 # to bytes and sent to the remote datastore without writing a
1130 # file. If the dataset is intended to be saved to the cache
1131 # a file is always written and direct write to the remote
1132 # datastore is bypassed.
1133 if not self.cacheManager.should_be_cached(ref):
1134 try:
1135 serializedDataset = formatter.toBytes(inMemoryDataset)
1136 except NotImplementedError:
1137 # Fallback to the file writing option.
1138 pass
1139 except Exception as e:
1140 raise RuntimeError(
1141 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1142 ) from e
1143 else:
1144 log.debug("Writing bytes directly to %s", uri)
1145 uri.write(serializedDataset, overwrite=True)
1146 log.debug("Successfully wrote bytes directly to %s", uri)
1147 data_written = True
1149 if not data_written:
1150 # Did not write the bytes directly to object store so instead
1151 # write to temporary file. Always write to a temporary even if
1152 # using a local file system -- that gives us atomic writes.
1153 # If a process is killed as the file is being written we do not
1154 # want it to remain in the correct place but in corrupt state.
1155 # For local files write to the output directory not temporary dir.
1156 prefix = uri.dirname() if uri.isLocal else None
1157 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1158 # Need to configure the formatter to write to a different
1159 # location and that needs us to overwrite internals
1160 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1161 with formatter._updateLocation(Location(None, temporary_uri)):
1162 try:
1163 formatter.write(inMemoryDataset)
1164 except Exception as e:
1165 raise RuntimeError(
1166 f"Failed to serialize dataset {ref} of type"
1167 f" {type(inMemoryDataset)} to "
1168 f"temporary location {temporary_uri}"
1169 ) from e
1171 # Use move for a local file since that becomes an efficient
1172 # os.rename. For remote resources we use copy to allow the
1173 # file to be cached afterwards.
1174 transfer = "move" if uri.isLocal else "copy"
1176 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1178 if transfer == "copy":
1179 # Cache if required
1180 self.cacheManager.move_to_cache(temporary_uri, ref)
1182 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1184 # URI is needed to resolve what ingest case are we dealing with
1185 return self._extractIngestInfo(uri, ref, formatter=formatter)
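# Standalone sketch of the "write to a temporary file, then move" pattern used
# above for local writes, using only the standard library (os.replace is an
# atomic rename when source and destination share a filesystem).
import os
import tempfile

def sketch_atomic_write(payload: bytes, destination: str) -> None:
    directory = os.path.dirname(destination) or "."
    handle, tmp_path = tempfile.mkstemp(dir=directory)  # same directory => same filesystem
    try:
        with os.fdopen(handle, "wb") as tmp_file:
            tmp_file.write(payload)
        os.replace(tmp_path, destination)  # atomic swap into the final location
    except BaseException:
        os.unlink(tmp_path)  # never leave a half-written file behind
        raise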
1187 def _read_artifact_into_memory(
1188 self,
1189 getInfo: DatastoreFileGetInformation,
1190 ref: DatasetRef,
1191 isComponent: bool = False,
1192 cache_ref: DatasetRef | None = None,
1193 ) -> Any:
1194 """Read the artifact from datastore into in memory object.
1196 Parameters
1197 ----------
1198 getInfo : `DatastoreFileGetInformation`
1199 Information about the artifact within the datastore.
1200 ref : `DatasetRef`
1201 The registry information associated with this artifact.
1202 isComponent : `bool`
1203 Flag to indicate if a component is being read from this artifact.
1204 cache_ref : `DatasetRef`, optional
1205 The DatasetRef to use when looking up the file in the cache.
1206 This ref must have the same ID as the supplied ref but can
1207 be a parent ref or component ref to indicate to the cache whether
1208 a composite file is being requested from the cache or a component
1209 file. Without this the cache will default to the supplied ref but
1210 it can get confused with read-only derived components for
1211 disassembled composites.
1213 Returns
1214 -------
1215 inMemoryDataset : `object`
1216 The artifact as a python object.
1217 """
1218 location = getInfo.location
1219 uri = location.uri
1220 log.debug("Accessing data from %s", uri)
1222 if cache_ref is None:
1223 cache_ref = ref
1224 if cache_ref.id != ref.id:
1225 raise ValueError(
1226 "The supplied cache dataset ref refers to a different dataset than expected:"
1227 f" {ref.id} != {cache_ref.id}"
1228 )
1230 # Cannot recalculate checksum but can compare size as a quick check
1231 # Do not do this if the size is negative since that indicates
1232 # we do not know.
1233 recorded_size = getInfo.info.file_size
1234 resource_size = uri.size()
1235 if recorded_size >= 0 and resource_size != recorded_size:
1236 raise RuntimeError(
1237 "Integrity failure in Datastore. "
1238 f"Size of file {uri} ({resource_size}) "
1239 f"does not match size recorded in registry of {recorded_size}"
1240 )
1242 # For the general case we have choices for how to proceed.
1243 # 1. Always use a local file (downloading the remote resource to a
1244 # temporary file if needed).
1245 # 2. Use a threshold size and read into memory and use bytes.
1246 # Use both for now with an arbitrary hand off size.
1247 # This allows small datasets to be downloaded from remote object
1248 # stores without requiring a temporary file.
1250 formatter = getInfo.formatter
1251 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1252 if resource_size <= nbytes_max and formatter.can_read_bytes():
1253 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1254 if cached_file is not None:
1255 desired_uri = cached_file
1256 msg = f" (cached version of {uri})"
1257 else:
1258 desired_uri = uri
1259 msg = ""
1260 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1261 serializedDataset = desired_uri.read()
1262 log.debug(
1263 "Deserializing %s from %d bytes from location %s with formatter %s",
1264 f"component {getInfo.component}" if isComponent else "",
1265 len(serializedDataset),
1266 uri,
1267 formatter.name(),
1268 )
1269 try:
1270 result = formatter.fromBytes(
1271 serializedDataset, component=getInfo.component if isComponent else None
1272 )
1273 except Exception as e:
1274 raise ValueError(
1275 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1276 f" ({ref.datasetType.name} from {uri}): {e}"
1277 ) from e
1278 else:
1279 # Read from file.
1281 # Have to update the Location associated with the formatter
1282 # because formatter.read does not allow an override.
1283 # This could be improved.
1284 location_updated = False
1285 msg = ""
1287 # First check in cache for local version.
1288 # The cache will only be relevant for remote resources but
1289 # no harm in always asking. Context manager ensures that cache
1290 # file is not deleted during cache expiration.
1291 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1292 if cached_file is not None:
1293 msg = f"(via cache read of remote file {uri})"
1294 uri = cached_file
1295 location_updated = True
1297 with uri.as_local() as local_uri:
1298 can_be_cached = False
1299 if uri != local_uri:
1300 # URI was remote and file was downloaded
1301 cache_msg = ""
1302 location_updated = True
1304 if self.cacheManager.should_be_cached(cache_ref):
1305 # In this scenario we want to ask if the downloaded
1306 # file should be cached but we should not cache
1307 # it until after we've used it (to ensure it can't
1308 # be expired whilst we are using it).
1309 can_be_cached = True
1311 # Say that it is "likely" to be cached because
1312 # if the formatter read fails we will not be
1313 # caching this file.
1314 cache_msg = " and likely cached"
1316 msg = f"(via download to local file{cache_msg})"
1318 # Calculate the (possibly) new location for the formatter
1319 # to use.
1320 newLocation = Location(*local_uri.split()) if location_updated else None
1322 log.debug(
1323 "Reading%s from location %s %s with formatter %s",
1324 f" component {getInfo.component}" if isComponent else "",
1325 uri,
1326 msg,
1327 formatter.name(),
1328 )
1329 try:
1330 with formatter._updateLocation(newLocation):
1331 with time_this(
1332 log,
1333 msg="Reading%s from location %s %s with formatter %s",
1334 args=(
1335 f" component {getInfo.component}" if isComponent else "",
1336 uri,
1337 msg,
1338 formatter.name(),
1339 ),
1340 ):
1341 result = formatter.read(component=getInfo.component if isComponent else None)
1342 except Exception as e:
1343 raise ValueError(
1344 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1345 f" ({ref.datasetType.name} from {uri}): {e}"
1346 ) from e
1348 # File was read successfully so can move to cache
1349 if can_be_cached:
1350 self.cacheManager.move_to_cache(local_uri, cache_ref)
1352 return self._post_process_get(
1353 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
1354 )
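# Sketch of the small-file fast path above: below an arbitrary size threshold,
# and only when the formatter can deserialize from bytes, the artifact is read
# straight into memory instead of being staged as a local temporary file.
# The helper and its arguments are illustrative, not butler API.
NBYTES_MAX = 10_000_000  # same arbitrary hand-off size used above

def sketch_use_direct_bytes(resource_size: int, formatter_can_read_bytes: bool) -> bool:
    return resource_size <= NBYTES_MAX and formatter_can_read_bytes

# sketch_use_direct_bytes(4_096, True)      -> True  (read bytes, no temporary file)
# sketch_use_direct_bytes(50_000_000, True) -> False (download / use a local file)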
1356 def knows(self, ref: DatasetRef) -> bool:
1357 """Check if the dataset is known to the datastore.
1359 Does not check for existence of any artifact.
1361 Parameters
1362 ----------
1363 ref : `DatasetRef`
1364 Reference to the required dataset.
1366 Returns
1367 -------
1368 exists : `bool`
1369 `True` if the dataset is known to the datastore.
1370 """
1371 fileLocations = self._get_dataset_locations_info(ref)
1372 if fileLocations:
1373 return True
1374 return False
1376 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1377 # Docstring inherited from the base class.
1379 # The records themselves. Could be missing some entries.
1380 records = self._get_stored_records_associated_with_refs(refs)
1382 return {ref: ref.id in records for ref in refs}
1384 def _process_mexists_records(
1385 self,
1386 id_to_ref: dict[DatasetId, DatasetRef],
1387 records: dict[DatasetId, list[StoredFileInfo]],
1388 all_required: bool,
1389 artifact_existence: dict[ResourcePath, bool] | None = None,
1390 ) -> dict[DatasetRef, bool]:
1391 """Helper function for mexists that checks the given records.
1393 Parameters
1394 ----------
1395 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1396 Mapping of the dataset ID to the dataset ref itself.
1397 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1398 Records as generally returned by
1399 ``_get_stored_records_associated_with_refs``.
1400 all_required : `bool`
1401 Flag to indicate whether existence requires all artifacts
1402 associated with a dataset ID to exist or not for existence.
1403 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1404 Optional mapping of datastore artifact to existence. Updated by
1405 this method with details of all artifacts tested. Can be `None`
1406 if the caller is not interested.
1408 Returns
1409 -------
1410 existence : `dict` of [`DatasetRef`, `bool`]
1411 Mapping from dataset to boolean indicating existence.
1412 """
1413 # The URIs to be checked and a mapping of those URIs to
1414 # the dataset ID.
1415 uris_to_check: list[ResourcePath] = []
1416 location_map: dict[ResourcePath, DatasetId] = {}
1418 location_factory = self.locationFactory
1420 uri_existence: dict[ResourcePath, bool] = {}
1421 for ref_id, infos in records.items():
1422 # Key is the dataset Id, value is list of StoredItemInfo
1423 uris = [info.file_location(location_factory).uri for info in infos]
1424 location_map.update({uri: ref_id for uri in uris})
1426 # Check the local cache directly for a dataset corresponding
1427 # to the remote URI.
1428 if self.cacheManager.file_count > 0:
1429 ref = id_to_ref[ref_id]
1430 for uri, storedFileInfo in zip(uris, infos):
1431 check_ref = ref
1432 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1433 check_ref = ref.makeComponentRef(component)
1434 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1435 # Proxy for URI existence.
1436 uri_existence[uri] = True
1437 else:
1438 uris_to_check.append(uri)
1439 else:
1440 # Check all of them.
1441 uris_to_check.extend(uris)
1443 if artifact_existence is not None:
1444 # If a URI has already been checked remove it from the list
1445 # and immediately add the status to the output dict.
1446 filtered_uris_to_check = []
1447 for uri in uris_to_check:
1448 if uri in artifact_existence:
1449 uri_existence[uri] = artifact_existence[uri]
1450 else:
1451 filtered_uris_to_check.append(uri)
1452 uris_to_check = filtered_uris_to_check
1454 # Results.
1455 dataset_existence: dict[DatasetRef, bool] = {}
1457 uri_existence.update(ResourcePath.mexists(uris_to_check))
1458 for uri, exists in uri_existence.items():
1459 dataset_id = location_map[uri]
1460 ref = id_to_ref[dataset_id]
1462 # Disassembled composite needs to check all locations.
1463 # all_required indicates whether all need to exist or not.
1464 if ref in dataset_existence:
1465 if all_required:
1466 exists = dataset_existence[ref] and exists
1467 else:
1468 exists = dataset_existence[ref] or exists
1469 dataset_existence[ref] = exists
1471 if artifact_existence is not None:
1472 artifact_existence.update(uri_existence)
1474 return dataset_existence
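# Sketch of how per-artifact existence is folded into per-dataset existence
# above: a disassembled composite maps one dataset to several URIs, and
# ``all_required`` selects AND versus OR semantics. Names are illustrative.
def sketch_fold_existence(per_artifact: dict[str, list[bool]], all_required: bool) -> dict[str, bool]:
    combine = all if all_required else any
    return {dataset_id: combine(flags) for dataset_id, flags in per_artifact.items()}

# sketch_fold_existence({"d1": [True, False]}, all_required=True)  -> {"d1": False}
# sketch_fold_existence({"d1": [True, False]}, all_required=False) -> {"d1": True}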
1476 def mexists(
1477 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1478 ) -> dict[DatasetRef, bool]:
1479 """Check the existence of multiple datasets at once.
1481 Parameters
1482 ----------
1483 refs : iterable of `DatasetRef`
1484 The datasets to be checked.
1485 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1486 Optional mapping of datastore artifact to existence. Updated by
1487 this method with details of all artifacts tested. Can be `None`
1488 if the caller is not interested.
1490 Returns
1491 -------
1492 existence : `dict` of [`DatasetRef`, `bool`]
1493 Mapping from dataset to boolean indicating existence.
1495 Notes
1496 -----
1497 To minimize potentially costly remote existence checks, the local
1498 cache is checked as a proxy for existence. If a file for this
1499 `DatasetRef` exists in the cache, no check is done for the actual URI. This
1500 could result in possibly unexpected behavior if the dataset itself
1501 has been removed from the datastore by another process whilst it is
1502 still in the cache.
1503 """
1504 chunk_size = 10_000
1505 dataset_existence: dict[DatasetRef, bool] = {}
1506 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1507 n_found_total = 0
1508 n_checked = 0
1509 n_chunks = 0
1510 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1511 chunk_result = self._mexists(chunk, artifact_existence)
1513 # The log message level and content depend on how many
1514 # datasets we are processing.
1515 n_results = len(chunk_result)
1517 # Use verbose logging to ensure that messages can be seen
1518 # easily if many refs are being checked.
1519 log_threshold = VERBOSE
1520 n_checked += n_results
1522 # This sum can take some time so only do it if we know the
1523 # result is going to be used.
1524 n_found = 0
1525 if log.isEnabledFor(log_threshold):
1526 # Can treat the booleans as 0, 1 integers and sum them.
1527 n_found = sum(chunk_result.values())
1528 n_found_total += n_found
1530 # We are deliberately not trying to count the number of refs
1531 # provided in case it's in the millions. This means there is a
1532 # situation where the number of refs exactly matches the chunk
1533 # size and we will switch to the multi-chunk path even though
1534 # we only have a single chunk.
1535 if n_results < chunk_size and n_chunks == 0:
1536 # Single chunk will be processed so we can provide more detail.
1537 if n_results == 1:
1538 ref = list(chunk_result)[0]
1539 # Use debug logging to be consistent with `exists()`.
1540 log.debug(
1541 "Calling mexists() with single ref that does%s exist (%s).",
1542 "" if chunk_result[ref] else " not",
1543 ref,
1544 )
1545 else:
1546 # Single chunk but multiple files. Summarize.
1547 log.log(
1548 log_threshold,
1549 "Number of datasets found in datastore: %d out of %d datasets checked.",
1550 n_found,
1551 n_checked,
1552 )
1554 else:
1555 # Use incremental verbose logging when we have multiple chunks.
1556 log.log(
1557 log_threshold,
1558 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1559 "(running total from all chunks so far: %d found out of %d checked)",
1560 n_chunks,
1561 n_found,
1562 n_results,
1563 n_found_total,
1564 n_checked,
1565 )
1566 dataset_existence.update(chunk_result)
1567 n_chunks += 1
1569 return dataset_existence
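# Standalone sketch of the chunked existence check above, using only the
# standard library instead of lsst.utils.iteration.chunk_iterable. The
# per-chunk worker is a stand-in for ``self._mexists``.
from collections.abc import Callable, Iterable, Iterator
from itertools import islice

def sketch_chunks(items: Iterable, chunk_size: int) -> Iterator[list]:
    iterator = iter(items)
    while chunk := list(islice(iterator, chunk_size)):
        yield chunk

def sketch_mexists(refs: Iterable, check_chunk: Callable[[list], dict], chunk_size: int = 10_000) -> dict:
    existence: dict = {}
    for chunk in sketch_chunks(refs, chunk_size):
        existence.update(check_chunk(chunk))
    return existence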
1571 def _mexists(
1572 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1573 ) -> dict[DatasetRef, bool]:
1574 """Check the existence of multiple datasets at once.
1576 Parameters
1577 ----------
1578 refs : iterable of `DatasetRef`
1579 The datasets to be checked.
1580 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1581 Optional mapping of datastore artifact to existence. Updated by
1582 this method with details of all artifacts tested. Can be `None`
1583 if the caller is not interested.
1585 Returns
1586 -------
1587 existence : `dict` of [`DatasetRef`, `bool`]
1588 Mapping from dataset to boolean indicating existence.
1589 """
1590 # Make a mapping from refs with the internal storage class to the given
1591 # refs that may have a different one. We'll use the internal refs
1592 # throughout this method and convert back at the very end.
1593 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1595 # Need a mapping of dataset_id to (internal) dataset ref since some
1596 # internal APIs work with dataset_id.
1597 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1599 # Set of all IDs we are checking for.
1600 requested_ids = set(id_to_ref.keys())
1602 # The records themselves. Could be missing some entries.
1603 records = self._get_stored_records_associated_with_refs(id_to_ref.values())
1605 dataset_existence = self._process_mexists_records(
1606 id_to_ref, records, True, artifact_existence=artifact_existence
1607 )
1609 # Set of IDs that have been handled.
1610 handled_ids = {ref.id for ref in dataset_existence.keys()}
1612 missing_ids = requested_ids - handled_ids
1613 if missing_ids:
1614 dataset_existence.update(
1615 self._mexists_check_expected(
1616 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1617 )
1618 )
1620 return {
1621 internal_ref_to_input_ref[internal_ref]: existence
1622 for internal_ref, existence in dataset_existence.items()
1623 }
1625 def _mexists_check_expected(
1626 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1627 ) -> dict[DatasetRef, bool]:
1628 """Check existence of refs that are not known to datastore.
1630 Parameters
1631 ----------
1632 refs : iterable of `DatasetRef`
1633 The datasets to be checked. These are assumed not to be known
1634 to datastore.
1635 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1636 Optional mapping of datastore artifact to existence. Updated by
1637 this method with details of all artifacts tested. Can be `None`
1638 if the caller is not interested.
1640 Returns
1641 -------
1642 existence : `dict` [`DatasetRef`, `bool`]
1643 Mapping from dataset to boolean indicating existence.
1644 """
1645 dataset_existence: dict[DatasetRef, bool] = {}
1646 if not self.trustGetRequest:
1647 # Must assume these do not exist
1648 for ref in refs:
1649 dataset_existence[ref] = False
1650 else:
1651 log.debug(
1652 "%d datasets were not known to datastore during initial existence check.",
1653 len(refs),
1654 )
1656 # Construct data structure identical to that returned
1657 # by _get_stored_records_associated_with_refs() but using
1658 # guessed names.
1659 records = {}
1660 id_to_ref = {}
1661 for missing_ref in refs:
1662 expected = self._get_expected_dataset_locations_info(missing_ref)
1663 dataset_id = missing_ref.id
1664 records[dataset_id] = [info for _, info in expected]
1665 id_to_ref[dataset_id] = missing_ref
1667 dataset_existence.update(
1668 self._process_mexists_records(
1669 id_to_ref,
1670 records,
1671 False,
1672 artifact_existence=artifact_existence,
1673 )
1674 )
1676 return dataset_existence
1678 def exists(self, ref: DatasetRef) -> bool:
1679 """Check if the dataset exists in the datastore.
1681 Parameters
1682 ----------
1683 ref : `DatasetRef`
1684 Reference to the required dataset.
1686 Returns
1687 -------
1688 exists : `bool`
1689 `True` if the entity exists in the `Datastore`.
1691 Notes
1692 -----
1693 The local cache is checked as a proxy for existence in the remote
1694 object store. It is possible that another process on a different
1695 compute node could remove the file from the object store even
1696 though it is present in the local cache.
1697 """
1698 ref = self._cast_storage_class(ref)
1699 fileLocations = self._get_dataset_locations_info(ref)
1701 # If we are being asked to trust that the registry might not be
1702 # correct, we ask for the expected locations and check them explicitly.
1703 if not fileLocations:
1704 if not self.trustGetRequest:
1705 return False
1707 # First check the cache. If it is not found we must check
1708 # the datastore itself. Assume that any component in the cache
1709 # means that the dataset does exist somewhere.
1710 if self.cacheManager.known_to_cache(ref):
1711 return True
1713 # When we are guessing a dataset location we can not check
1714 # for the existence of every component since we can not
1715 # know if every component was written. Instead we check
1716 # for the existence of any of the expected locations.
1717 for location, _ in self._get_expected_dataset_locations_info(ref):
1718 if self._artifact_exists(location):
1719 return True
1720 return False
1722 # All listed artifacts must exist.
1723 for location, storedFileInfo in fileLocations:
1724 # Checking in cache needs the component ref.
1725 check_ref = ref
1726 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1727 check_ref = ref.makeComponentRef(component)
1728 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1729 continue
1731 if not self._artifact_exists(location):
1732 return False
1734 return True
1736 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1737 """Return URIs associated with dataset.
1739 Parameters
1740 ----------
1741 ref : `DatasetRef`
1742 Reference to the required dataset.
1743 predict : `bool`, optional
1744 If the datastore does not know about the dataset, should it
1745 return a predicted URI or not?
1747 Returns
1748 -------
1749 uris : `DatasetRefURIs`
1750 The URI to the primary artifact associated with this dataset (if
1751 the dataset was disassembled within the datastore this may be
1752 `None`), and the URIs to any components associated with the dataset
1753 artifact (can be empty if there are no components).
1754 """
1755 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1756 return many[ref]
1758 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1759 """URI to the Dataset.
1761 Parameters
1762 ----------
1763 ref : `DatasetRef`
1764 Reference to the required Dataset.
1765 predict : `bool`
1766 If `True`, allow URIs to be returned for datasets that have not
1767 been written.
1769 Returns
1770 -------
1771 uri : `lsst.resources.ResourcePath`
1772 URI pointing to the dataset within the datastore. If the
1773 dataset does not exist in the datastore, and if ``predict`` is
1774 `True`, the URI will be a prediction and will include a URI
1775 fragment "#predicted".
1776 If the datastore does not have entities that relate well
1777 to the concept of a URI the returned URI will be
1778 descriptive. The returned URI is not guaranteed to be obtainable.
1780 Raises
1781 ------
1782 FileNotFoundError
1783 Raised if a URI has been requested for a dataset that does not
1784 exist and guessing is not allowed.
1785 RuntimeError
1786 Raised if a request is made for a single URI but multiple URIs
1787 are associated with this dataset.
1789 Notes
1790 -----
1791 When a predicted URI is requested an attempt will be made to form
1792 a reasonable URI based on file templates and the expected formatter.
1793 """
1794 primary, components = self.getURIs(ref, predict)
1795 if primary is None or components:
1796 raise RuntimeError(
1797 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1798 )
1799 return primary
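# Example: a minimal sketch of URI retrieval, with ``datastore`` and
# ``ref`` as illustrative names. getURI() raises RuntimeError for a
# disassembled dataset, in which case getURIs() provides the component
# URIs instead.
def _example_dataset_uris(datastore, ref):
    try:
        # predict=False: a missing dataset raises FileNotFoundError.
        return [datastore.getURI(ref)]
    except RuntimeError:
        # Disassembled within the datastore: return the component URIs.
        uris = datastore.getURIs(ref)
        return list(uris.componentURIs.values())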
1801 def _predict_URIs(
1802 self,
1803 ref: DatasetRef,
1804 ) -> DatasetRefURIs:
1805 """Predict the URIs of a dataset ref.
1807 Parameters
1808 ----------
1809 ref : `DatasetRef`
1810 Reference to the required Dataset.
1812 Returns
1813 -------
1814 uris : `DatasetRefURIs`
1815 Primary and component URIs. URIs will contain a URI fragment
1816 "#predicted".
1817 """
1818 uris = DatasetRefURIs()
1820 if self.composites.shouldBeDisassembled(ref):
1821 for component, _ in ref.datasetType.storageClass.components.items():
1822 comp_ref = ref.makeComponentRef(component)
1823 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1825 # Add the "#predicted" URI fragment to indicate this is a
1826 # guess
1827 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1829 else:
1830 location, _ = self._determine_put_formatter_location(ref)
1832 # Add the "#predicted" URI fragment to indicate this is a guess
1833 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1835 return uris
1837 def getManyURIs(
1838 self,
1839 refs: Iterable[DatasetRef],
1840 predict: bool = False,
1841 allow_missing: bool = False,
1842 ) -> dict[DatasetRef, DatasetRefURIs]:
1843 # Docstring inherited
1845 uris: dict[DatasetRef, DatasetRefURIs] = {}
1847 records = self._get_stored_records_associated_with_refs(refs)
1848 records_keys = records.keys()
1850 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1851 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1853 # Have to handle trustGetRequest mode by checking for the existence
1854 # of the missing refs on disk.
1855 if missing_refs:
1856 dataset_existence = self._mexists_check_expected(missing_refs, None)
1857 really_missing = set()
1858 not_missing = set()
1859 for ref, exists in dataset_existence.items():
1860 if exists:
1861 not_missing.add(ref)
1862 else:
1863 really_missing.add(ref)
1865 if not_missing:
1866 # Need to recalculate the missing/existing split.
1867 existing_refs = existing_refs + tuple(not_missing)
1868 missing_refs = tuple(really_missing)
1870 for ref in missing_refs:
1871 # if this has never been written then we have to guess
1872 if not predict:
1873 if not allow_missing:
1874 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1875 else:
1876 uris[ref] = self._predict_URIs(ref)
1878 for ref in existing_refs:
1879 file_infos = records[ref.id]
1880 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1881 uris[ref] = self._locations_to_URI(ref, file_locations)
1883 return uris
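# Example: a minimal sketch of bulk URI lookup that tolerates datasets
# which were never written. ``datastore`` and ``refs`` are illustrative
# names for a configured FileDatastore and resolved DatasetRef objects.
def _example_many_uris(datastore, refs):
    # allow_missing=True silently skips unknown datasets instead of
    # raising FileNotFoundError; predict=False suppresses guessed URIs.
    uri_map = datastore.getManyURIs(refs, predict=False, allow_missing=True)
    # Prefer the primary URI; fall back to component URIs when the
    # dataset was disassembled and has no single primary artifact.
    return {ref: uris.primaryURI or dict(uris.componentURIs) for ref, uris in uri_map.items()}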
1885 def _locations_to_URI(
1886 self,
1887 ref: DatasetRef,
1888 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1889 ) -> DatasetRefURIs:
1890 """Convert one or more file locations associated with a DatasetRef
1891 to a DatasetRefURIs.
1893 Parameters
1894 ----------
1895 ref : `DatasetRef`
1896 Reference to the dataset.
1897 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
1898 Each item in the sequence is the location of the dataset within the
1899 datastore and stored information about the file and its formatter.
1900 If there is only one item in the sequence then it is treated as the
1901 primary URI. If there is more than one item then they are treated
1902 as component URIs. If there are no items then an error is raised
1903 unless ``self.trustGetRequest`` is `True`.
1905 Returns
1906 -------
1907 uris : `DatasetRefURIs`
1908 Represents the primary URI or component URIs described by the
1909 inputs.
1911 Raises
1912 ------
1913 RuntimeError
1914 If no file locations are passed in and ``self.trustGetRequest`` is
1915 `False`.
1916 FileNotFoundError
1917 If a passed-in URI does not exist and ``self.trustGetRequest``
1918 is `False`.
1919 RuntimeError
1920 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is
1921 unexpected).
1922 """
1924 guessing = False
1925 uris = DatasetRefURIs()
1927 if not file_locations:
1928 if not self.trustGetRequest:
1929 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1930 file_locations = self._get_expected_dataset_locations_info(ref)
1931 guessing = True
1933 if len(file_locations) == 1:
1934 # No disassembly so this is the primary URI
1935 uris.primaryURI = file_locations[0][0].uri
1936 if guessing and not uris.primaryURI.exists():
1937 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1938 else:
1939 for location, file_info in file_locations:
1940 if file_info.component is None:
1941 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1942 if guessing and not location.uri.exists():
1943 # If we are trusting then it is entirely possible for
1944 # some components to be missing. In that case we skip
1945 # to the next component.
1946 if self.trustGetRequest:
1947 continue
1948 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1949 uris.componentURIs[file_info.component] = location.uri
1951 return uris
1953 def retrieveArtifacts(
1954 self,
1955 refs: Iterable[DatasetRef],
1956 destination: ResourcePath,
1957 transfer: str = "auto",
1958 preserve_path: bool = True,
1959 overwrite: bool = False,
1960 ) -> list[ResourcePath]:
1961 """Retrieve the file artifacts associated with the supplied refs.
1963 Parameters
1964 ----------
1965 refs : iterable of `DatasetRef`
1966 The datasets for which file artifacts are to be retrieved.
1967 A single ref can result in multiple files. The refs must
1968 be resolved.
1969 destination : `lsst.resources.ResourcePath`
1970 Location to write the file artifacts.
1971 transfer : `str`, optional
1972 Method to use to transfer the artifacts. Must be one of the options
1973 supported by `lsst.resources.ResourcePath.transfer_from()`.
1974 "move" is not allowed.
1975 preserve_path : `bool`, optional
1976 If `True` the full path of the file artifact within the datastore
1977 is preserved. If `False` the final file component of the path
1978 is used.
1979 overwrite : `bool`, optional
1980 If `True` allow transfers to overwrite existing files at the
1981 destination.
1983 Returns
1984 -------
1985 targets : `list` of `lsst.resources.ResourcePath`
1986 URIs of file artifacts in destination location. Order is not
1987 preserved.
1988 """
1989 if not destination.isdir():
1990 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1992 if transfer == "move":
1993 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1995 # Source -> Destination
1996 # This also helps filter out duplicate DatasetRef in the request
1997 # that will map to the same underlying file transfer.
1998 to_transfer: dict[ResourcePath, ResourcePath] = {}
2000 for ref in refs:
2001 locations = self._get_dataset_locations_info(ref)
2002 for location, _ in locations:
2003 source_uri = location.uri
2004 target_path: ResourcePathExpression
2005 if preserve_path:
2006 target_path = location.pathInStore
2007 if target_path.isabs():
2008 # This is an absolute path to an external file.
2009 # Use the full path.
2010 target_path = target_path.relativeToPathRoot
2011 else:
2012 target_path = source_uri.basename()
2013 target_uri = destination.join(target_path)
2014 to_transfer[source_uri] = target_uri
2016 # In theory can now parallelize the transfer
2017 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
2018 for source_uri, target_uri in to_transfer.items():
2019 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
2021 return list(to_transfer.values())
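# Example: a minimal sketch of copying file artifacts to a scratch
# directory. ``datastore``, ``refs`` and the destination path are
# illustrative; the destination must be a directory-like location.
def _example_retrieve_artifacts(datastore, refs, scratch="/tmp/artifacts/"):
    destination = ResourcePath(scratch, forceDirectory=True)
    # "move" is rejected; "auto" lets ResourcePath choose a sensible
    # transfer mechanism. preserve_path keeps the in-store layout.
    return datastore.retrieveArtifacts(
        refs, destination, transfer="auto", preserve_path=True, overwrite=False
    )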
2023 def get(
2024 self,
2025 ref: DatasetRef,
2026 parameters: Mapping[str, Any] | None = None,
2027 storageClass: StorageClass | str | None = None,
2028 ) -> Any:
2029 """Load an InMemoryDataset from the store.
2031 Parameters
2032 ----------
2033 ref : `DatasetRef`
2034 Reference to the required Dataset.
2035 parameters : `dict`
2036 `StorageClass`-specific parameters that specify, for example,
2037 a slice of the dataset to be loaded.
2038 storageClass : `StorageClass` or `str`, optional
2039 The storage class to be used to override the Python type
2040 returned by this method. By default the returned type matches
2041 the dataset type definition for this dataset. Specifying a
2042 read `StorageClass` can force a different type to be returned.
2043 This type must be compatible with the original type.
2045 Returns
2046 -------
2047 inMemoryDataset : `object`
2048 Requested dataset or slice thereof as an InMemoryDataset.
2050 Raises
2051 ------
2052 FileNotFoundError
2053 Requested dataset can not be retrieved.
2054 TypeError
2055 Return value from formatter has unexpected type.
2056 ValueError
2057 Formatter failed to process the dataset.
2058 """
2059 # Supplied storage class for the component being read is either
2060 # from the ref itself or from an override if we want to force
2061 # type conversion.
2062 if storageClass is not None:
2063 ref = ref.overrideStorageClass(storageClass)
2064 refStorageClass = ref.datasetType.storageClass
2066 allGetInfo = self._prepare_for_get(ref, parameters)
2067 refComponent = ref.datasetType.component()
2069 # Create mapping from component name to related info
2070 allComponents = {i.component: i for i in allGetInfo}
2072 # By definition the dataset is disassembled if we have more
2073 # than one record for it.
2074 isDisassembled = len(allGetInfo) > 1
2076 # Look for the special case where we are disassembled but the
2077 # component is a derived component that was not written during
2078 # disassembly. For this scenario we need to check that the
2079 # component requested is listed as a derived component for the
2080 # composite storage class
2081 isDisassembledReadOnlyComponent = False
2082 if isDisassembled and refComponent:
2083 # The composite storage class should be accessible through
2084 # the component dataset type
2085 compositeStorageClass = ref.datasetType.parentStorageClass
2087 # In the unlikely scenario where the composite storage
2088 # class is not known, we can only assume that this is a
2089 # normal component. If that assumption is wrong then the
2090 # branch below that reads a persisted component will fail
2091 # so there is no need to complain here.
2092 if compositeStorageClass is not None:
2093 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2095 if isDisassembled and not refComponent:
2096 # This was a disassembled dataset spread over multiple files
2097 # and we need to put them all back together again.
2098 # Read into memory and then assemble
2100 # Check that the supplied parameters are suitable for the type read
2101 refStorageClass.validateParameters(parameters)
2103 # We want to keep track of all the parameters that were not used
2104 # by formatters. We assume that if any of the component formatters
2105 # use a parameter then we do not need to apply it again in the
2106 # assembler.
2107 usedParams = set()
2109 components: dict[str, Any] = {}
2110 for getInfo in allGetInfo:
2111 # assemblerParams are parameters not understood by the
2112 # associated formatter.
2113 usedParams.update(set(getInfo.formatterParams))
2115 component = getInfo.component
2117 if component is None:
2118 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2120 # We do not want the formatter to think it's reading
2121 # a component though because it is really reading a
2122 # standalone dataset -- always tell reader it is not a
2123 # component.
2124 components[component] = self._read_artifact_into_memory(
2125 getInfo, ref.makeComponentRef(component), isComponent=False
2126 )
2128 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2130 # Any unused parameters will have to be passed to the assembler
2131 if parameters:
2132 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2133 else:
2134 unusedParams = {}
2136 # Process parameters
2137 return ref.datasetType.storageClass.delegate().handleParameters(
2138 inMemoryDataset, parameters=unusedParams
2139 )
2141 elif isDisassembledReadOnlyComponent:
2142 compositeStorageClass = ref.datasetType.parentStorageClass
2143 if compositeStorageClass is None:
2144 raise RuntimeError(
2145 f"Unable to retrieve derived component '{refComponent}' since "
2146 "no composite storage class is available."
2147 )
2149 if refComponent is None:
2150 # Mainly for mypy
2151 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2153 # Assume that every derived component can be calculated by
2154 # forwarding the request to a single read/write component.
2155 # Rather than guessing which rw component is the right one by
2156 # scanning each for a derived component of the same name,
2157 # we ask the storage class delegate directly which one is best to
2158 # use.
2159 compositeDelegate = compositeStorageClass.delegate()
2160 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2161 refComponent, set(allComponents)
2162 )
2164 # Select the relevant component
2165 rwInfo = allComponents[forwardedComponent]
2167 # For now assume that read parameters are validated against
2168 # the real component and not the requested component
2169 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2170 forwardedStorageClass.validateParameters(parameters)
2172 # The reference to use for the caching must refer to the forwarded
2173 # component and not the derived component.
2174 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2176 # Unfortunately the FileDescriptor inside the formatter will have
2177 # the wrong write storage class so we need to create a new one
2178 # given the immutability constraint.
2179 writeStorageClass = rwInfo.info.storageClass
2181 # We may need to put some thought into parameters for read
2182 # components but for now forward them on as is
2183 readFormatter = type(rwInfo.formatter)(
2184 FileDescriptor(
2185 rwInfo.location,
2186 readStorageClass=refStorageClass,
2187 storageClass=writeStorageClass,
2188 parameters=parameters,
2189 ),
2190 ref.dataId,
2191 )
2193 # The assembler can not receive any parameter requests for a
2194 # derived component at this time since the assembler will
2195 # see the storage class of the derived component and those
2196 # parameters will have to be handled by the formatter on the
2197 # forwarded storage class.
2198 assemblerParams: dict[str, Any] = {}
2200 # Need to create a new info that specifies the derived
2201 # component and associated storage class
2202 readInfo = DatastoreFileGetInformation(
2203 rwInfo.location,
2204 readFormatter,
2205 rwInfo.info,
2206 assemblerParams,
2207 {},
2208 refComponent,
2209 refStorageClass,
2210 )
2212 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2214 else:
2215 # Single file request or component from that composite file
2216 for lookup in (refComponent, None):
2217 if lookup in allComponents:
2218 getInfo = allComponents[lookup]
2219 break
2220 else:
2221 raise FileNotFoundError(
2222 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2223 )
2225 # Do not need the component itself if already disassembled
2226 if isDisassembled:
2227 isComponent = False
2228 else:
2229 isComponent = getInfo.component is not None
2231 # For a component read of a composite we want the cache to
2232 # be looking at the composite ref itself.
2233 cache_ref = ref.makeCompositeRef() if isComponent else ref
2235 # For a disassembled component we can validate parameters against
2236 # the component storage class directly
2237 if isDisassembled:
2238 refStorageClass.validateParameters(parameters)
2239 else:
2240 # For an assembled composite this could be a derived
2241 # component derived from a real component. The validity
2242 # of the parameters is not clear. For now validate against
2243 # the composite storage class
2244 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2246 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
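# Example: a minimal sketch of reading a dataset back from the datastore.
# ``datastore`` and ``ref`` are illustrative names; ``read_parameters``
# and ``read_storage_class`` are hypothetical caller-supplied values.
def _example_get(datastore, ref, read_parameters=None, read_storage_class=None):
    # parameters are StorageClass-specific (for example a sub-region
    # selection) and storageClass can force a compatible type conversion.
    return datastore.get(ref, parameters=read_parameters, storageClass=read_storage_class)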
2248 @transactional
2249 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2250 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2252 Parameters
2253 ----------
2254 inMemoryDataset : `object`
2255 The dataset to store.
2256 ref : `DatasetRef`
2257 Reference to the associated Dataset.
2259 Raises
2260 ------
2261 TypeError
2262 Supplied object and storage class are inconsistent.
2263 DatasetTypeNotSupportedError
2264 The associated `DatasetType` is not handled by this datastore.
2266 Notes
2267 -----
2268 If the datastore is configured to reject certain dataset types it
2269 is possible that the put will fail and raise a
2270 `DatasetTypeNotSupportedError`. The main use case for this is to
2271 allow `ChainedDatastore` to put to multiple datastores without
2272 requiring that every datastore accepts the dataset.
2273 """
2275 doDisassembly = self.composites.shouldBeDisassembled(ref)
2276 # doDisassembly = True
2278 artifacts = []
2279 if doDisassembly:
2280 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2281 if components is None:
2282 raise RuntimeError(
2283 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2284 f"with storage class {ref.datasetType.storageClass.name} "
2285 "is configured to be disassembled, but cannot be."
2286 )
2287 for component, componentInfo in components.items():
2288 # Don't recurse because we want to take advantage of
2289 # bulk insert -- need a new DatasetRef that refers to the
2290 # same dataset_id but has the component DatasetType.
2291 # DatasetType does not refer to the types of components,
2292 # so we construct one ourselves.
2293 compRef = ref.makeComponentRef(component)
2294 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2295 artifacts.append((compRef, storedInfo))
2296 else:
2297 # Write the entire thing out
2298 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2299 artifacts.append((ref, storedInfo))
2301 self._register_datasets(artifacts)
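# Example: a minimal sketch of a put followed by an existence check.
# ``datastore``, ``obj`` and ``ref`` are illustrative names for a
# configured FileDatastore, an in-memory dataset matching the ref's
# storage class, and a resolved DatasetRef.
def _example_put(datastore, obj, ref):
    try:
        datastore.put(obj, ref)
    except DatasetTypeNotSupportedError:
        # A datastore configured to reject this dataset type refuses the
        # put; ChainedDatastore relies on this behaviour.
        return False
    return datastore.exists(ref)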
2303 @transactional
2304 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2305 # At this point can safely remove these datasets from the cache
2306 # to avoid confusion later on. If they are not trashed later
2307 # the cache will simply be refilled.
2308 self.cacheManager.remove_from_cache(ref)
2310 # If we are in trust mode there will be nothing to move to
2311 # the trash table and we will have to try to delete the file
2312 # immediately.
2313 if self.trustGetRequest:
2314 # Try to keep the logic below for a single file trash.
2315 if isinstance(ref, DatasetRef):
2316 refs = {ref}
2317 else:
2318 # Will recreate ref at the end of this branch.
2319 refs = set(ref)
2321 # Determine which datasets are known to datastore directly.
2322 id_to_ref = {ref.id: ref for ref in refs}
2323 existing_ids = self._get_stored_records_associated_with_refs(refs)
2324 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2326 missing = refs - existing_refs
2327 if missing:
2328 # Do an explicit existence check on these refs.
2329 # We only care about the artifacts at this point and not
2330 # the dataset existence.
2331 artifact_existence: dict[ResourcePath, bool] = {}
2332 _ = self.mexists(missing, artifact_existence)
2333 uris = [uri for uri, exists in artifact_existence.items() if exists]
2335 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2336 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2337 for uri in uris:
2338 try:
2339 uri.remove()
2340 except Exception as e:
2341 if ignore_errors:
2342 log.debug("Artifact %s could not be removed: %s", uri, e)
2343 continue
2344 raise
2346 # There is no point asking the code below to remove refs we
2347 # know are missing so update it with the list of existing
2348 # records. Try to retain one vs many logic.
2349 if not existing_refs:
2350 # Nothing more to do since none of the datasets were
2351 # known to the datastore record table.
2352 return
2353 ref = list(existing_refs)
2354 if len(ref) == 1:
2355 ref = ref[0]
2357 # Get file metadata and internal metadata
2358 if not isinstance(ref, DatasetRef):
2359 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2360 # Assumed to be an iterable of refs so bulk mode enabled.
2361 try:
2362 self.bridge.moveToTrash(ref, transaction=self._transaction)
2363 except Exception as e:
2364 if ignore_errors:
2365 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2366 else:
2367 raise
2368 return
2370 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2372 fileLocations = self._get_dataset_locations_info(ref)
2374 if not fileLocations:
2375 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2376 if ignore_errors:
2377 log.warning(err_msg)
2378 return
2379 else:
2380 raise FileNotFoundError(err_msg)
2382 for location, storedFileInfo in fileLocations:
2383 if not self._artifact_exists(location):
2384 err_msg = (
2385 f"Dataset is known to datastore {self.name} but "
2386 f"associated artifact ({location.uri}) is missing"
2387 )
2388 if ignore_errors:
2389 log.warning(err_msg)
2390 return
2391 else:
2392 raise FileNotFoundError(err_msg)
2394 # Mark dataset as trashed
2395 try:
2396 self.bridge.moveToTrash([ref], transaction=self._transaction)
2397 except Exception as e:
2398 if ignore_errors:
2399 log.warning(
2400 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2401 "but encountered an error: %s",
2402 ref,
2403 self.name,
2404 e,
2405 )
2406 pass
2407 else:
2408 raise
2410 @transactional
2411 def emptyTrash(self, ignore_errors: bool = True) -> None:
2412 """Remove all datasets from the trash.
2414 Parameters
2415 ----------
2416 ignore_errors : `bool`
2417 If `True` return without error even if something went wrong.
2418 Problems could occur if another process is simultaneously trying
2419 to delete.
2420 """
2421 log.debug("Emptying trash in datastore %s", self.name)
2423 # Context manager will empty trash iff we finish it without raising.
2424 # It will also automatically delete the relevant rows from the
2425 # trash table and the records table.
2426 with self.bridge.emptyTrash(
2427 self._table, record_class=StoredFileInfo, record_column="path"
2428 ) as trash_data:
2429 # Removing the artifacts themselves requires that the files are
2430 # not also associated with refs that are not to be trashed.
2431 # Therefore need to do a query with the file paths themselves
2432 # and return all the refs associated with them. Can only delete
2433 # a file if the refs to be trashed are the only refs associated
2434 # with the file.
2435 # This requires multiple copies of the trashed items
2436 trashed, artifacts_to_keep = trash_data
2438 if artifacts_to_keep is None:
2439 # The bridge is not helping us so have to work it out
2440 # ourselves. This is not going to be as efficient.
2441 trashed = list(trashed)
2443 # The instance check is for mypy since up to this point it
2444 # does not know the type of info.
2445 path_map = self._refs_associated_with_artifacts(
2446 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2447 )
2449 for ref, info in trashed:
2450 # Mypy needs to know this is not the base class
2451 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2453 path_map[info.path].remove(ref.id)
2454 if not path_map[info.path]:
2455 del path_map[info.path]
2457 artifacts_to_keep = set(path_map)
2459 for ref, info in trashed:
2460 # Should not happen for this implementation but need
2461 # to keep mypy happy.
2462 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2464 # Mypy needs to know this is not the base class
2465 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2467 if info.path in artifacts_to_keep:
2468 # This is a multi-dataset artifact and we are not
2469 # removing all associated refs.
2470 continue
2472 # Only trashed refs still known to datastore will be returned.
2473 location = info.file_location(self.locationFactory)
2475 # Point of no return for this artifact
2476 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2477 try:
2478 self._delete_artifact(location)
2479 except FileNotFoundError:
2480 # If the file itself has been deleted there is nothing
2481 # we can do about it. It is possible that trash has
2482 # been run in parallel in another process or someone
2483 # decided to delete the file. It is unlikely to come
2484 # back and so we should still continue with the removal
2485 # of the entry from the trash table. It is also possible
2486 # we removed it in a previous iteration if it was
2487 # a multi-dataset artifact. The delete artifact method
2488 # will log a debug message in this scenario.
2489 # Distinguishing file missing before trash started and
2490 # file already removed previously as part of this trash
2491 # is not worth the distinction with regards to potential
2492 # memory cost.
2493 pass
2494 except Exception as e:
2495 if ignore_errors:
2496 # Use a debug message here even though it's not
2497 # a good situation. In some cases this can be
2498 # caused by a race between user A and user B
2499 # and neither of them has permissions for the
2500 # other's files. Butler does not know about users
2501 # and trash has no idea what collections these
2502 # files were in (without guessing from a path).
2503 log.debug(
2504 "Encountered error removing artifact %s from datastore %s: %s",
2505 location.uri,
2506 self.name,
2507 e,
2508 )
2509 else:
2510 raise
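# Example: a minimal sketch of the two-stage removal flow. ``datastore``
# and ``refs`` are illustrative names. trash() marks the datasets (or, in
# trust mode, deletes unknown artifacts immediately); emptyTrash() then
# removes artifacts whose refs have all been trashed.
def _example_remove(datastore, refs):
    datastore.trash(refs, ignore_errors=True)
    datastore.emptyTrash(ignore_errors=True)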
2512 @transactional
2513 def transfer_from(
2514 self,
2515 source_datastore: Datastore,
2516 refs: Iterable[DatasetRef],
2517 transfer: str = "auto",
2518 artifact_existence: dict[ResourcePath, bool] | None = None,
2519 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2520 # Docstring inherited
2521 if type(self) is not type(source_datastore):
2522 raise TypeError(
2523 f"Datastore mismatch between this datastore ({type(self)}) and the "
2524 f"source datastore ({type(source_datastore)})."
2525 )
2527 # Be explicit for mypy
2528 if not isinstance(source_datastore, FileDatastore):
2529 raise TypeError(
2530 "Can only transfer to a FileDatastore from another FileDatastore, not"
2531 f" {type(source_datastore)}"
2532 )
2534 # Stop early if "direct" transfer mode is requested. That would
2535 # require that the URI inside the source datastore should be stored
2536 # directly in the target datastore, which seems unlikely to be useful
2537 # since at any moment the source datastore could delete the file.
2538 if transfer in ("direct", "split"):
2539 raise ValueError(
2540 f"Can not transfer from a source datastore using {transfer} mode since"
2541 " those files are controlled by the other datastore."
2542 )
2544 # Empty existence lookup if none given.
2545 if artifact_existence is None:
2546 artifact_existence = {}
2548 # We will go through the list multiple times so must convert
2549 # generators to lists.
2550 refs = list(refs)
2552 # In order to handle disassembled composites the code works
2553 # at the records level since it can assume that internal APIs
2554 # can be used.
2555 # - If the record already exists in the destination this is assumed
2556 # to be okay.
2557 # - If there is no record but the source and destination URIs are
2558 # identical no transfer is done but the record is added.
2559 # - If the source record refers to an absolute URI currently assume
2560 # that that URI should remain absolute and will be visible to the
2561 # destination butler. May need to have a flag to indicate whether
2562 # the dataset should be transferred. This will only happen if
2563 # the detached Butler has had a local ingest.
2565 # What we really want is all the records in the source datastore
2566 # associated with these refs. Or derived ones if they don't exist
2567 # in the source.
2568 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2570 # The source dataset_ids are the keys in these records
2571 source_ids = set(source_records)
2572 log.debug("Number of datastore records found in source: %d", len(source_ids))
2574 requested_ids = {ref.id for ref in refs}
2575 missing_ids = requested_ids - source_ids
2577 # Missing IDs can be okay if that datastore has allowed
2578 # gets based on file existence. Should we transfer what we can
2579 # or complain about it and warn?
2580 if missing_ids and not source_datastore.trustGetRequest:
2581 raise ValueError(
2582 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2583 )
2585 # Need to map these missing IDs to a DatasetRef so we can guess
2586 # the details.
2587 if missing_ids:
2588 log.info(
2589 "Number of expected datasets missing from source datastore records: %d out of %d",
2590 len(missing_ids),
2591 len(requested_ids),
2592 )
2593 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2595 # This should be chunked in case we end up having to check
2596 # the file store since we need some log output to show
2597 # progress.
2598 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2599 records = {}
2600 for missing in missing_ids_chunk:
2601 # Ask the source datastore where the missing artifacts
2602 # should be. An execution butler might not know about the
2603 # artifacts even if they are there.
2604 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2605 records[missing] = [info for _, info in expected]
2607 # Call the mexist helper method in case we have not already
2608 # checked these artifacts such that artifact_existence is
2609 # empty. This allows us to benefit from parallelism.
2610 # datastore.mexists() itself does not give us access to the
2611 # derived datastore record.
2612 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2613 ref_exists = source_datastore._process_mexists_records(
2614 id_to_ref, records, False, artifact_existence=artifact_existence
2615 )
2617 # Now go through the records and propagate the ones that exist.
2618 location_factory = source_datastore.locationFactory
2619 for missing, record_list in records.items():
2620 # Skip completely if the ref does not exist.
2621 ref = id_to_ref[missing]
2622 if not ref_exists[ref]:
2623 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2624 continue
2625 # Check for file artifact to decide which parts of a
2626 # disassembled composite do exist. If there is only a
2627 # single record we don't even need to look because it can't
2628 # be a composite and must exist.
2629 if len(record_list) == 1:
2630 dataset_records = record_list
2631 else:
2632 dataset_records = [
2633 record
2634 for record in record_list
2635 if artifact_existence[record.file_location(location_factory).uri]
2636 ]
2637 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2639 # Rely on source_records being a defaultdict.
2640 source_records[missing].extend(dataset_records)
2642 # See if we already have these records
2643 target_records = self._get_stored_records_associated_with_refs(refs)
2645 # The artifacts to register
2646 artifacts = []
2648 # Refs that already exist
2649 already_present = []
2651 # Refs that were rejected by this datastore.
2652 rejected = set()
2654 # Refs that were transferred successfully.
2655 accepted = set()
2657 # Record each time we have done a "direct" transfer.
2658 direct_transfers = []
2660 # Now can transfer the artifacts
2661 for ref in refs:
2662 if not self.constraints.isAcceptable(ref):
2663 # This datastore should not be accepting this dataset.
2664 rejected.add(ref)
2665 continue
2667 accepted.add(ref)
2669 if ref.id in target_records:
2670 # Already have an artifact for this.
2671 already_present.append(ref)
2672 continue
2674 # mypy needs to know these are always resolved refs
2675 for info in source_records[ref.id]:
2676 source_location = info.file_location(source_datastore.locationFactory)
2677 target_location = info.file_location(self.locationFactory)
2678 if source_location == target_location and not source_location.pathInStore.isabs():
2679 # Artifact is already in the target location.
2680 # (which is how execution butler currently runs)
2681 pass
2682 else:
2683 if target_location.pathInStore.isabs():
2684 # Just because we can see the artifact when running
2685 # the transfer doesn't mean it will be generally
2686 # accessible to a user of this butler. Need to decide
2687 # what to do about an absolute path.
2688 if transfer == "auto":
2689 # For "auto" transfers we allow the absolute URI
2690 # to be recorded in the target datastore.
2691 direct_transfers.append(source_location)
2692 else:
2693 # The user is explicitly requesting a transfer
2694 # even for an absolute URI. This requires us to
2695 # calculate the target path.
2696 template_ref = ref
2697 if info.component:
2698 template_ref = ref.makeComponentRef(info.component)
2699 target_location = self._calculate_ingested_datastore_name(
2700 source_location.uri,
2701 template_ref,
2702 )
2704 info = info.update(path=target_location.pathInStore.path)
2706 # Need to transfer it to the new location.
2707 # Assume we should always overwrite. If the artifact
2708 # is there this might indicate that a previous transfer
2709 # was interrupted but was not able to be rolled back
2710 # completely (eg pre-emption) so follow Datastore default
2711 # and overwrite.
2712 target_location.uri.transfer_from(
2713 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2714 )
2716 artifacts.append((ref, info))
2718 if direct_transfers:
2719 log.info(
2720 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2721 len(direct_transfers),
2722 "" if len(direct_transfers) == 1 else "s",
2723 )
2725 self._register_datasets(artifacts)
2727 if already_present:
2728 n_skipped = len(already_present)
2729 log.info(
2730 "Skipped transfer of %d dataset%s already present in datastore",
2731 n_skipped,
2732 "" if n_skipped == 1 else "s",
2733 )
2735 return accepted, rejected
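# Example: a minimal sketch of transferring datasets between two
# FileDatastore instances. ``source``, ``target`` and ``refs`` are
# illustrative names; "direct" and "split" transfer modes are rejected.
def _example_transfer(source, target, refs):
    # Reusing one artifact-existence cache avoids re-checking the same
    # URIs if this is called more than once.
    artifact_existence: dict[ResourcePath, bool] = {}
    accepted, rejected = target.transfer_from(
        source, refs, transfer="auto", artifact_existence=artifact_existence
    )
    return accepted, rejected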
2737 @transactional
2738 def forget(self, refs: Iterable[DatasetRef]) -> None:
2739 # Docstring inherited.
2740 refs = list(refs)
2741 self.bridge.forget(refs)
2742 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2744 def validateConfiguration(
2745 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2746 ) -> None:
2747 """Validate some of the configuration for this datastore.
2749 Parameters
2750 ----------
2751 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2752 Entities to test against this configuration. Can be differing
2753 types.
2754 logFailures : `bool`, optional
2755 If `True`, output a log message for every validation error
2756 detected.
2758 Raises
2759 ------
2760 DatastoreValidationError
2761 Raised if there is a validation problem with a configuration.
2762 All the problems are reported in a single exception.
2764 Notes
2765 -----
2766 This method checks that all the supplied entities have valid file
2767 templates and also have formatters defined.
2768 """
2770 templateFailed = None
2771 try:
2772 self.templates.validateTemplates(entities, logFailures=logFailures)
2773 except FileTemplateValidationError as e:
2774 templateFailed = str(e)
2776 formatterFailed = []
2777 for entity in entities:
2778 try:
2779 self.formatterFactory.getFormatterClass(entity)
2780 except KeyError as e:
2781 formatterFailed.append(str(e))
2782 if logFailures:
2783 log.critical("Formatter failure: %s", e)
2785 if templateFailed or formatterFailed:
2786 messages = []
2787 if templateFailed:
2788 messages.append(templateFailed)
2789 if formatterFailed:
2790 messages.append(",".join(formatterFailed))
2791 msg = ";\n".join(messages)
2792 raise DatastoreValidationError(msg)
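# Example: a minimal sketch of validating dataset types against this
# datastore's file templates and formatters. ``datastore`` and
# ``dataset_types`` are illustrative names.
def _example_validate(datastore, dataset_types):
    try:
        datastore.validateConfiguration(dataset_types, logFailures=True)
    except DatastoreValidationError as e:
        # All template and formatter problems are reported together.
        return str(e)
    return None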
2794 def getLookupKeys(self) -> set[LookupKey]:
2795 # Docstring is inherited from base class
2796 return (
2797 self.templates.getLookupKeys()
2798 | self.formatterFactory.getLookupKeys()
2799 | self.constraints.getLookupKeys()
2800 )
2802 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2803 # Docstring is inherited from base class
2804 # The key can be valid in either formatters or templates so we can
2805 # only check the template if it exists
2806 if lookupKey in self.templates:
2807 try:
2808 self.templates[lookupKey].validateTemplate(entity)
2809 except FileTemplateValidationError as e:
2810 raise DatastoreValidationError(e) from e
2812 def export(
2813 self,
2814 refs: Iterable[DatasetRef],
2815 *,
2816 directory: ResourcePathExpression | None = None,
2817 transfer: str | None = "auto",
2818 ) -> Iterable[FileDataset]:
2819 # Docstring inherited from Datastore.export.
2820 if transfer == "auto" and directory is None:
2821 transfer = None
2823 if transfer is not None and directory is None:
2824 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2826 if transfer == "move":
2827 raise TypeError("Can not export by moving files out of datastore.")
2828 elif transfer == "direct":
2829 # For an export, treat this as equivalent to None. We do not
2830 # want an import to risk using absolute URIs to datasets owned
2831 # by another datastore.
2832 log.info("Treating 'direct' transfer mode as in-place export.")
2833 transfer = None
2835 # Force the directory to be a URI object
2836 directoryUri: ResourcePath | None = None
2837 if directory is not None:
2838 directoryUri = ResourcePath(directory, forceDirectory=True)
2840 if transfer is not None and directoryUri is not None:
2841 # mypy needs the second test
2842 if not directoryUri.exists():
2843 raise FileNotFoundError(f"Export location {directory} does not exist")
2845 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2846 for ref in progress.wrap(refs, "Exporting dataset files"):
2847 fileLocations = self._get_dataset_locations_info(ref)
2848 if not fileLocations:
2849 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2850 # For now we can not export disassembled datasets
2851 if len(fileLocations) > 1:
2852 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2853 location, storedFileInfo = fileLocations[0]
2855 pathInStore = location.pathInStore.path
2856 if transfer is None:
2857 # TODO: do we also need to return the readStorageClass somehow?
2858 # We will use the path in store directly. If this is an
2859 # absolute URI, preserve it.
2860 if location.pathInStore.isabs():
2861 pathInStore = str(location.uri)
2862 elif transfer == "direct":
2863 # Use full URIs to the remote store in the export
2864 pathInStore = str(location.uri)
2865 else:
2866 # mypy needs help
2867 assert directoryUri is not None, "directoryUri must be defined to get here"
2868 storeUri = ResourcePath(location.uri)
2870 # if the datastore has an absolute URI to a resource, we
2871 # have two options:
2872 # 1. Keep the absolute URI in the exported YAML
2873 # 2. Allocate a new name in the local datastore and transfer
2874 # it.
2875 # For now go with option 2
2876 if location.pathInStore.isabs():
2877 template = self.templates.getTemplate(ref)
2878 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2879 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2881 exportUri = directoryUri.join(pathInStore)
2882 exportUri.transfer_from(storeUri, transfer=transfer)
2884 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
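# Example: a minimal sketch of exporting file artifacts alongside their
# FileDataset records. ``datastore``, ``refs`` and ``export_dir`` are
# illustrative names; the export directory must already exist.
def _example_export(datastore, refs, export_dir):
    # With a directory given, "auto" copies each artifact; transfer=None
    # would record the in-store paths without copying anything.
    return list(datastore.export(refs, directory=export_dir, transfer="auto"))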
2886 @staticmethod
2887 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2888 """Compute the checksum of the supplied file.
2890 Parameters
2891 ----------
2892 uri : `lsst.resources.ResourcePath`
2893 Name of resource to calculate checksum from.
2894 algorithm : `str`, optional
2895 Name of algorithm to use. Must be one of the algorithms supported
2896 by :py:mod:`hashlib`.
2897 block_size : `int`
2898 Number of bytes to read from file at one time.
2900 Returns
2901 -------
2902 hexdigest : `str`
2903 Hex digest of the file.
2905 Notes
2906 -----
2907 Currently returns None if the URI is for a remote resource.
2908 """
2909 if algorithm not in hashlib.algorithms_guaranteed:
2910 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2912 if not uri.isLocal:
2913 return None
2915 hasher = hashlib.new(algorithm)
2917 with uri.as_local() as local_uri:
2918 with open(local_uri.ospath, "rb") as f:
2919 for chunk in iter(lambda: f.read(block_size), b""):
2920 hasher.update(chunk)
2922 return hasher.hexdigest()
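# Example: a minimal sketch of checksumming a local file with the static
# helper above. ``datastore`` and the path are illustrative; remote URIs
# currently return None.
def _example_checksum(datastore, path="/tmp/example.fits"):
    uri = ResourcePath(path)
    # blake2b is the default; any name in hashlib.algorithms_guaranteed
    # is accepted.
    return datastore.computeChecksum(uri, algorithm="blake2b", block_size=8192)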
2924 def needs_expanded_data_ids(
2925 self,
2926 transfer: str | None,
2927 entity: DatasetRef | DatasetType | StorageClass | None = None,
2928 ) -> bool:
2929 # Docstring inherited.
2930 # This _could_ also use entity to inspect whether the filename template
2931 # involves placeholders other than the required dimensions for its
2932 # dataset type, but that's not necessary for correctness; it just
2933 # enables more optimizations (perhaps only in theory).
2934 return transfer not in ("direct", None)
2936 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2937 # Docstring inherited from the base class.
2938 record_data = data.get(self.name)
2939 if not record_data:
2940 return
2942 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys())
2944 # TODO: Verify that there are no unexpected table names in the dict?
2945 unpacked_records = []
2946 for dataset_data in record_data.records.values():
2947 records = dataset_data.get(self._table.name)
2948 if records:
2949 for info in records:
2950 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2951 unpacked_records.append(info.to_record())
2952 if unpacked_records:
2953 self._table.insert(*unpacked_records, transaction=self._transaction)
2955 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2956 # Docstring inherited from the base class.
2957 exported_refs = list(self._bridge.check(refs))
2958 ids = {ref.id for ref in exported_refs}
2959 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
2960 for row in self._table.fetch(dataset_id=ids):
2961 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2962 dataset_records = records.setdefault(info.dataset_id, {})
2963 dataset_records.setdefault(self._table.name, []).append(info)
2965 record_data = DatastoreRecordData(records=records)
2966 return {self.name: record_data}
2968 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
2969 # Docstring inherited from the base class.
2970 self._retrieve_dataset_method = method
2972 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
2973 """Update dataset reference to use the storage class from registry."""
2974 if self._retrieve_dataset_method is None:
2975 # We could raise an exception here but unit tests do not define
2976 # this method.
2977 return ref
2978 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
2979 if dataset_type is not None:
2980 ref = ref.overrideStorageClass(dataset_type.storageClass)
2981 return ref