Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%
972 statements
coverage.py v7.2.7, created at 2023-08-05 01:26 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Generic file-based datastore code."""
24from __future__ import annotations
26__all__ = ("FileDatastore",)
28import contextlib
29import hashlib
30import logging
31from collections import defaultdict
32from collections.abc import Callable, Iterable, Mapping, Sequence
33from dataclasses import dataclass
34from typing import TYPE_CHECKING, Any, ClassVar
36from lsst.daf.butler import (
37 CompositesMap,
38 Config,
39 DatasetId,
40 DatasetRef,
41 DatasetRefURIs,
42 DatasetType,
43 DatasetTypeNotSupportedError,
44 Datastore,
45 DatastoreCacheManager,
46 DatastoreConfig,
47 DatastoreDisabledCacheManager,
48 DatastoreRecordData,
49 DatastoreValidationError,
50 FileDataset,
51 FileDescriptor,
52 FileTemplates,
53 FileTemplateValidationError,
54 Formatter,
55 FormatterFactory,
56 Location,
57 LocationFactory,
58 Progress,
59 StorageClass,
60 StoredDatastoreItemInfo,
61 StoredFileInfo,
62 ddl,
63)
64from lsst.daf.butler.core.repoRelocation import replaceRoot
65from lsst.daf.butler.core.utils import transactional
66from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
67from lsst.resources import ResourcePath, ResourcePathExpression
68from lsst.utils.introspection import get_class_of, get_instance_of
69from lsst.utils.iteration import chunk_iterable
71# For VERBOSE logging usage.
72from lsst.utils.logging import VERBOSE, getLogger
73from lsst.utils.timer import time_this
74from sqlalchemy import BigInteger, String
76from ..registry.interfaces import FakeDatasetRef
77from .genericDatastore import GenericBaseDatastore
79if TYPE_CHECKING:
80 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
81 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
83log = getLogger(__name__)
86class _IngestPrepData(Datastore.IngestPrepData):
87 """Helper class for FileDatastore ingest implementation.
89 Parameters
90 ----------
91 datasets : `~collections.abc.Iterable` of `FileDataset`
92 Files to be ingested by this datastore.
93 """
95 def __init__(self, datasets: Iterable[FileDataset]):
96 super().__init__(ref for dataset in datasets for ref in dataset.refs)
97 self.datasets = datasets
100@dataclass(frozen=True)
101class DatastoreFileGetInformation:
102 """Collection of useful parameters needed to retrieve a file from
103 a Datastore.
104 """
106 location: Location
107 """The location from which to read the dataset."""
109 formatter: Formatter
110 """The `Formatter` to use to deserialize the dataset."""
112 info: StoredFileInfo
113 """Stored information about this file and its formatter."""
115 assemblerParams: Mapping[str, Any]
116 """Parameters to use for post-processing the retrieved dataset."""
118 formatterParams: Mapping[str, Any]
119 """Parameters that were understood by the associated formatter."""
121 component: str | None
122 """The component to be retrieved (can be `None`)."""
124 readStorageClass: StorageClass
125 """The `StorageClass` of the dataset being read."""
128class FileDatastore(GenericBaseDatastore):
129 """Generic Datastore for file-based implementations.
131 Should always be sub-classed since key abstract methods are missing.
133 Parameters
134 ----------
135 config : `DatastoreConfig` or `str`
136 Configuration as either a `Config` object or URI to file.
137 bridgeManager : `DatastoreRegistryBridgeManager`
138 Object that manages the interface between `Registry` and datastores.
139 butlerRoot : `str`, optional
140 New datastore root to use to override the configuration value.
142 Raises
143 ------
144 ValueError
145 If root location does not exist and ``create`` is `False` in the
146 configuration.
147 """
149 defaultConfigFile: ClassVar[str | None] = None
150 """Path to configuration defaults. Accessed within the ``config`` resource
151 or relative to a search path. Can be None if no defaults specified.
152 """
154 root: ResourcePath
155 """Root directory URI of this `Datastore`."""
157 locationFactory: LocationFactory
158 """Factory for creating locations relative to the datastore root."""
160 formatterFactory: FormatterFactory
161 """Factory for creating instances of formatters."""
163 templates: FileTemplates
164 """File templates that can be used by this `Datastore`."""
166 composites: CompositesMap
167 """Determines whether a dataset should be disassembled on put."""
169 defaultConfigFile = "datastores/fileDatastore.yaml"
170 """Path to configuration defaults. Accessed within the ``config`` resource
171 or relative to a search path. Can be None if no defaults specified.
172 """
174 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
175 """Callable that is used in trusted mode to retrieve registry definition
176 of a named dataset type.
177 """
179 @classmethod
180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
181 """Set any filesystem-dependent config options for this Datastore to
182 be appropriate for a new empty repository with the given root.
184 Parameters
185 ----------
186 root : `str`
187 URI to the root of the data repository.
188 config : `Config`
189 A `Config` to update. Only the subset understood by
190 this component will be updated. Will not expand
191 defaults.
192 full : `Config`
193 A complete config with all defaults expanded that can be
194 converted to a `DatastoreConfig`. Read-only and will not be
195 modified by this method.
196 Repository-specific options that should not be obtained
197 from defaults when Butler instances are constructed
198 should be copied from ``full`` to ``config``.
199 overwrite : `bool`, optional
200 If `False`, do not modify a value in ``config`` if the value
201 already exists. Default is always to overwrite with the provided
202 ``root``.
204 Notes
205 -----
206 If a keyword is explicitly defined in the supplied ``config`` it
207 will not be overridden by this method if ``overwrite`` is `False`.
208 This allows explicit values set in external configs to be retained.
209 """
210 Config.updateParameters(
211 DatastoreConfig,
212 config,
213 full,
214 toUpdate={"root": root},
215 toCopy=("cls", ("records", "table")),
216 overwrite=overwrite,
217 )
219 @classmethod
220 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
221 return ddl.TableSpec(
222 fields=[
223 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
224 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
225 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
226 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
227 # Use empty string to indicate no component
228 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
229 # TODO: should checksum be Base64Bytes instead?
230 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
231 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
232 ],
233 unique=frozenset(),
234 indexes=[ddl.IndexSpec("path")],
235 )
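# Editorial sketch (not part of the original source): obtaining the records
# table specification defined above. The real __init__ below passes
# ``bridgeManager.datasetIdColumnType``; ``BigInteger`` is an assumption used
# here purely for illustration.
#
#     from sqlalchemy import BigInteger
#
#     spec = FileDatastore.makeTableSpec(BigInteger)
#     # The spec describes dataset_id (primary key), path, formatter,
#     # storage_class, component, checksum and file_size, plus an index
#     # on "path".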
237 def __init__(
238 self,
239 config: DatastoreConfig | ResourcePathExpression,
240 bridgeManager: DatastoreRegistryBridgeManager,
241 butlerRoot: str | None = None,
242 ):
243 super().__init__(config, bridgeManager)
244 if "root" not in self.config:
245 raise ValueError("No root directory specified in configuration")
247 self._bridgeManager = bridgeManager
249 # Name ourselves either using an explicit name or a name
250 # derived from the (unexpanded) root
251 if "name" in self.config:
252 self.name = self.config["name"]
253 else:
254 # We use the unexpanded root in the name to indicate that this
255 # datastore can be moved without having to update registry.
256 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
258 # Support repository relocation in config
259 # Existence of self.root is checked in subclass
260 self.root = ResourcePath(
261 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
262 )
264 self.locationFactory = LocationFactory(self.root)
265 self.formatterFactory = FormatterFactory()
267 # Now associate formatters with storage classes
268 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
270 # Read the file naming templates
271 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
273 # See if composites should be disassembled
274 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
276 tableName = self.config["records", "table"]
277 try:
278 # Storage of paths and formatters, keyed by dataset_id
279 self._table = bridgeManager.opaque.register(
280 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
281 )
282 # Interface to Registry.
283 self._bridge = bridgeManager.register(self.name)
284 except ReadOnlyDatabaseError:
285 # If the database is read only and we just tried and failed to
286 # create a table, it means someone is trying to create a read-only
287 # butler client for an empty repo. That should be okay, as long
288 # as they then try to get any datasets before some other client
289 # creates the table. Chances are they're just validating
290 # configuration.
291 pass
293 # Determine whether checksums should be used - default to False
294 self.useChecksum = self.config.get("checksum", False)
296 # Determine whether we can fall back to configuration if a
297 # requested dataset is not known to registry
298 self.trustGetRequest = self.config.get("trust_get_request", False)
300 # Create a cache manager
301 self.cacheManager: AbstractDatastoreCacheManager
302 if "cached" in self.config:
303 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
304 else:
305 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
307 # Check existence and create directory structure if necessary
308 if not self.root.exists():
309 if "create" not in self.config or not self.config["create"]:
310 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
311 try:
312 self.root.mkdir()
313 except Exception as e:
314 raise ValueError(
315 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
316 ) from e
318 def __str__(self) -> str:
319 return str(self.root)
321 @property
322 def bridge(self) -> DatastoreRegistryBridge:
323 return self._bridge
325 @property
326 def roots(self) -> dict[str, ResourcePath | None]:
327 # Docstring inherited.
328 return {self.name: self.root}
330 def _artifact_exists(self, location: Location) -> bool:
331 """Check that an artifact exists in this datastore at the specified
332 location.
334 Parameters
335 ----------
336 location : `Location`
337 Expected location of the artifact associated with this datastore.
339 Returns
340 -------
341 exists : `bool`
342 `True` if the location can be found, `False` otherwise.
343 """
344 log.debug("Checking if resource exists: %s", location.uri)
345 return location.uri.exists()
347 def _delete_artifact(self, location: Location) -> None:
348 """Delete the artifact from the datastore.
350 Parameters
351 ----------
352 location : `Location`
353 Location of the artifact associated with this datastore.
354 """
355 if location.pathInStore.isabs():
356 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
358 try:
359 location.uri.remove()
360 except FileNotFoundError:
361 log.debug("File %s did not exist and so could not be deleted.", location.uri)
362 raise
363 except Exception as e:
364 log.critical("Failed to delete file: %s (%s)", location.uri, e)
365 raise
366 log.debug("Successfully deleted file: %s", location.uri)
368 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
369 # Docstring inherited from GenericBaseDatastore
370 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos, strict=True)]
371 self._table.insert(*records, transaction=self._transaction)
373 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]:
374 # Docstring inherited from GenericBaseDatastore
376 # Look for the dataset_id -- there might be multiple matches
377 # if we have disassembled the dataset.
378 records = self._table.fetch(dataset_id=ref.id)
379 return [StoredFileInfo.from_record(record) for record in records]
381 def _get_stored_records_associated_with_refs(
382 self, refs: Iterable[DatasetIdRef]
383 ) -> dict[DatasetId, list[StoredFileInfo]]:
384 """Retrieve all records associated with the provided refs.
386 Parameters
387 ----------
388 refs : iterable of `DatasetIdRef`
389 The refs for which records are to be retrieved.
391 Returns
392 -------
393 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
394 The matching records indexed by the ref ID. The number of entries
395 in the dict can be smaller than the number of requested refs.
396 """
397 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
399 # Uniqueness is dataset_id + component so can have multiple records
400 # per ref.
401 records_by_ref = defaultdict(list)
402 for record in records:
403 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
404 return records_by_ref
406 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
407 """Return paths and associated dataset refs.
409 Parameters
410 ----------
411 paths : `list` of `str` or `lsst.resources.ResourcePath`
412 All the paths to include in search.
414 Returns
415 -------
416 mapping : `dict` of [`str`, `set` [`DatasetId`]]
417 Mapping of each path to a set of associated database IDs.
418 """
419 records = self._table.fetch(path=[str(path) for path in paths])
420 result = defaultdict(set)
421 for row in records:
422 result[row["path"]].add(row["dataset_id"])
423 return result
425 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
426 """Return all dataset refs associated with the supplied path.
428 Parameters
429 ----------
430 pathInStore : `lsst.resources.ResourcePath`
431 Path of interest in the data store.
433 Returns
434 -------
435 ids : `set` [`DatasetId`]
436 All `DatasetRef` IDs associated with this path.
437 """
438 records = list(self._table.fetch(path=str(pathInStore)))
439 ids = {r["dataset_id"] for r in records}
440 return ids
442 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
443 # Docstring inherited from GenericBaseDatastore
444 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
446 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]:
447 r"""Find all the `Location`\ s of the requested dataset in the
448 `Datastore` and the associated stored file information.
450 Parameters
451 ----------
452 ref : `DatasetRef`
453 Reference to the required `Dataset`.
455 Returns
456 -------
457 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
458 Location of the dataset within the datastore and
459 stored information about each file and its formatter.
460 """
461 # Get the file information (this will fail if no file)
462 records = self.getStoredItemsInfo(ref)
464 # Use the path to determine the location -- we need to take
465 # into account absolute URIs in the datastore record
466 return [(r.file_location(self.locationFactory), r) for r in records]
468 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
469 """Check that there is only one dataset associated with the
470 specified artifact.
472 Parameters
473 ----------
474 ref : `DatasetRef` or `FakeDatasetRef`
475 Dataset to be removed.
476 location : `Location`
477 The location of the artifact to be removed.
479 Returns
480 -------
481 can_remove : `bool`
482 `True` if the artifact can be safely removed.
483 """
484 # Can't ever delete absolute URIs.
485 if location.pathInStore.isabs():
486 return False
488 # Get all entries associated with this path
489 allRefs = self._registered_refs_per_artifact(location.pathInStore)
490 if not allRefs:
491 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
493 # Remove these refs from all the refs and if there is nothing left
494 # then we can delete
495 remainingRefs = allRefs - {ref.id}
497 if remainingRefs:
498 return False
499 return True
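# Editorial sketch (not part of the original source): the safety check above
# reduces to a set difference -- an artifact may only be removed when no other
# dataset ID still references it. Names here are hypothetical.
#
#     def can_remove(all_refs: set, ref_id) -> bool:
#         return not (all_refs - {ref_id})
#
#     assert can_remove({"a"}, "a") is True
#     assert can_remove({"a", "b"}, "a") is False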
501 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
502 """Predict the location and related file information of the requested
503 dataset in this datastore.
505 Parameters
506 ----------
507 ref : `DatasetRef`
508 Reference to the required `Dataset`.
510 Returns
511 -------
512 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
513 Expected Location of the dataset within the datastore and
514 placeholder information about each file and its formatter.
516 Notes
517 -----
518 Uses the current configuration to determine how we would expect the
519 datastore files to have been written if we couldn't ask registry.
520 This is safe so long as there has been no change to datastore
521 configuration between writing the dataset and wanting to read it.
522 Will not work for files that have been ingested without using the
523 standard file template or default formatter.
524 """
525 # If we have a component ref we always need to ask the questions
526 # of the composite. If the composite is disassembled this routine
527 # should return all components. If the composite was not
528 # disassembled the composite is what is stored regardless of
529 # component request. Note that if the caller has disassembled
530 # a composite there is no way for this guess to know that
531 # without trying both the composite and component ref and seeing
532 # if there is something at the component Location even without
533 # disassembly being enabled.
534 if ref.datasetType.isComponent():
535 ref = ref.makeCompositeRef()
537 # See if the ref is a composite that should be disassembled
538 doDisassembly = self.composites.shouldBeDisassembled(ref)
540 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
542 if doDisassembly:
543 for component, componentStorage in ref.datasetType.storageClass.components.items():
544 compRef = ref.makeComponentRef(component)
545 location, formatter = self._determine_put_formatter_location(compRef)
546 all_info.append((location, formatter, componentStorage, component))
548 else:
549 # Always use the composite ref if no disassembly
550 location, formatter = self._determine_put_formatter_location(ref)
551 all_info.append((location, formatter, ref.datasetType.storageClass, None))
553 # Convert the list of tuples to have StoredFileInfo as second element
554 return [
555 (
556 location,
557 StoredFileInfo(
558 formatter=formatter,
559 path=location.pathInStore.path,
560 storageClass=storageClass,
561 component=component,
562 checksum=None,
563 file_size=-1,
564 dataset_id=ref.id,
565 ),
566 )
567 for location, formatter, storageClass, component in all_info
568 ]
570 def _prepare_for_get(
571 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
572 ) -> list[DatastoreFileGetInformation]:
573 """Check parameters for ``get`` and obtain formatter and
574 location.
576 Parameters
577 ----------
578 ref : `DatasetRef`
579 Reference to the required Dataset.
580 parameters : `dict`
581 `StorageClass`-specific parameters that specify, for example,
582 a slice of the dataset to be loaded.
584 Returns
585 -------
586 getInfo : `list` [`DatastoreFileGetInformation`]
587 Parameters needed to retrieve each file.
588 """
589 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
591 # The storage class we want to use eventually
592 refStorageClass = ref.datasetType.storageClass
594 # For trusted mode need to reset storage class.
595 ref = self._cast_storage_class(ref)
597 # Get file metadata and internal metadata
598 fileLocations = self._get_dataset_locations_info(ref)
599 if not fileLocations:
600 if not self.trustGetRequest:
601 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
602 # Assume the dataset is where we think it should be
603 fileLocations = self._get_expected_dataset_locations_info(ref)
605 if len(fileLocations) > 1:
606 disassembled = True
608 # If trust is involved it is possible that there will be
609 # components listed here that do not exist in the datastore.
610 # Explicitly check for file artifact existence and filter out any
611 # that are missing.
612 if self.trustGetRequest:
613 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
615 # For now complain only if we have no components at all. One
616 # component is probably a problem but we can punt that to the
617 # assembler.
618 if not fileLocations:
619 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
621 else:
622 disassembled = False
624 # Is this a component request?
625 refComponent = ref.datasetType.component()
627 fileGetInfo = []
628 for location, storedFileInfo in fileLocations:
629 # The storage class used to write the file
630 writeStorageClass = storedFileInfo.storageClass
632 # If this has been disassembled we need read to match the write
633 if disassembled:
634 readStorageClass = writeStorageClass
635 else:
636 readStorageClass = refStorageClass
638 formatter = get_instance_of(
639 storedFileInfo.formatter,
640 FileDescriptor(
641 location,
642 readStorageClass=readStorageClass,
643 storageClass=writeStorageClass,
644 parameters=parameters,
645 ),
646 ref.dataId,
647 )
649 formatterParams, notFormatterParams = formatter.segregateParameters()
651 # Of the remaining parameters, extract the ones supported by
652 # this StorageClass (for components not all will be handled)
653 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
655 # The ref itself could be a component if the dataset was
656 # disassembled by butler, or we disassembled in datastore and
657 # components came from the datastore records
658 component = storedFileInfo.component if storedFileInfo.component else refComponent
660 fileGetInfo.append(
661 DatastoreFileGetInformation(
662 location,
663 formatter,
664 storedFileInfo,
665 assemblerParams,
666 formatterParams,
667 component,
668 readStorageClass,
669 )
670 )
672 return fileGetInfo
674 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
675 """Check the arguments for ``put`` and obtain formatter and
676 location.
678 Parameters
679 ----------
680 inMemoryDataset : `object`
681 The dataset to store.
682 ref : `DatasetRef`
683 Reference to the associated Dataset.
685 Returns
686 -------
687 location : `Location`
688 The location to write the dataset.
689 formatter : `Formatter`
690 The `Formatter` to use to write the dataset.
692 Raises
693 ------
694 TypeError
695 Supplied object and storage class are inconsistent.
696 DatasetTypeNotSupportedError
697 The associated `DatasetType` is not handled by this datastore.
698 """
699 self._validate_put_parameters(inMemoryDataset, ref)
700 return self._determine_put_formatter_location(ref)
702 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
703 """Calculate the formatter and output location to use for put.
705 Parameters
706 ----------
707 ref : `DatasetRef`
708 Reference to the associated Dataset.
710 Returns
711 -------
712 location : `Location`
713 The location to write the dataset.
714 formatter : `Formatter`
715 The `Formatter` to use to write the dataset.
716 """
717 # Work out output file name
718 try:
719 template = self.templates.getTemplate(ref)
720 except KeyError as e:
721 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
723 # Validate the template to protect against different dataIds
724 # producing the same filename and causing overwrite confusion.
725 template.validateTemplate(ref)
727 location = self.locationFactory.fromPath(template.format(ref))
729 # Get the formatter based on the storage class
730 storageClass = ref.datasetType.storageClass
731 try:
732 formatter = self.formatterFactory.getFormatter(
733 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
734 )
735 except KeyError as e:
736 raise DatasetTypeNotSupportedError(
737 f"Unable to find formatter for {ref} in datastore {self.name}"
738 ) from e
740 # Now that we know the formatter, update the location
741 location = formatter.makeUpdatedLocation(location)
743 return location, formatter
745 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
746 # Docstring inherited from base class
747 if transfer != "auto":
748 return transfer
750 # See if the paths are within the datastore or not
751 inside = [self._pathInStore(d.path) is not None for d in datasets]
753 if all(inside):
754 transfer = None
755 elif not any(inside):
756 # Allow ResourcePath to use its own knowledge
757 transfer = "auto"
758 else:
759 # This can happen when importing from a datastore that
760 # has had some datasets ingested using "direct" mode.
761 # Allow ResourcePath to sort it out but warn about it.
764 log.warning(
765 "Some datasets are inside the datastore and some are outside. Using 'split' "
766 "transfer mode. This assumes that the files outside the datastore are "
767 "still accessible to the new butler since they will not be copied into "
768 "the target datastore."
769 )
770 transfer = "split"
772 return transfer
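# Editorial sketch (not part of the original source): the transfer-mode
# decision above, expressed as a standalone helper. The function name is
# hypothetical.
#
#     def choose_transfer(inside: list[bool]) -> str | None:
#         if all(inside):
#             return None      # everything already inside the datastore
#         if not any(inside):
#             return "auto"    # let ResourcePath work out the best mode
#         return "split"       # mixture of internal and external paths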
774 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
775 """Return path relative to datastore root.
777 Parameters
778 ----------
779 path : `lsst.resources.ResourcePathExpression`
780 Path to dataset. Can be an absolute URI. If relative, it is
781 assumed to be relative to the datastore root.
784 Returns
785 -------
786 inStore : `str`
787 Path relative to datastore root. Returns `None` if the file is
788 outside the root.
789 """
790 # Relative path will always be relative to datastore
791 pathUri = ResourcePath(path, forceAbsolute=False)
792 return pathUri.relative_to(self.root)
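# Editorial sketch (not part of the original source): the relative_to()
# behaviour relied on above, assuming POSIX-style file paths.
#
#     from lsst.resources import ResourcePath
#
#     root = ResourcePath("/repo/main/", forceDirectory=True)
#     inside = ResourcePath("/repo/main/raw/file.fits")
#     outside = ResourcePath("/elsewhere/file.fits")
#     inside.relative_to(root)    # "raw/file.fits"
#     outside.relative_to(root)   # None -> treated as outside the datastore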
794 def _standardizeIngestPath(
795 self, path: str | ResourcePath, *, transfer: str | None = None
796 ) -> str | ResourcePath:
797 """Standardize the path of a to-be-ingested file.
799 Parameters
800 ----------
801 path : `str` or `lsst.resources.ResourcePath`
802 Path of a file to be ingested. This parameter is not expected
803 to support all the types that can be used to construct a
804 `~lsst.resources.ResourcePath`.
805 transfer : `str`, optional
806 How (and whether) the dataset should be added to the datastore.
807 See `ingest` for details of transfer modes.
808 This implementation is provided only so
809 `NotImplementedError` can be raised if the mode is not supported;
810 actual transfers are deferred to `_extractIngestInfo`.
812 Returns
813 -------
814 path : `str` or `lsst.resources.ResourcePath`
815 New path in what the datastore considers standard form. If an
816 absolute URI was given that will be returned unchanged.
818 Notes
819 -----
820 Subclasses of `FileDatastore` can implement this method instead
821 of `_prepIngest`. It should not modify the data repository or given
822 file in any way.
824 Raises
825 ------
826 NotImplementedError
827 Raised if the datastore does not support the given transfer mode
828 (including the case where ingest is not supported at all).
829 FileNotFoundError
830 Raised if one of the given files does not exist.
831 """
832 if transfer not in (None, "direct", "split") + self.root.transferModes:
833 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
835 # A relative URI indicates relative to datastore root
836 srcUri = ResourcePath(path, forceAbsolute=False)
837 if not srcUri.isabs():
838 srcUri = self.root.join(path)
840 if not srcUri.exists():
841 raise FileNotFoundError(
842 f"Resource at {srcUri} does not exist; note that paths to ingest "
843 f"are assumed to be relative to {self.root} unless they are absolute."
844 )
846 if transfer is None:
847 relpath = srcUri.relative_to(self.root)
848 if not relpath:
849 raise RuntimeError(
850 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
851 )
853 # Return the relative path within the datastore for internal
854 # transfer
855 path = relpath
857 return path
859 def _extractIngestInfo(
860 self,
861 path: ResourcePathExpression,
862 ref: DatasetRef,
863 *,
864 formatter: Formatter | type[Formatter],
865 transfer: str | None = None,
866 record_validation_info: bool = True,
867 ) -> StoredFileInfo:
868 """Relocate (if necessary) and extract `StoredFileInfo` from a
869 to-be-ingested file.
871 Parameters
872 ----------
873 path : `lsst.resources.ResourcePathExpression`
874 URI or path of a file to be ingested.
875 ref : `DatasetRef`
876 Reference for the dataset being ingested. Guaranteed to have
877 ``dataset_id`` not `None`.
878 formatter : `type` or `Formatter`
879 `Formatter` subclass to use for this dataset or an instance.
880 transfer : `str`, optional
881 How (and whether) the dataset should be added to the datastore.
882 See `ingest` for details of transfer modes.
883 record_validation_info : `bool`, optional
884 If `True`, the default, the datastore can record validation
885 information associated with the file. If `False` the datastore
886 will not attempt to track any information such as checksums
887 or file sizes. This can be useful if such information is tracked
888 in an external system or if the file is to be compressed in place.
889 It is up to the datastore whether this parameter is relevant.
891 Returns
892 -------
893 info : `StoredFileInfo`
894 Internal datastore record for this file. This will be inserted by
895 the caller; `_extractIngestInfo` is only responsible for
896 creating and populating the struct.
898 Raises
899 ------
900 FileNotFoundError
901 Raised if one of the given files does not exist.
902 FileExistsError
903 Raised if transfer is not `None` but the (internal) location the
904 file would be moved to is already occupied.
905 """
906 if self._transaction is None:
907 raise RuntimeError("Ingest called without transaction enabled")
909 # Create URI of the source path, do not need to force a relative
910 # path to absolute.
911 srcUri = ResourcePath(path, forceAbsolute=False)
913 # Track whether we have read the size of the source yet
914 have_sized = False
916 tgtLocation: Location | None
917 if transfer is None or transfer == "split":
918 # A relative path is assumed to be relative to the datastore
919 # in this context
920 if not srcUri.isabs():
921 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
922 else:
923 # Work out the path in the datastore from an absolute URI
924 # This is required to be within the datastore.
925 pathInStore = srcUri.relative_to(self.root)
926 if pathInStore is None and transfer is None:
927 raise RuntimeError(
928 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
929 )
930 if pathInStore:
931 tgtLocation = self.locationFactory.fromPath(pathInStore)
932 elif transfer == "split":
933 # Outside the datastore but treat that as a direct ingest
934 # instead.
935 tgtLocation = None
936 else:
937 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
938 elif transfer == "direct":
939 # Want to store the full URI to the resource directly in
940 # datastore. This is useful for referring to permanent archive
941 # storage for raw data.
942 # Trust that people know what they are doing.
943 tgtLocation = None
944 else:
945 # Work out the name we want this ingested file to have
946 # inside the datastore
947 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
948 if not tgtLocation.uri.dirname().exists():
949 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
950 tgtLocation.uri.dirname().mkdir()
952 # if we are transferring from a local file to a remote location
953 # it may be more efficient to get the size and checksum of the
954 # local file rather than the transferred one
955 if record_validation_info and srcUri.isLocal:
956 size = srcUri.size()
957 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
958 have_sized = True
960 # Transfer the resource to the destination.
961 # Allow overwrite of an existing file. This matches the behavior
962 # of datastore.put() in that it trusts that registry would not
963 # be asking to overwrite unless registry thought that the
964 # overwrite was allowed.
965 tgtLocation.uri.transfer_from(
966 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
967 )
969 if tgtLocation is None:
970 # This means we are using direct mode
971 targetUri = srcUri
972 targetPath = str(srcUri)
973 else:
974 targetUri = tgtLocation.uri
975 targetPath = tgtLocation.pathInStore.path
977 # the file should exist in the datastore now
978 if record_validation_info:
979 if not have_sized:
980 size = targetUri.size()
981 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
982 else:
983 # Not recording any file information.
984 size = -1
985 checksum = None
987 return StoredFileInfo(
988 formatter=formatter,
989 path=targetPath,
990 storageClass=ref.datasetType.storageClass,
991 component=ref.datasetType.component(),
992 file_size=size,
993 checksum=checksum,
994 dataset_id=ref.id,
995 )
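# Editorial sketch (not part of the original source): a stdlib-only
# illustration of the kind of file checksum recorded above. This is a
# hypothetical helper, not the module's computeChecksum() implementation.
#
#     import hashlib
#
#     def example_checksum(path: str, algorithm: str = "blake2b") -> str:
#         hasher = hashlib.new(algorithm)
#         with open(path, "rb") as fh:
#             for chunk in iter(lambda: fh.read(8192), b""):
#                 hasher.update(chunk)
#         return hasher.hexdigest()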
997 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
998 # Docstring inherited from Datastore._prepIngest.
999 filtered = []
1000 for dataset in datasets:
1001 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1002 if not acceptable:
1003 continue
1004 else:
1005 dataset.refs = acceptable
1006 if dataset.formatter is None:
1007 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1008 else:
1009 assert isinstance(dataset.formatter, type | str)
1010 formatter_class = get_class_of(dataset.formatter)
1011 if not issubclass(formatter_class, Formatter):
1012 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1013 dataset.formatter = formatter_class
1014 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1015 filtered.append(dataset)
1016 return _IngestPrepData(filtered)
1018 @transactional
1019 def _finishIngest(
1020 self,
1021 prepData: Datastore.IngestPrepData,
1022 *,
1023 transfer: str | None = None,
1024 record_validation_info: bool = True,
1025 ) -> None:
1026 # Docstring inherited from Datastore._finishIngest.
1027 refsAndInfos = []
1028 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1029 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1030 # Do ingest as if the first dataset ref is associated with the file
1031 info = self._extractIngestInfo(
1032 dataset.path,
1033 dataset.refs[0],
1034 formatter=dataset.formatter,
1035 transfer=transfer,
1036 record_validation_info=record_validation_info,
1037 )
1038 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1039 self._register_datasets(refsAndInfos)
1041 def _calculate_ingested_datastore_name(
1042 self,
1043 srcUri: ResourcePath,
1044 ref: DatasetRef,
1045 formatter: Formatter | type[Formatter] | None = None,
1046 ) -> Location:
1047 """Given a source URI and a DatasetRef, determine the name the
1048 dataset will have inside datastore.
1050 Parameters
1051 ----------
1052 srcUri : `lsst.resources.ResourcePath`
1053 URI to the source dataset file.
1054 ref : `DatasetRef`
1055 Ref associated with the newly-ingested dataset artifact. This
1056 is used to determine the name within the datastore.
1057 formatter : `Formatter` or `Formatter` subclass, optional
1058 Formatter to use for validation. Can be a class or an instance.
1059 No validation of the file extension is performed if the
1060 ``formatter`` is `None`. This can be used if the caller knows
1061 that the source URI and target URI will use the same formatter.
1063 Returns
1064 -------
1065 location : `Location`
1066 Target location for the newly-ingested dataset.
1067 """
1068 # Ingesting a file from outside the datastore.
1069 # This involves a new name.
1070 template = self.templates.getTemplate(ref)
1071 location = self.locationFactory.fromPath(template.format(ref))
1073 # Get the extension
1074 ext = srcUri.getExtension()
1076 # Update the destination to include that extension
1077 location.updateExtension(ext)
1079 # Ask the formatter to validate this extension
1080 if formatter is not None:
1081 formatter.validateExtension(location)
1083 return location
1085 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1086 """Write out in memory dataset to datastore.
1088 Parameters
1089 ----------
1090 inMemoryDataset : `object`
1091 Dataset to write to datastore.
1092 ref : `DatasetRef`
1093 Registry information associated with this dataset.
1095 Returns
1096 -------
1097 info : `StoredFileInfo`
1098 Information describing the artifact written to the datastore.
1099 """
1100 # May need to coerce the in memory dataset to the correct
1101 # python type, but first we need to make sure the storage class
1102 # reflects the one defined in the data repository.
1103 ref = self._cast_storage_class(ref)
1104 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1106 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1107 uri = location.uri
1109 if not uri.dirname().exists():
1110 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1111 uri.dirname().mkdir()
1113 if self._transaction is None:
1114 raise RuntimeError("Attempting to write artifact without transaction enabled")
1116 def _removeFileExists(uri: ResourcePath) -> None:
1117 """Remove a file and do not complain if it is not there.
1119 This is important since a formatter might fail before the file
1120 is written and we should not confuse people by writing spurious
1121 error messages to the log.
1122 """
1123 with contextlib.suppress(FileNotFoundError):
1124 uri.remove()
1126 # Register a callback to try to delete the uploaded data if
1127 # something fails below
1128 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1130 data_written = False
1132 # For remote URIs some datasets can be serialized directly
1133 # to bytes and sent to the remote datastore without writing a
1134 # file. If the dataset is intended to be saved to the cache
1135 # a file is always written and direct write to the remote
1136 # datastore is bypassed.
1137 if not uri.isLocal and not self.cacheManager.should_be_cached(ref):
1138 # Remote URI that is not cached so can write directly.
1139 try:
1140 serializedDataset = formatter.toBytes(inMemoryDataset)
1141 except NotImplementedError:
1142 # Fallback to the file writing option.
1143 pass
1144 except Exception as e:
1145 raise RuntimeError(
1146 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1147 ) from e
1148 else:
1149 log.debug("Writing bytes directly to %s", uri)
1150 uri.write(serializedDataset, overwrite=True)
1151 log.debug("Successfully wrote bytes directly to %s", uri)
1152 data_written = True
1154 if not data_written:
1155 # Did not write the bytes directly to object store so instead
1156 # write to temporary file. Always write to a temporary even if
1157 # using a local file system -- that gives us atomic writes.
1158 # If a process is killed as the file is being written we do not
1159 # want it to remain in the correct place but in a corrupt state.
1160 # For local files write to the output directory, not a temporary dir.
1161 prefix = uri.dirname() if uri.isLocal else None
1162 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1163 # Need to configure the formatter to write to a different
1164 # location and that needs us to overwrite internals
1165 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1166 with formatter._updateLocation(Location(None, temporary_uri)):
1167 try:
1168 formatter.write(inMemoryDataset)
1169 except Exception as e:
1170 raise RuntimeError(
1171 f"Failed to serialize dataset {ref} of type"
1172 f" {type(inMemoryDataset)} to "
1173 f"temporary location {temporary_uri}"
1174 ) from e
1176 # Use move for a local file since that becomes an efficient
1177 # os.rename. For remote resources we use copy to allow the
1178 # file to be cached afterwards.
1179 transfer = "move" if uri.isLocal else "copy"
1181 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1183 if transfer == "copy":
1184 # Cache if required
1185 self.cacheManager.move_to_cache(temporary_uri, ref)
1187 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1189 # URI is needed to resolve which ingest case we are dealing with
1190 return self._extractIngestInfo(uri, ref, formatter=formatter)
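# Editorial sketch (not part of the original source): a stdlib-only analogue of
# the write-to-temporary-then-move strategy used above for atomic local writes.
# Names are hypothetical.
#
#     import os
#     import tempfile
#
#     def atomic_write(path: str, data: bytes) -> None:
#         fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path) or ".")
#         try:
#             with os.fdopen(fd, "wb") as fh:
#                 fh.write(data)
#             os.replace(tmp, path)  # atomic rename on the same filesystem
#         except BaseException:
#             os.unlink(tmp)
#             raise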
1192 def _read_artifact_into_memory(
1193 self,
1194 getInfo: DatastoreFileGetInformation,
1195 ref: DatasetRef,
1196 isComponent: bool = False,
1197 cache_ref: DatasetRef | None = None,
1198 ) -> Any:
1199 """Read the artifact from datastore into in memory object.
1201 Parameters
1202 ----------
1203 getInfo : `DatastoreFileGetInformation`
1204 Information about the artifact within the datastore.
1205 ref : `DatasetRef`
1206 The registry information associated with this artifact.
1207 isComponent : `bool`
1208 Flag to indicate if a component is being read from this artifact.
1209 cache_ref : `DatasetRef`, optional
1210 The DatasetRef to use when looking up the file in the cache.
1211 This ref must have the same ID as the supplied ref but can
1212 be a parent ref or component ref to indicate to the cache whether
1213 a composite file is being requested from the cache or a component
1214 file. Without this the cache will default to the supplied ref but
1215 it can get confused with read-only derived components for
1216 disassembled composites.
1218 Returns
1219 -------
1220 inMemoryDataset : `object`
1221 The artifact as a python object.
1222 """
1223 location = getInfo.location
1224 uri = location.uri
1225 log.debug("Accessing data from %s", uri)
1227 if cache_ref is None:
1228 cache_ref = ref
1229 if cache_ref.id != ref.id:
1230 raise ValueError(
1231 "The supplied cache dataset ref refers to a different dataset than expected:"
1232 f" {ref.id} != {cache_ref.id}"
1233 )
1235 # Cannot recalculate checksum but can compare size as a quick check
1236 # Do not do this if the size is negative since that indicates
1237 # we do not know.
1238 recorded_size = getInfo.info.file_size
1239 resource_size = uri.size()
1240 if recorded_size >= 0 and resource_size != recorded_size:
1241 raise RuntimeError(
1242 "Integrity failure in Datastore. "
1243 f"Size of file {uri} ({resource_size}) "
1244 f"does not match size recorded in registry of {recorded_size}"
1245 )
1247 # For the general case we have choices for how to proceed.
1248 # 1. Always use a local file (downloading the remote resource to a
1249 # temporary file if needed).
1250 # 2. Use a threshold size and read into memory and use bytes.
1251 # Use both for now with an arbitrary hand off size.
1252 # This allows small datasets to be downloaded from remote object
1253 # stores without requiring a temporary file.
1255 formatter = getInfo.formatter
1256 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1257 if resource_size <= nbytes_max and formatter.can_read_bytes():
1258 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1259 if cached_file is not None:
1260 desired_uri = cached_file
1261 msg = f" (cached version of {uri})"
1262 else:
1263 desired_uri = uri
1264 msg = ""
1265 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1266 serializedDataset = desired_uri.read()
1267 log.debug(
1268 "Deserializing %s from %d bytes from location %s with formatter %s",
1269 f"component {getInfo.component}" if isComponent else "",
1270 len(serializedDataset),
1271 uri,
1272 formatter.name(),
1273 )
1274 try:
1275 result = formatter.fromBytes(
1276 serializedDataset, component=getInfo.component if isComponent else None
1277 )
1278 except Exception as e:
1279 raise ValueError(
1280 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1281 f" ({ref.datasetType.name} from {uri}): {e}"
1282 ) from e
1283 else:
1284 # Read from file.
1286 # Have to update the Location associated with the formatter
1287 # because formatter.read does not allow an override.
1288 # This could be improved.
1289 location_updated = False
1290 msg = ""
1292 # First check in cache for local version.
1293 # The cache will only be relevant for remote resources but
1294 # no harm in always asking. Context manager ensures that cache
1295 # file is not deleted during cache expiration.
1296 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1297 if cached_file is not None:
1298 msg = f"(via cache read of remote file {uri})"
1299 uri = cached_file
1300 location_updated = True
1302 with uri.as_local() as local_uri:
1303 can_be_cached = False
1304 if uri != local_uri:
1305 # URI was remote and file was downloaded
1306 cache_msg = ""
1307 location_updated = True
1309 if self.cacheManager.should_be_cached(cache_ref):
1310 # In this scenario we want to ask if the downloaded
1311 # file should be cached but we should not cache
1312 # it until after we've used it (to ensure it can't
1313 # be expired whilst we are using it).
1314 can_be_cached = True
1316 # Say that it is "likely" to be cached because
1317 # if the formatter read fails we will not be
1318 # caching this file.
1319 cache_msg = " and likely cached"
1321 msg = f"(via download to local file{cache_msg})"
1323 # Calculate the (possibly) new location for the formatter
1324 # to use.
1325 newLocation = Location(*local_uri.split()) if location_updated else None
1327 log.debug(
1328 "Reading%s from location %s %s with formatter %s",
1329 f" component {getInfo.component}" if isComponent else "",
1330 uri,
1331 msg,
1332 formatter.name(),
1333 )
1334 try:
1335 with (
1336 formatter._updateLocation(newLocation),
1337 time_this(
1338 log,
1339 msg="Reading%s from location %s %s with formatter %s",
1340 args=(
1341 f" component {getInfo.component}" if isComponent else "",
1342 uri,
1343 msg,
1344 formatter.name(),
1345 ),
1346 ),
1347 ):
1348 result = formatter.read(component=getInfo.component if isComponent else None)
1349 except Exception as e:
1350 raise ValueError(
1351 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1352 f" ({ref.datasetType.name} from {uri}): {e}"
1353 ) from e
1355 # File was read successfully so can move to cache
1356 if can_be_cached:
1357 self.cacheManager.move_to_cache(local_uri, cache_ref)
1359 return self._post_process_get(
1360 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
1361 )
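# Editorial sketch (not part of the original source): the quick integrity check
# performed above, as a standalone (hypothetical) helper.
#
#     def check_recorded_size(recorded_size: int, resource_size: int) -> None:
#         # A negative recorded size means the size was never recorded.
#         if recorded_size >= 0 and resource_size != recorded_size:
#             raise RuntimeError(
#                 f"Integrity failure: file size {resource_size} does not match "
#                 f"recorded size {recorded_size}"
#             )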
1363 def knows(self, ref: DatasetRef) -> bool:
1364 """Check if the dataset is known to the datastore.
1366 Does not check for existence of any artifact.
1368 Parameters
1369 ----------
1370 ref : `DatasetRef`
1371 Reference to the required dataset.
1373 Returns
1374 -------
1375 exists : `bool`
1376 `True` if the dataset is known to the datastore.
1377 """
1378 fileLocations = self._get_dataset_locations_info(ref)
1379 if fileLocations:
1380 return True
1381 return False
1383 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1384 # Docstring inherited from the base class.
1386 # The records themselves. Could be missing some entries.
1387 records = self._get_stored_records_associated_with_refs(refs)
1389 return {ref: ref.id in records for ref in refs}
1391 def _process_mexists_records(
1392 self,
1393 id_to_ref: dict[DatasetId, DatasetRef],
1394 records: dict[DatasetId, list[StoredFileInfo]],
1395 all_required: bool,
1396 artifact_existence: dict[ResourcePath, bool] | None = None,
1397 ) -> dict[DatasetRef, bool]:
1398 """Check given records for existence.
1400 Helper function for `mexists()`.
1402 Parameters
1403 ----------
1404 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1405 Mapping of the dataset ID to the dataset ref itself.
1406 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1407 Records as generally returned by
1408 ``_get_stored_records_associated_with_refs``.
1409 all_required : `bool`
1410 If `True`, a dataset only counts as existing when all artifacts
1411 associated with its dataset ID exist; if `False`, any one suffices.
1412 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1413 Optional mapping of datastore artifact to existence. Updated by
1414 this method with details of all artifacts tested. Can be `None`
1415 if the caller is not interested.
1417 Returns
1418 -------
1419 existence : `dict` of [`DatasetRef`, `bool`]
1420 Mapping from dataset to boolean indicating existence.
1421 """
1422 # The URIs to be checked and a mapping of those URIs to
1423 # the dataset ID.
1424 uris_to_check: list[ResourcePath] = []
1425 location_map: dict[ResourcePath, DatasetId] = {}
1427 location_factory = self.locationFactory
1429 uri_existence: dict[ResourcePath, bool] = {}
1430 for ref_id, infos in records.items():
1431 # Key is the dataset ID, value is a list of StoredFileInfo.
1432 uris = [info.file_location(location_factory).uri for info in infos]
1433 location_map.update({uri: ref_id for uri in uris})
1435 # Check the local cache directly for a dataset corresponding
1436 # to the remote URI.
1437 if self.cacheManager.file_count > 0:
1438 ref = id_to_ref[ref_id]
1439 for uri, storedFileInfo in zip(uris, infos, strict=True):
1440 check_ref = ref
1441 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1442 check_ref = ref.makeComponentRef(component)
1443 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1444 # Proxy for URI existence.
1445 uri_existence[uri] = True
1446 else:
1447 uris_to_check.append(uri)
1448 else:
1449 # Check all of them.
1450 uris_to_check.extend(uris)
1452 if artifact_existence is not None:
1453 # If a URI has already been checked remove it from the list
1454 # and immediately add the status to the output dict.
1455 filtered_uris_to_check = []
1456 for uri in uris_to_check:
1457 if uri in artifact_existence:
1458 uri_existence[uri] = artifact_existence[uri]
1459 else:
1460 filtered_uris_to_check.append(uri)
1461 uris_to_check = filtered_uris_to_check
1463 # Results.
1464 dataset_existence: dict[DatasetRef, bool] = {}
1466 uri_existence.update(ResourcePath.mexists(uris_to_check))
1467 for uri, exists in uri_existence.items():
1468 dataset_id = location_map[uri]
1469 ref = id_to_ref[dataset_id]
1471 # Disassembled composite needs to check all locations.
1472 # all_required indicates whether all need to exist or not.
1473 if ref in dataset_existence:
1474 if all_required:
1475 exists = dataset_existence[ref] and exists
1476 else:
1477 exists = dataset_existence[ref] or exists
1478 dataset_existence[ref] = exists
1480 if artifact_existence is not None:
1481 artifact_existence.update(uri_existence)
1483 return dataset_existence
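# Editorial sketch (not part of the original source): combining per-artifact
# existence flags into per-dataset existence, mirroring the all_required logic
# above. Types and names are hypothetical simplifications.
#
#     def combine_existence(
#         per_artifact: dict[str, bool],
#         artifact_to_dataset: dict[str, str],
#         all_required: bool,
#     ) -> dict[str, bool]:
#         result: dict[str, bool] = {}
#         for artifact, exists in per_artifact.items():
#             dataset = artifact_to_dataset[artifact]
#             if dataset in result:
#                 exists = (result[dataset] and exists) if all_required else (result[dataset] or exists)
#             result[dataset] = exists
#         return result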
1485 def mexists(
1486 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1487 ) -> dict[DatasetRef, bool]:
1488 """Check the existence of multiple datasets at once.
1490 Parameters
1491 ----------
1492 refs : iterable of `DatasetRef`
1493 The datasets to be checked.
1494 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1495 Optional mapping of datastore artifact to existence. Updated by
1496 this method with details of all artifacts tested. Can be `None`
1497 if the caller is not interested.
1499 Returns
1500 -------
1501 existence : `dict` of [`DatasetRef`, `bool`]
1502 Mapping from dataset to boolean indicating existence.
1504 Notes
1505 -----
1506 To minimize potentially costly remote existence checks, the local
1507 cache is checked as a proxy for existence. If a file for this
1508 `DatasetRef` does exist no check is done for the actual URI. This
1509 could result in possibly unexpected behavior if the dataset itself
1510 has been removed from the datastore by another process whilst it is
1511 still in the cache.
1512 """
1513 chunk_size = 10_000
1514 dataset_existence: dict[DatasetRef, bool] = {}
1515 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1516 n_found_total = 0
1517 n_checked = 0
1518 n_chunks = 0
1519 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1520 chunk_result = self._mexists(chunk, artifact_existence)
1522 # The log message level and content depend on how many
1523 # datasets we are processing.
1524 n_results = len(chunk_result)
1526 # Use verbose logging to ensure that messages can be seen
1527 # easily if many refs are being checked.
1528 log_threshold = VERBOSE
1529 n_checked += n_results
1531 # This sum can take some time so only do it if we know the
1532 # result is going to be used.
1533 n_found = 0
1534 if log.isEnabledFor(log_threshold):
1535 # Can treat the booleans as 0, 1 integers and sum them.
1536 n_found = sum(chunk_result.values())
1537 n_found_total += n_found
1539 # We are deliberately not trying to count the number of refs
1540 # provided in case it's in the millions. This means there is a
1541 # situation where the number of refs exactly matches the chunk
1542 # size and we will switch to the multi-chunk path even though
1543 # we only have a single chunk.
1544 if n_results < chunk_size and n_chunks == 0:
1545 # Single chunk will be processed so we can provide more detail.
1546 if n_results == 1:
1547 ref = list(chunk_result)[0]
1548 # Use debug logging to be consistent with `exists()`.
1549 log.debug(
1550 "Calling mexists() with single ref that does%s exist (%s).",
1551 "" if chunk_result[ref] else " not",
1552 ref,
1553 )
1554 else:
1555 # Single chunk but multiple files. Summarize.
1556 log.log(
1557 log_threshold,
1558 "Number of datasets found in datastore: %d out of %d datasets checked.",
1559 n_found,
1560 n_checked,
1561 )
1563 else:
1564 # Use incremental verbose logging when we have multiple chunks.
1565 log.log(
1566 log_threshold,
1567 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1568 "(running total from all chunks so far: %d found out of %d checked)",
1569 n_chunks,
1570 n_found,
1571 n_results,
1572 n_found_total,
1573 n_checked,
1574 )
1575 dataset_existence.update(chunk_result)
1576 n_chunks += 1
1578 return dataset_existence
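# Editorial sketch (not part of the original source): the chunking pattern used
# by mexists() above, with a hypothetical worker function standing in for
# _mexists().
#
#     from lsst.utils.iteration import chunk_iterable
#
#     def process_in_chunks(refs, worker, chunk_size: int = 10_000) -> dict:
#         existence: dict = {}
#         for chunk in chunk_iterable(refs, chunk_size=chunk_size):
#             existence.update(worker(chunk))
#         return existence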
1580 def _mexists(
1581 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1582 ) -> dict[DatasetRef, bool]:
1583 """Check the existence of multiple datasets at once.
1585 Parameters
1586 ----------
1587 refs : iterable of `DatasetRef`
1588 The datasets to be checked.
1589 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1590 Optional mapping of datastore artifact to existence. Updated by
1591 this method with details of all artifacts tested. Can be `None`
1592 if the caller is not interested.
1594 Returns
1595 -------
1596 existence : `dict` of [`DatasetRef`, `bool`]
1597 Mapping from dataset to boolean indicating existence.
1598 """
1599 # Make a mapping from refs with the internal storage class to the given
1600 # refs that may have a different one. We'll use the internal refs
1601 # throughout this method and convert back at the very end.
1602 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1604 # Need a mapping of dataset_id to (internal) dataset ref since some
1605 # internal APIs work with dataset_id.
1606 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1608 # Set of all IDs we are checking for.
1609 requested_ids = set(id_to_ref.keys())
1611 # The records themselves. Could be missing some entries.
1612 records = self._get_stored_records_associated_with_refs(id_to_ref.values())
1614 dataset_existence = self._process_mexists_records(
1615 id_to_ref, records, True, artifact_existence=artifact_existence
1616 )
1618 # Set of IDs that have been handled.
1619 handled_ids = {ref.id for ref in dataset_existence}
1621 missing_ids = requested_ids - handled_ids
1622 if missing_ids:
1623 dataset_existence.update(
1624 self._mexists_check_expected(
1625 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1626 )
1627 )
1629 return {
1630 internal_ref_to_input_ref[internal_ref]: existence
1631 for internal_ref, existence in dataset_existence.items()
1632 }
1634 def _mexists_check_expected(
1635 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1636 ) -> dict[DatasetRef, bool]:
1637 """Check existence of refs that are not known to datastore.
1639 Parameters
1640 ----------
1641 refs : iterable of `DatasetRef`
1642 The datasets to be checked. These are assumed not to be known
1643 to datastore.
1644 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1645 Optional mapping of datastore artifact to existence. Updated by
1646 this method with details of all artifacts tested. Can be `None`
1647 if the caller is not interested.
1649 Returns
1650 -------
1651         existence : `dict` [`DatasetRef`, `bool`]
1652 Mapping from dataset to boolean indicating existence.
1653 """
1654 dataset_existence: dict[DatasetRef, bool] = {}
1655 if not self.trustGetRequest:
1656 # Must assume these do not exist
1657 for ref in refs:
1658 dataset_existence[ref] = False
1659 else:
1660 log.debug(
1661 "%d datasets were not known to datastore during initial existence check.",
1662 len(refs),
1663 )
1665 # Construct data structure identical to that returned
1666 # by _get_stored_records_associated_with_refs() but using
1667 # guessed names.
1668 records = {}
1669 id_to_ref = {}
1670 for missing_ref in refs:
1671 expected = self._get_expected_dataset_locations_info(missing_ref)
1672 dataset_id = missing_ref.id
1673 records[dataset_id] = [info for _, info in expected]
1674 id_to_ref[dataset_id] = missing_ref
1676 dataset_existence.update(
1677 self._process_mexists_records(
1678 id_to_ref,
1679 records,
1680 False,
1681 artifact_existence=artifact_existence,
1682 )
1683 )
1685 return dataset_existence
1687 def exists(self, ref: DatasetRef) -> bool:
1688 """Check if the dataset exists in the datastore.
1690 Parameters
1691 ----------
1692 ref : `DatasetRef`
1693 Reference to the required dataset.
1695 Returns
1696 -------
1697 exists : `bool`
1698 `True` if the entity exists in the `Datastore`.
1700 Notes
1701 -----
1702 The local cache is checked as a proxy for existence in the remote
1703 object store. It is possible that another process on a different
1704 compute node could remove the file from the object store even
1705 though it is present in the local cache.
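        Examples
        --------
        Illustrative sketch only; ``datastore`` and ``ref`` are assumed to
        be a configured `FileDatastore` and a resolved `DatasetRef`::

            if datastore.exists(ref):
                in_memory = datastore.get(ref)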
1706 """
1707 ref = self._cast_storage_class(ref)
1708 fileLocations = self._get_dataset_locations_info(ref)
1710         # If we are being asked to trust that the registry might not be
1711         # correct, we ask for the expected locations and check them explicitly.
1712 if not fileLocations:
1713 if not self.trustGetRequest:
1714 return False
1716 # First check the cache. If it is not found we must check
1717 # the datastore itself. Assume that any component in the cache
1718 # means that the dataset does exist somewhere.
1719 if self.cacheManager.known_to_cache(ref):
1720 return True
1722 # When we are guessing a dataset location we can not check
1723 # for the existence of every component since we can not
1724 # know if every component was written. Instead we check
1725 # for the existence of any of the expected locations.
1726 for location, _ in self._get_expected_dataset_locations_info(ref):
1727 if self._artifact_exists(location):
1728 return True
1729 return False
1731 # All listed artifacts must exist.
1732 for location, storedFileInfo in fileLocations:
1733 # Checking in cache needs the component ref.
1734 check_ref = ref
1735 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1736 check_ref = ref.makeComponentRef(component)
1737 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1738 continue
1740 if not self._artifact_exists(location):
1741 return False
1743 return True
1745 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1746 """Return URIs associated with dataset.
1748 Parameters
1749 ----------
1750 ref : `DatasetRef`
1751 Reference to the required dataset.
1752 predict : `bool`, optional
1753 If the datastore does not know about the dataset, should it
1754 return a predicted URI or not?
1756 Returns
1757 -------
1758 uris : `DatasetRefURIs`
1759 The URI to the primary artifact associated with this dataset (if
1760 the dataset was disassembled within the datastore this may be
1761 `None`), and the URIs to any components associated with the dataset
1762             artifact (this mapping can be empty if there are no components).
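        Examples
        --------
        Hedged sketch of typical use; ``datastore`` and ``ref`` are
        assumptions and are not defined here::

            uris = datastore.getURIs(ref, predict=True)
            if uris.primaryURI is not None:
                print("Primary artifact:", uris.primaryURI)
            for component, uri in uris.componentURIs.items():
                print("Component", component, "at", uri)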
1763 """
1764 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1765 return many[ref]
1767 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1768 """URI to the Dataset.
1770 Parameters
1771 ----------
1772 ref : `DatasetRef`
1773 Reference to the required Dataset.
1774 predict : `bool`
1775 If `True`, allow URIs to be returned of datasets that have not
1776 been written.
1778 Returns
1779 -------
1780         uri : `lsst.resources.ResourcePath`
1781 URI pointing to the dataset within the datastore. If the
1782 dataset does not exist in the datastore, and if ``predict`` is
1783 `True`, the URI will be a prediction and will include a URI
1784 fragment "#predicted".
1785 If the datastore does not have entities that relate well
1786             to the concept of a URI, the returned URI will be
1787 descriptive. The returned URI is not guaranteed to be obtainable.
1789 Raises
1790 ------
1791 FileNotFoundError
1792 Raised if a URI has been requested for a dataset that does not
1793 exist and guessing is not allowed.
1794 RuntimeError
1795 Raised if a request is made for a single URI but multiple URIs
1796 are associated with this dataset.
1798 Notes
1799 -----
1800 When a predicted URI is requested an attempt will be made to form
1801 a reasonable URI based on file templates and the expected formatter.
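        Examples
        --------
        A minimal sketch; ``datastore`` and ``ref`` are assumed to exist::

            uri = datastore.getURI(ref, predict=True)
            if str(uri).endswith("#predicted"):
                print("Dataset has not been written yet:", uri)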
1802 """
1803 primary, components = self.getURIs(ref, predict)
1804 if primary is None or components:
1805 raise RuntimeError(
1806 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1807 )
1808 return primary
1810 def _predict_URIs(
1811 self,
1812 ref: DatasetRef,
1813 ) -> DatasetRefURIs:
1814 """Predict the URIs of a dataset ref.
1816 Parameters
1817 ----------
1818 ref : `DatasetRef`
1819 Reference to the required Dataset.
1821 Returns
1822 -------
1823         uris : `DatasetRefURIs`
1824 Primary and component URIs. URIs will contain a URI fragment
1825 "#predicted".
1826 """
1827 uris = DatasetRefURIs()
1829 if self.composites.shouldBeDisassembled(ref):
1830 for component, _ in ref.datasetType.storageClass.components.items():
1831 comp_ref = ref.makeComponentRef(component)
1832 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1834 # Add the "#predicted" URI fragment to indicate this is a
1835 # guess
1836 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1838 else:
1839 location, _ = self._determine_put_formatter_location(ref)
1841 # Add the "#predicted" URI fragment to indicate this is a guess
1842 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1844 return uris
1846 def getManyURIs(
1847 self,
1848 refs: Iterable[DatasetRef],
1849 predict: bool = False,
1850 allow_missing: bool = False,
1851 ) -> dict[DatasetRef, DatasetRefURIs]:
1852 # Docstring inherited
1854 uris: dict[DatasetRef, DatasetRefURIs] = {}
1856 records = self._get_stored_records_associated_with_refs(refs)
1857 records_keys = records.keys()
1859 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1860 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1862 # Have to handle trustGetRequest mode by checking for the existence
1863 # of the missing refs on disk.
1864 if missing_refs:
1865 dataset_existence = self._mexists_check_expected(missing_refs, None)
1866 really_missing = set()
1867 not_missing = set()
1868 for ref, exists in dataset_existence.items():
1869 if exists:
1870 not_missing.add(ref)
1871 else:
1872 really_missing.add(ref)
1874 if not_missing:
1875 # Need to recalculate the missing/existing split.
1876 existing_refs = existing_refs + tuple(not_missing)
1877 missing_refs = tuple(really_missing)
1879 for ref in missing_refs:
1880 # if this has never been written then we have to guess
1881 if not predict:
1882 if not allow_missing:
1883 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1884 else:
1885 uris[ref] = self._predict_URIs(ref)
1887 for ref in existing_refs:
1888 file_infos = records[ref.id]
1889 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1890 uris[ref] = self._locations_to_URI(ref, file_locations)
1892 return uris
1894 def _locations_to_URI(
1895 self,
1896 ref: DatasetRef,
1897 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1898 ) -> DatasetRefURIs:
1899 """Convert one or more file locations associated with a DatasetRef
1900 to a DatasetRefURIs.
1902 Parameters
1903 ----------
1904 ref : `DatasetRef`
1905 Reference to the dataset.
1906         file_locations : `~collections.abc.Sequence` [`tuple` [`Location`, `StoredFileInfo`]]
1907 Each item in the sequence is the location of the dataset within the
1908 datastore and stored information about the file and its formatter.
1909 If there is only one item in the sequence then it is treated as the
1910 primary URI. If there is more than one item then they are treated
1911 as component URIs. If there are no items then an error is raised
1912 unless ``self.trustGetRequest`` is `True`.
1914 Returns
1915 -------
1916         uris : `DatasetRefURIs`
1917 Represents the primary URI or component URIs described by the
1918 inputs.
1920 Raises
1921 ------
1922 RuntimeError
1923 If no file locations are passed in and ``self.trustGetRequest`` is
1924 `False`.
1925 FileNotFoundError
1926             If a passed-in URI does not exist, and ``self.trustGetRequest``
1927 is `False`.
1928 RuntimeError
1929             If a passed-in `StoredFileInfo`'s ``component`` is `None` (this is
1930 unexpected).
1931 """
1932 guessing = False
1933 uris = DatasetRefURIs()
1935 if not file_locations:
1936 if not self.trustGetRequest:
1937 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1938 file_locations = self._get_expected_dataset_locations_info(ref)
1939 guessing = True
1941 if len(file_locations) == 1:
1942 # No disassembly so this is the primary URI
1943 uris.primaryURI = file_locations[0][0].uri
1944 if guessing and not uris.primaryURI.exists():
1945 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1946 else:
1947 for location, file_info in file_locations:
1948 if file_info.component is None:
1949 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1950 if guessing and not location.uri.exists():
1951 # If we are trusting then it is entirely possible for
1952 # some components to be missing. In that case we skip
1953 # to the next component.
1954 if self.trustGetRequest:
1955 continue
1956 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1957 uris.componentURIs[file_info.component] = location.uri
1959 return uris
1961 def retrieveArtifacts(
1962 self,
1963 refs: Iterable[DatasetRef],
1964 destination: ResourcePath,
1965 transfer: str = "auto",
1966 preserve_path: bool = True,
1967 overwrite: bool = False,
1968 ) -> list[ResourcePath]:
1969 """Retrieve the file artifacts associated with the supplied refs.
1971 Parameters
1972 ----------
1973 refs : iterable of `DatasetRef`
1974 The datasets for which file artifacts are to be retrieved.
1975 A single ref can result in multiple files. The refs must
1976 be resolved.
1977 destination : `lsst.resources.ResourcePath`
1978 Location to write the file artifacts.
1979 transfer : `str`, optional
1980 Method to use to transfer the artifacts. Must be one of the options
1981 supported by `lsst.resources.ResourcePath.transfer_from()`.
1982 "move" is not allowed.
1983 preserve_path : `bool`, optional
1984 If `True` the full path of the file artifact within the datastore
1985 is preserved. If `False` the final file component of the path
1986 is used.
1987 overwrite : `bool`, optional
1988 If `True` allow transfers to overwrite existing files at the
1989 destination.
1991 Returns
1992 -------
1993 targets : `list` of `lsst.resources.ResourcePath`
1994 URIs of file artifacts in destination location. Order is not
1995 preserved.
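        Examples
        --------
        Illustrative sketch; ``datastore`` and ``refs`` are assumed to be
        defined and the destination directory is arbitrary::

            from lsst.resources import ResourcePath

            destination = ResourcePath("/tmp/artifact-export/", forceDirectory=True)
            copied = datastore.retrieveArtifacts(refs, destination, transfer="copy")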
1996 """
1997 if not destination.isdir():
1998 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
2000 if transfer == "move":
2001 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
2003 # Source -> Destination
2004 # This also helps filter out duplicate DatasetRef in the request
2005 # that will map to the same underlying file transfer.
2006 to_transfer: dict[ResourcePath, ResourcePath] = {}
2008 for ref in refs:
2009 locations = self._get_dataset_locations_info(ref)
2010 for location, _ in locations:
2011 source_uri = location.uri
2012 target_path: ResourcePathExpression
2013 if preserve_path:
2014 target_path = location.pathInStore
2015 if target_path.isabs():
2016 # This is an absolute path to an external file.
2017 # Use the full path.
2018 target_path = target_path.relativeToPathRoot
2019 else:
2020 target_path = source_uri.basename()
2021 target_uri = destination.join(target_path)
2022 to_transfer[source_uri] = target_uri
2024 # In theory can now parallelize the transfer
2025 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
2026 for source_uri, target_uri in to_transfer.items():
2027 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
2029 return list(to_transfer.values())
2031 def get(
2032 self,
2033 ref: DatasetRef,
2034 parameters: Mapping[str, Any] | None = None,
2035 storageClass: StorageClass | str | None = None,
2036 ) -> Any:
2037 """Load an InMemoryDataset from the store.
2039 Parameters
2040 ----------
2041 ref : `DatasetRef`
2042 Reference to the required Dataset.
2043 parameters : `dict`
2044 `StorageClass`-specific parameters that specify, for example,
2045 a slice of the dataset to be loaded.
2046 storageClass : `StorageClass` or `str`, optional
2047 The storage class to be used to override the Python type
2048 returned by this method. By default the returned type matches
2049 the dataset type definition for this dataset. Specifying a
2050 read `StorageClass` can force a different type to be returned.
2051 This type must be compatible with the original type.
2053 Returns
2054 -------
2055 inMemoryDataset : `object`
2056 Requested dataset or slice thereof as an InMemoryDataset.
2058 Raises
2059 ------
2060 FileNotFoundError
2061 Requested dataset can not be retrieved.
2062 TypeError
2063 Return value from formatter has unexpected type.
2064 ValueError
2065 Formatter failed to process the dataset.
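        Examples
        --------
        Minimal sketch; ``datastore``, ``ref``, and the storage class name
        used for the override are assumptions::

            # Read the full dataset with the Python type implied by its
            # dataset type definition.
            dataset = datastore.get(ref)

            # Force conversion to a different but compatible Python type
            # ("SomeCompatibleStorageClass" is a placeholder name).
            converted = datastore.get(ref, storageClass="SomeCompatibleStorageClass")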
2066 """
2067 # Supplied storage class for the component being read is either
2068         # from the ref itself or from an override if we want to force
2069 # type conversion.
2070 if storageClass is not None:
2071 ref = ref.overrideStorageClass(storageClass)
2072 refStorageClass = ref.datasetType.storageClass
2074 allGetInfo = self._prepare_for_get(ref, parameters)
2075 refComponent = ref.datasetType.component()
2077 # Create mapping from component name to related info
2078 allComponents = {i.component: i for i in allGetInfo}
2080 # By definition the dataset is disassembled if we have more
2081 # than one record for it.
2082 isDisassembled = len(allGetInfo) > 1
2084 # Look for the special case where we are disassembled but the
2085 # component is a derived component that was not written during
2086 # disassembly. For this scenario we need to check that the
2087 # component requested is listed as a derived component for the
2088 # composite storage class
2089 isDisassembledReadOnlyComponent = False
2090 if isDisassembled and refComponent:
2091 # The composite storage class should be accessible through
2092 # the component dataset type
2093 compositeStorageClass = ref.datasetType.parentStorageClass
2095 # In the unlikely scenario where the composite storage
2096 # class is not known, we can only assume that this is a
2097 # normal component. If that assumption is wrong then the
2098 # branch below that reads a persisted component will fail
2099 # so there is no need to complain here.
2100 if compositeStorageClass is not None:
2101 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2103 if isDisassembled and not refComponent:
2104 # This was a disassembled dataset spread over multiple files
2105 # and we need to put them all back together again.
2106 # Read into memory and then assemble
2108 # Check that the supplied parameters are suitable for the type read
2109 refStorageClass.validateParameters(parameters)
2111 # We want to keep track of all the parameters that were not used
2112 # by formatters. We assume that if any of the component formatters
2113 # use a parameter that we do not need to apply it again in the
2114 # assembler.
2115 usedParams = set()
2117 components: dict[str, Any] = {}
2118 for getInfo in allGetInfo:
2119 # assemblerParams are parameters not understood by the
2120 # associated formatter.
2121 usedParams.update(set(getInfo.formatterParams))
2123 component = getInfo.component
2125 if component is None:
2126 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2128 # We do not want the formatter to think it's reading
2129 # a component though because it is really reading a
2130 # standalone dataset -- always tell reader it is not a
2131 # component.
2132 components[component] = self._read_artifact_into_memory(
2133 getInfo, ref.makeComponentRef(component), isComponent=False
2134 )
2136 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2138 # Any unused parameters will have to be passed to the assembler
2139 if parameters:
2140 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2141 else:
2142 unusedParams = {}
2144 # Process parameters
2145 return ref.datasetType.storageClass.delegate().handleParameters(
2146 inMemoryDataset, parameters=unusedParams
2147 )
2149 elif isDisassembledReadOnlyComponent:
2150 compositeStorageClass = ref.datasetType.parentStorageClass
2151 if compositeStorageClass is None:
2152 raise RuntimeError(
2153                     f"Unable to retrieve derived component '{refComponent}' since "
2154 "no composite storage class is available."
2155 )
2157 if refComponent is None:
2158 # Mainly for mypy
2159 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2161 # Assume that every derived component can be calculated by
2162 # forwarding the request to a single read/write component.
2163 # Rather than guessing which rw component is the right one by
2164 # scanning each for a derived component of the same name,
2165 # we ask the storage class delegate directly which one is best to
2166 # use.
2167 compositeDelegate = compositeStorageClass.delegate()
2168 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2169 refComponent, set(allComponents)
2170 )
2172 # Select the relevant component
2173 rwInfo = allComponents[forwardedComponent]
2175 # For now assume that read parameters are validated against
2176 # the real component and not the requested component
2177 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2178 forwardedStorageClass.validateParameters(parameters)
2180 # The reference to use for the caching must refer to the forwarded
2181 # component and not the derived component.
2182 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2184 # Unfortunately the FileDescriptor inside the formatter will have
2185 # the wrong write storage class so we need to create a new one
2186 # given the immutability constraint.
2187 writeStorageClass = rwInfo.info.storageClass
2189 # We may need to put some thought into parameters for read
2190 # components but for now forward them on as is
2191 readFormatter = type(rwInfo.formatter)(
2192 FileDescriptor(
2193 rwInfo.location,
2194 readStorageClass=refStorageClass,
2195 storageClass=writeStorageClass,
2196 parameters=parameters,
2197 ),
2198 ref.dataId,
2199 )
2201 # The assembler can not receive any parameter requests for a
2202 # derived component at this time since the assembler will
2203 # see the storage class of the derived component and those
2204 # parameters will have to be handled by the formatter on the
2205 # forwarded storage class.
2206 assemblerParams: dict[str, Any] = {}
2208             # Need to create a new info that specifies the derived
2209 # component and associated storage class
2210 readInfo = DatastoreFileGetInformation(
2211 rwInfo.location,
2212 readFormatter,
2213 rwInfo.info,
2214 assemblerParams,
2215 {},
2216 refComponent,
2217 refStorageClass,
2218 )
2220 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2222 else:
2223 # Single file request or component from that composite file
2224 for lookup in (refComponent, None):
2225 if lookup in allComponents:
2226 getInfo = allComponents[lookup]
2227 break
2228 else:
2229 raise FileNotFoundError(
2230 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2231 )
2233 # Do not need the component itself if already disassembled
2234 if isDisassembled:
2235 isComponent = False
2236 else:
2237 isComponent = getInfo.component is not None
2239 # For a component read of a composite we want the cache to
2240 # be looking at the composite ref itself.
2241 cache_ref = ref.makeCompositeRef() if isComponent else ref
2243             # For a disassembled component we can validate parameters against
2244 # the component storage class directly
2245 if isDisassembled:
2246 refStorageClass.validateParameters(parameters)
2247 else:
2248 # For an assembled composite this could be a derived
2249 # component derived from a real component. The validity
2250 # of the parameters is not clear. For now validate against
2251 # the composite storage class
2252 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2254 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
2256 @transactional
2257 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2258         """Write an InMemoryDataset with a given `DatasetRef` to the store.
2260 Parameters
2261 ----------
2262 inMemoryDataset : `object`
2263 The dataset to store.
2264 ref : `DatasetRef`
2265 Reference to the associated Dataset.
2267 Raises
2268 ------
2269 TypeError
2270 Supplied object and storage class are inconsistent.
2271 DatasetTypeNotSupportedError
2272 The associated `DatasetType` is not handled by this datastore.
2274 Notes
2275 -----
2276 If the datastore is configured to reject certain dataset types it
2277 is possible that the put will fail and raise a
2278 `DatasetTypeNotSupportedError`. The main use case for this is to
2279 allow `ChainedDatastore` to put to multiple datastores without
2280 requiring that every datastore accepts the dataset.
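        Examples
        --------
        Minimal sketch; ``datastore``, ``in_memory_dataset``, and ``ref``
        are assumed to be defined and mutually consistent::

            datastore.put(in_memory_dataset, ref)
            assert datastore.exists(ref)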
2281 """
2282 doDisassembly = self.composites.shouldBeDisassembled(ref)
2283 # doDisassembly = True
2285 artifacts = []
2286 if doDisassembly:
2287 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2288 if components is None:
2289 raise RuntimeError(
2290 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2291 f"with storage class {ref.datasetType.storageClass.name} "
2292 "is configured to be disassembled, but cannot be."
2293 )
2294 for component, componentInfo in components.items():
2295 # Don't recurse because we want to take advantage of
2296 # bulk insert -- need a new DatasetRef that refers to the
2297 # same dataset_id but has the component DatasetType
2298 # DatasetType does not refer to the types of components
2299 # So we construct one ourselves.
2300 compRef = ref.makeComponentRef(component)
2301 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2302 artifacts.append((compRef, storedInfo))
2303 else:
2304 # Write the entire thing out
2305 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2306 artifacts.append((ref, storedInfo))
2308 self._register_datasets(artifacts)
2310 @transactional
2311 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2312 # At this point can safely remove these datasets from the cache
2313 # to avoid confusion later on. If they are not trashed later
2314 # the cache will simply be refilled.
2315 self.cacheManager.remove_from_cache(ref)
2317 # If we are in trust mode there will be nothing to move to
2318 # the trash table and we will have to try to delete the file
2319 # immediately.
2320 if self.trustGetRequest:
2321 # Try to keep the logic below for a single file trash.
2322 if isinstance(ref, DatasetRef):
2323 refs = {ref}
2324 else:
2325 # Will recreate ref at the end of this branch.
2326 refs = set(ref)
2328 # Determine which datasets are known to datastore directly.
2329 id_to_ref = {ref.id: ref for ref in refs}
2330 existing_ids = self._get_stored_records_associated_with_refs(refs)
2331 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2333 missing = refs - existing_refs
2334 if missing:
2335 # Do an explicit existence check on these refs.
2336 # We only care about the artifacts at this point and not
2337 # the dataset existence.
2338 artifact_existence: dict[ResourcePath, bool] = {}
2339 _ = self.mexists(missing, artifact_existence)
2340 uris = [uri for uri, exists in artifact_existence.items() if exists]
2342 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2343 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2344 for uri in uris:
2345 try:
2346 uri.remove()
2347 except Exception as e:
2348 if ignore_errors:
2349 log.debug("Artifact %s could not be removed: %s", uri, e)
2350 continue
2351 raise
2353 # There is no point asking the code below to remove refs we
2354 # know are missing so update it with the list of existing
2355 # records. Try to retain one vs many logic.
2356 if not existing_refs:
2357 # Nothing more to do since none of the datasets were
2358 # known to the datastore record table.
2359 return
2360 ref = list(existing_refs)
2361 if len(ref) == 1:
2362 ref = ref[0]
2364 # Get file metadata and internal metadata
2365 if not isinstance(ref, DatasetRef):
2366 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2367 # Assumed to be an iterable of refs so bulk mode enabled.
2368 try:
2369 self.bridge.moveToTrash(ref, transaction=self._transaction)
2370 except Exception as e:
2371 if ignore_errors:
2372 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2373 else:
2374 raise
2375 return
2377 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2379 fileLocations = self._get_dataset_locations_info(ref)
2381 if not fileLocations:
2382 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2383 if ignore_errors:
2384 log.warning(err_msg)
2385 return
2386 else:
2387 raise FileNotFoundError(err_msg)
2389 for location, _ in fileLocations:
2390 if not self._artifact_exists(location):
2391 err_msg = (
2392 f"Dataset is known to datastore {self.name} but "
2393 f"associated artifact ({location.uri}) is missing"
2394 )
2395 if ignore_errors:
2396 log.warning(err_msg)
2397 return
2398 else:
2399 raise FileNotFoundError(err_msg)
2401 # Mark dataset as trashed
2402 try:
2403 self.bridge.moveToTrash([ref], transaction=self._transaction)
2404 except Exception as e:
2405 if ignore_errors:
2406 log.warning(
2407 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2408 "but encountered an error: %s",
2409 ref,
2410 self.name,
2411 e,
2412 )
2413 pass
2414 else:
2415 raise
2417 @transactional
2418 def emptyTrash(self, ignore_errors: bool = True) -> None:
2419 """Remove all datasets from the trash.
2421 Parameters
2422 ----------
2423 ignore_errors : `bool`
2424             If `True`, return without error even if something went wrong.
2425 Problems could occur if another process is simultaneously trying
2426 to delete.
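        Examples
        --------
        Sketch of the usual two-step removal; ``datastore`` and ``refs``
        are assumed to be defined::

            datastore.trash(refs)
            datastore.emptyTrash()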
2427 """
2428 log.debug("Emptying trash in datastore %s", self.name)
2430 # Context manager will empty trash iff we finish it without raising.
2431 # It will also automatically delete the relevant rows from the
2432 # trash table and the records table.
2433 with self.bridge.emptyTrash(
2434 self._table, record_class=StoredFileInfo, record_column="path"
2435 ) as trash_data:
2436 # Removing the artifacts themselves requires that the files are
2437 # not also associated with refs that are not to be trashed.
2438 # Therefore need to do a query with the file paths themselves
2439 # and return all the refs associated with them. Can only delete
2440 # a file if the refs to be trashed are the only refs associated
2441 # with the file.
2442 # This requires multiple copies of the trashed items
2443 trashed, artifacts_to_keep = trash_data
2445 if artifacts_to_keep is None:
2446 # The bridge is not helping us so have to work it out
2447 # ourselves. This is not going to be as efficient.
2448 trashed = list(trashed)
2450 # The instance check is for mypy since up to this point it
2451 # does not know the type of info.
2452 path_map = self._refs_associated_with_artifacts(
2453 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2454 )
2456 for ref, info in trashed:
2457 # Mypy needs to know this is not the base class
2458 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2460 path_map[info.path].remove(ref.id)
2461 if not path_map[info.path]:
2462 del path_map[info.path]
2464 artifacts_to_keep = set(path_map)
2466 for ref, info in trashed:
2467 # Should not happen for this implementation but need
2468 # to keep mypy happy.
2469 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2471 # Mypy needs to know this is not the base class
2472 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2474 if info.path in artifacts_to_keep:
2475 # This is a multi-dataset artifact and we are not
2476 # removing all associated refs.
2477 continue
2479 # Only trashed refs still known to datastore will be returned.
2480 location = info.file_location(self.locationFactory)
2482 # Point of no return for this artifact
2483 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2484 try:
2485 self._delete_artifact(location)
2486 except FileNotFoundError:
2487 # If the file itself has been deleted there is nothing
2488 # we can do about it. It is possible that trash has
2489 # been run in parallel in another process or someone
2490 # decided to delete the file. It is unlikely to come
2491 # back and so we should still continue with the removal
2492 # of the entry from the trash table. It is also possible
2493 # we removed it in a previous iteration if it was
2494 # a multi-dataset artifact. The delete artifact method
2495 # will log a debug message in this scenario.
2496                 # Distinguishing a file that was missing before trash
2497                 # started from one already removed earlier as part of
2498                 # this trash operation is not worth the potential
2499                 # memory cost.
2500 pass
2501 except Exception as e:
2502 if ignore_errors:
2503 # Use a debug message here even though it's not
2504 # a good situation. In some cases this can be
2505 # caused by a race between user A and user B
2506 # and neither of them has permissions for the
2507 # other's files. Butler does not know about users
2508 # and trash has no idea what collections these
2509 # files were in (without guessing from a path).
2510 log.debug(
2511 "Encountered error removing artifact %s from datastore %s: %s",
2512 location.uri,
2513 self.name,
2514 e,
2515 )
2516 else:
2517 raise
2519 @transactional
2520 def transfer_from(
2521 self,
2522 source_datastore: Datastore,
2523 refs: Iterable[DatasetRef],
2524 transfer: str = "auto",
2525 artifact_existence: dict[ResourcePath, bool] | None = None,
2526 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2527 # Docstring inherited
2528 if type(self) is not type(source_datastore):
2529 raise TypeError(
2530 f"Datastore mismatch between this datastore ({type(self)}) and the "
2531 f"source datastore ({type(source_datastore)})."
2532 )
2534 # Be explicit for mypy
2535 if not isinstance(source_datastore, FileDatastore):
2536 raise TypeError(
2537 "Can only transfer to a FileDatastore from another FileDatastore, not"
2538 f" {type(source_datastore)}"
2539 )
2541 # Stop early if "direct" transfer mode is requested. That would
2542 # require that the URI inside the source datastore should be stored
2543 # directly in the target datastore, which seems unlikely to be useful
2544 # since at any moment the source datastore could delete the file.
2545 if transfer in ("direct", "split"):
2546 raise ValueError(
2547 f"Can not transfer from a source datastore using {transfer} mode since"
2548 " those files are controlled by the other datastore."
2549 )
2551 # Empty existence lookup if none given.
2552 if artifact_existence is None:
2553 artifact_existence = {}
2555 # We will go through the list multiple times so must convert
2556 # generators to lists.
2557 refs = list(refs)
2559 # In order to handle disassembled composites the code works
2560 # at the records level since it can assume that internal APIs
2561 # can be used.
2562 # - If the record already exists in the destination this is assumed
2563 # to be okay.
2564 # - If there is no record but the source and destination URIs are
2565 # identical no transfer is done but the record is added.
2566 # - If the source record refers to an absolute URI currently assume
2567         #   that the URI should remain absolute and will be visible to the
2568 # destination butler. May need to have a flag to indicate whether
2569 # the dataset should be transferred. This will only happen if
2570 # the detached Butler has had a local ingest.
2572 # What we really want is all the records in the source datastore
2573 # associated with these refs. Or derived ones if they don't exist
2574 # in the source.
2575 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2577 # The source dataset_ids are the keys in these records
2578 source_ids = set(source_records)
2579 log.debug("Number of datastore records found in source: %d", len(source_ids))
2581 requested_ids = {ref.id for ref in refs}
2582 missing_ids = requested_ids - source_ids
2584 # Missing IDs can be okay if that datastore has allowed
2585 # gets based on file existence. Should we transfer what we can
2586 # or complain about it and warn?
2587 if missing_ids and not source_datastore.trustGetRequest:
2588 raise ValueError(
2589 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2590 )
2592 # Need to map these missing IDs to a DatasetRef so we can guess
2593 # the details.
2594 if missing_ids:
2595 log.info(
2596 "Number of expected datasets missing from source datastore records: %d out of %d",
2597 len(missing_ids),
2598 len(requested_ids),
2599 )
2600 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2602 # This should be chunked in case we end up having to check
2603 # the file store since we need some log output to show
2604 # progress.
2605 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2606 records = {}
2607 for missing in missing_ids_chunk:
2608 # Ask the source datastore where the missing artifacts
2609 # should be. An execution butler might not know about the
2610 # artifacts even if they are there.
2611 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2612 records[missing] = [info for _, info in expected]
2614             # Call the mexists helper method in case we have not already
2615 # checked these artifacts such that artifact_existence is
2616 # empty. This allows us to benefit from parallelism.
2617 # datastore.mexists() itself does not give us access to the
2618 # derived datastore record.
2619 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2620 ref_exists = source_datastore._process_mexists_records(
2621 id_to_ref, records, False, artifact_existence=artifact_existence
2622 )
2624 # Now go through the records and propagate the ones that exist.
2625 location_factory = source_datastore.locationFactory
2626 for missing, record_list in records.items():
2627 # Skip completely if the ref does not exist.
2628 ref = id_to_ref[missing]
2629 if not ref_exists[ref]:
2630 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2631 continue
2632 # Check for file artifact to decide which parts of a
2633 # disassembled composite do exist. If there is only a
2634 # single record we don't even need to look because it can't
2635 # be a composite and must exist.
2636 if len(record_list) == 1:
2637 dataset_records = record_list
2638 else:
2639 dataset_records = [
2640 record
2641 for record in record_list
2642 if artifact_existence[record.file_location(location_factory).uri]
2643 ]
2644 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2646 # Rely on source_records being a defaultdict.
2647 source_records[missing].extend(dataset_records)
2649 # See if we already have these records
2650 target_records = self._get_stored_records_associated_with_refs(refs)
2652 # The artifacts to register
2653 artifacts = []
2655 # Refs that already exist
2656 already_present = []
2658 # Refs that were rejected by this datastore.
2659 rejected = set()
2661 # Refs that were transferred successfully.
2662 accepted = set()
2664 # Record each time we have done a "direct" transfer.
2665 direct_transfers = []
2667 # Now can transfer the artifacts
2668 for ref in refs:
2669 if not self.constraints.isAcceptable(ref):
2670 # This datastore should not be accepting this dataset.
2671 rejected.add(ref)
2672 continue
2674 accepted.add(ref)
2676 if ref.id in target_records:
2677 # Already have an artifact for this.
2678 already_present.append(ref)
2679 continue
2681 # mypy needs to know these are always resolved refs
2682 for info in source_records[ref.id]:
2683 source_location = info.file_location(source_datastore.locationFactory)
2684 target_location = info.file_location(self.locationFactory)
2685 if source_location == target_location and not source_location.pathInStore.isabs():
2686 # Artifact is already in the target location.
2687 # (which is how execution butler currently runs)
2688 pass
2689 else:
2690 if target_location.pathInStore.isabs():
2691 # Just because we can see the artifact when running
2692 # the transfer doesn't mean it will be generally
2693 # accessible to a user of this butler. Need to decide
2694 # what to do about an absolute path.
2695 if transfer == "auto":
2696 # For "auto" transfers we allow the absolute URI
2697 # to be recorded in the target datastore.
2698 direct_transfers.append(source_location)
2699 else:
2700 # The user is explicitly requesting a transfer
2701 # even for an absolute URI. This requires us to
2702 # calculate the target path.
2703 template_ref = ref
2704 if info.component:
2705 template_ref = ref.makeComponentRef(info.component)
2706 target_location = self._calculate_ingested_datastore_name(
2707 source_location.uri,
2708 template_ref,
2709 )
2711 info = info.update(path=target_location.pathInStore.path)
2713 # Need to transfer it to the new location.
2714 # Assume we should always overwrite. If the artifact
2715 # is there this might indicate that a previous transfer
2716 # was interrupted but was not able to be rolled back
2717 # completely (eg pre-emption) so follow Datastore default
2718 # and overwrite.
2719 target_location.uri.transfer_from(
2720 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2721 )
2723 artifacts.append((ref, info))
2725 if direct_transfers:
2726 log.info(
2727 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2728 len(direct_transfers),
2729 "" if len(direct_transfers) == 1 else "s",
2730 )
2732 self._register_datasets(artifacts)
2734 if already_present:
2735 n_skipped = len(already_present)
2736 log.info(
2737 "Skipped transfer of %d dataset%s already present in datastore",
2738 n_skipped,
2739 "" if n_skipped == 1 else "s",
2740 )
2742 return accepted, rejected
2744 @transactional
2745 def forget(self, refs: Iterable[DatasetRef]) -> None:
2746 # Docstring inherited.
2747 refs = list(refs)
2748 self.bridge.forget(refs)
2749 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2751 def validateConfiguration(
2752 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2753 ) -> None:
2754 """Validate some of the configuration for this datastore.
2756 Parameters
2757 ----------
2758 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2759 Entities to test against this configuration. Can be differing
2760 types.
2761 logFailures : `bool`, optional
2762 If `True`, output a log message for every validation error
2763 detected.
2765 Raises
2766 ------
2767 DatastoreValidationError
2768 Raised if there is a validation problem with a configuration.
2769 All the problems are reported in a single exception.
2771 Notes
2772 -----
2773 This method checks that all the supplied entities have valid file
2774 templates and also have formatters defined.
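        Examples
        --------
        Minimal sketch; ``datastore`` and ``dataset_type`` are assumed to
        be defined::

            try:
                datastore.validateConfiguration([dataset_type], logFailures=True)
            except DatastoreValidationError as err:
                print(f"Configuration problem: {err}")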
2775 """
2776 templateFailed = None
2777 try:
2778 self.templates.validateTemplates(entities, logFailures=logFailures)
2779 except FileTemplateValidationError as e:
2780 templateFailed = str(e)
2782 formatterFailed = []
2783 for entity in entities:
2784 try:
2785 self.formatterFactory.getFormatterClass(entity)
2786 except KeyError as e:
2787 formatterFailed.append(str(e))
2788 if logFailures:
2789 log.critical("Formatter failure: %s", e)
2791 if templateFailed or formatterFailed:
2792 messages = []
2793 if templateFailed:
2794 messages.append(templateFailed)
2795 if formatterFailed:
2796 messages.append(",".join(formatterFailed))
2797 msg = ";\n".join(messages)
2798 raise DatastoreValidationError(msg)
2800 def getLookupKeys(self) -> set[LookupKey]:
2801 # Docstring is inherited from base class
2802 return (
2803 self.templates.getLookupKeys()
2804 | self.formatterFactory.getLookupKeys()
2805 | self.constraints.getLookupKeys()
2806 )
2808 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2809 # Docstring is inherited from base class
2810 # The key can be valid in either formatters or templates so we can
2811 # only check the template if it exists
2812 if lookupKey in self.templates:
2813 try:
2814 self.templates[lookupKey].validateTemplate(entity)
2815 except FileTemplateValidationError as e:
2816 raise DatastoreValidationError(e) from e
2818 def export(
2819 self,
2820 refs: Iterable[DatasetRef],
2821 *,
2822 directory: ResourcePathExpression | None = None,
2823 transfer: str | None = "auto",
2824 ) -> Iterable[FileDataset]:
2825 # Docstring inherited from Datastore.export.
2826 if transfer == "auto" and directory is None:
2827 transfer = None
2829 if transfer is not None and directory is None:
2830 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2832 if transfer == "move":
2833 raise TypeError("Can not export by moving files out of datastore.")
2834 elif transfer == "direct":
2835 # For an export, treat this as equivalent to None. We do not
2836 # want an import to risk using absolute URIs to datasets owned
2837 # by another datastore.
2838 log.info("Treating 'direct' transfer mode as in-place export.")
2839 transfer = None
2841 # Force the directory to be a URI object
2842 directoryUri: ResourcePath | None = None
2843 if directory is not None:
2844 directoryUri = ResourcePath(directory, forceDirectory=True)
2846 if transfer is not None and directoryUri is not None and not directoryUri.exists():
2847 # mypy needs the second test
2848 raise FileNotFoundError(f"Export location {directory} does not exist")
2850 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2851 for ref in progress.wrap(refs, "Exporting dataset files"):
2852 fileLocations = self._get_dataset_locations_info(ref)
2853 if not fileLocations:
2854 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2855 # For now we can not export disassembled datasets
2856 if len(fileLocations) > 1:
2857 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2858 location, storedFileInfo = fileLocations[0]
2860 pathInStore = location.pathInStore.path
2861 if transfer is None:
2862 # TODO: do we also need to return the readStorageClass somehow?
2863 # We will use the path in store directly. If this is an
2864 # absolute URI, preserve it.
2865 if location.pathInStore.isabs():
2866 pathInStore = str(location.uri)
2867 elif transfer == "direct":
2868 # Use full URIs to the remote store in the export
2869 pathInStore = str(location.uri)
2870 else:
2871 # mypy needs help
2872 assert directoryUri is not None, "directoryUri must be defined to get here"
2873 storeUri = ResourcePath(location.uri)
2875 # if the datastore has an absolute URI to a resource, we
2876 # have two options:
2877 # 1. Keep the absolute URI in the exported YAML
2878 # 2. Allocate a new name in the local datastore and transfer
2879 # it.
2880 # For now go with option 2
2881 if location.pathInStore.isabs():
2882 template = self.templates.getTemplate(ref)
2883 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2884 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2886 exportUri = directoryUri.join(pathInStore)
2887 exportUri.transfer_from(storeUri, transfer=transfer)
2889 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2891 @staticmethod
2892 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2893 """Compute the checksum of the supplied file.
2895 Parameters
2896 ----------
2897 uri : `lsst.resources.ResourcePath`
2898 Name of resource to calculate checksum from.
2899 algorithm : `str`, optional
2900 Name of algorithm to use. Must be one of the algorithms supported
2901             by :py:mod:`hashlib`.
2902 block_size : `int`
2903 Number of bytes to read from file at one time.
2905 Returns
2906 -------
2907 hexdigest : `str`
2908 Hex digest of the file.
2910 Notes
2911 -----
2912 Currently returns None if the URI is for a remote resource.
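        Examples
        --------
        Illustrative sketch; the file path is arbitrary::

            from lsst.resources import ResourcePath

            digest = FileDatastore.computeChecksum(
                ResourcePath("/tmp/example.fits"), algorithm="sha256"
            )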
2913 """
2914 if algorithm not in hashlib.algorithms_guaranteed:
2915 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2917 if not uri.isLocal:
2918 return None
2920 hasher = hashlib.new(algorithm)
2922 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f:
2923 for chunk in iter(lambda: f.read(block_size), b""):
2924 hasher.update(chunk)
2926 return hasher.hexdigest()
2928 def needs_expanded_data_ids(
2929 self,
2930 transfer: str | None,
2931 entity: DatasetRef | DatasetType | StorageClass | None = None,
2932 ) -> bool:
2933 # Docstring inherited.
2934 # This _could_ also use entity to inspect whether the filename template
2935 # involves placeholders other than the required dimensions for its
2936 # dataset type, but that's not necessary for correctness; it just
2937 # enables more optimizations (perhaps only in theory).
2938 return transfer not in ("direct", None)
2940 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2941 # Docstring inherited from the base class.
2942 record_data = data.get(self.name)
2943 if not record_data:
2944 return
2946 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records)
2948 # TODO: Verify that there are no unexpected table names in the dict?
2949 unpacked_records = []
2950 for dataset_data in record_data.records.values():
2951 records = dataset_data.get(self._table.name)
2952 if records:
2953 for info in records:
2954 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2955 unpacked_records.append(info.to_record())
2956 if unpacked_records:
2957 self._table.insert(*unpacked_records, transaction=self._transaction)
2959 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2960 # Docstring inherited from the base class.
2961 exported_refs = list(self._bridge.check(refs))
2962 ids = {ref.id for ref in exported_refs}
2963 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
2964 for row in self._table.fetch(dataset_id=ids):
2965 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2966 dataset_records = records.setdefault(info.dataset_id, {})
2967 dataset_records.setdefault(self._table.name, []).append(info)
2969 record_data = DatastoreRecordData(records=records)
2970 return {self.name: record_data}
2972 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
2973 # Docstring inherited from the base class.
2974 self._retrieve_dataset_method = method
2976 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
2977 """Update dataset reference to use the storage class from registry."""
2978 if self._retrieve_dataset_method is None:
2979 # We could raise an exception here but unit tests do not define
2980 # this method.
2981 return ref
2982 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
2983 if dataset_type is not None:
2984 ref = ref.overrideStorageClass(dataset_type.storageClass)
2985 return ref