Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%
991 statements
coverage.py v7.3.2, created at 2023-10-25 15:14 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Generic file-based datastore code."""
24from __future__ import annotations
26__all__ = ("FileDatastore",)
28import contextlib
29import hashlib
30import logging
31from collections import defaultdict
32from collections.abc import Callable, Iterable, Mapping, Sequence
33from dataclasses import dataclass
34from typing import TYPE_CHECKING, Any, ClassVar
36from lsst.daf.butler import (
37 CompositesMap,
38 Config,
39 DatasetId,
40 DatasetRef,
41 DatasetRefURIs,
42 DatasetType,
43 DatasetTypeNotSupportedError,
44 Datastore,
45 DatastoreCacheManager,
46 DatastoreConfig,
47 DatastoreDisabledCacheManager,
48 DatastoreRecordData,
49 DatastoreValidationError,
50 FileDataset,
51 FileDescriptor,
52 FileTemplates,
53 FileTemplateValidationError,
54 Formatter,
55 FormatterFactory,
56 Location,
57 LocationFactory,
58 Progress,
59 StorageClass,
60 StoredDatastoreItemInfo,
61 StoredFileInfo,
62 ddl,
63)
64from lsst.daf.butler.core.repoRelocation import replaceRoot
65from lsst.daf.butler.core.utils import transactional
66from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
67from lsst.resources import ResourcePath, ResourcePathExpression
68from lsst.utils.introspection import get_class_of, get_instance_of
69from lsst.utils.iteration import chunk_iterable
71# For VERBOSE logging usage.
72from lsst.utils.logging import VERBOSE, getLogger
73from lsst.utils.timer import time_this
74from sqlalchemy import BigInteger, String
76from ..registry.interfaces import DatabaseInsertMode, FakeDatasetRef
77from .genericDatastore import GenericBaseDatastore
79if TYPE_CHECKING:
80 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
81 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
83log = getLogger(__name__)
86class _IngestPrepData(Datastore.IngestPrepData):
87 """Helper class for FileDatastore ingest implementation.
89 Parameters
90 ----------
91 datasets : `~collections.abc.Iterable` of `FileDataset`
92 Files to be ingested by this datastore.
93 """
95 def __init__(self, datasets: Iterable[FileDataset]):
96 super().__init__(ref for dataset in datasets for ref in dataset.refs)
97 self.datasets = datasets
100@dataclass(frozen=True)
101class DatastoreFileGetInformation:
102 """Collection of useful parameters needed to retrieve a file from
103 a Datastore.
104 """
106 location: Location
107 """The location from which to read the dataset."""
109 formatter: Formatter
110 """The `Formatter` to use to deserialize the dataset."""
112 info: StoredFileInfo
113 """Stored information about this file and its formatter."""
115 assemblerParams: Mapping[str, Any]
116 """Parameters to use for post-processing the retrieved dataset."""
118 formatterParams: Mapping[str, Any]
119 """Parameters that were understood by the associated formatter."""
121 component: str | None
122 """The component to be retrieved (can be `None`)."""
124 readStorageClass: StorageClass
125 """The `StorageClass` of the dataset being read."""
128class FileDatastore(GenericBaseDatastore):
129 """Generic Datastore for file-based implementations.
131 Should always be sub-classed since key abstract methods are missing.
133 Parameters
134 ----------
135 config : `DatastoreConfig` or `str`
136 Configuration as either a `Config` object or URI to file.
137 bridgeManager : `DatastoreRegistryBridgeManager`
138 Object that manages the interface between `Registry` and datastores.
139 butlerRoot : `str`, optional
140 New datastore root to use to override the configuration value.
142 Raises
143 ------
144 ValueError
145 If root location does not exist and ``create`` is `False` in the
146 configuration.
147 """
149 defaultConfigFile: ClassVar[str | None] = None
150 """Path to configuration defaults. Accessed within the ``config`` resource
151 or relative to a search path. Can be None if no defaults specified.
152 """
154 root: ResourcePath
155 """Root directory URI of this `Datastore`."""
157 locationFactory: LocationFactory
158 """Factory for creating locations relative to the datastore root."""
160 formatterFactory: FormatterFactory
161 """Factory for creating instances of formatters."""
163 templates: FileTemplates
164 """File templates that can be used by this `Datastore`."""
166 composites: CompositesMap
167 """Determines whether a dataset should be disassembled on put."""
169 defaultConfigFile = "datastores/fileDatastore.yaml"
170 """Path to configuration defaults. Accessed within the ``config`` resource
171 or relative to a search path. Can be None if no defaults specified.
172 """
174 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
175 """Callable that is used in trusted mode to retrieve registry definition
176 of a named dataset type.
177 """
179 @classmethod
180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
181 """Set any filesystem-dependent config options for this Datastore to
182 be appropriate for a new empty repository with the given root.
184 Parameters
185 ----------
186 root : `str`
187 URI to the root of the data repository.
188 config : `Config`
189 A `Config` to update. Only the subset understood by
190 this component will be updated. Will not expand
191 defaults.
192 full : `Config`
193 A complete config with all defaults expanded that can be
194 converted to a `DatastoreConfig`. Read-only and will not be
195 modified by this method.
196 Repository-specific options that should not be obtained
197 from defaults when Butler instances are constructed
198 should be copied from ``full`` to ``config``.
199 overwrite : `bool`, optional
200 If `False`, do not modify a value in ``config`` if the value
201 already exists. Default is always to overwrite with the provided
202 ``root``.
204 Notes
205 -----
206 If a keyword is explicitly defined in the supplied ``config`` it
207 will not be overridden by this method if ``overwrite`` is `False`.
208 This allows explicit values set in external configs to be retained.
209 """
210 Config.updateParameters(
211 DatastoreConfig,
212 config,
213 full,
214 toUpdate={"root": root},
215 toCopy=("cls", ("records", "table")),
216 overwrite=overwrite,
217 )
219 @classmethod
220 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
221 return ddl.TableSpec(
222 fields=[
223 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
224 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
225 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
226 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
227 # Use empty string to indicate no component
228 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
229 # TODO: should checksum be Base64Bytes instead?
230 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
231 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
232 ],
233 unique=frozenset(),
234 indexes=[ddl.IndexSpec("path")],
235 )
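# Illustrative sketch (added for clarity; not part of the original module): a row
# in the opaque records table described by this spec looks roughly like
#
#     {
#         "dataset_id": <dataset UUID>,
#         "path": "relative/path/inside/datastore.fits",  # or an absolute URI for "direct" ingest
#         "formatter": "some.module.SomeFormatter",       # hypothetical formatter name
#         "storage_class": "ExposureF",                   # hypothetical storage class
#         "component": "",                                # empty string means "no component"
#         "checksum": None,
#         "file_size": 1234,
#     }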
237 def __init__(
238 self,
239 config: DatastoreConfig | ResourcePathExpression,
240 bridgeManager: DatastoreRegistryBridgeManager,
241 butlerRoot: str | None = None,
242 ):
243 super().__init__(config, bridgeManager)
244 if "root" not in self.config:
245 raise ValueError("No root directory specified in configuration")
247 self._bridgeManager = bridgeManager
249 # Name ourselves either using an explicit name or a name
250 # derived from the (unexpanded) root
251 if "name" in self.config:
252 self.name = self.config["name"]
253 else:
254 # We use the unexpanded root in the name to indicate that this
255 # datastore can be moved without having to update registry.
256 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
258 # Support repository relocation in config
259 # Existence of self.root is checked in subclass
260 self.root = ResourcePath(
261 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
262 )
264 self.locationFactory = LocationFactory(self.root)
265 self.formatterFactory = FormatterFactory()
267 # Now associate formatters with storage classes
268 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
270 # Read the file naming templates
271 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
273 # See if composites should be disassembled
274 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
276 tableName = self.config["records", "table"]
277 try:
278 # Storage of paths and formatters, keyed by dataset_id
279 self._table = bridgeManager.opaque.register(
280 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
281 )
282 # Interface to Registry.
283 self._bridge = bridgeManager.register(self.name)
284 except ReadOnlyDatabaseError:
285 # If the database is read only and we just tried and failed to
286 # create a table, it means someone is trying to create a read-only
287 # butler client for an empty repo. That should be okay, as long
288 # as they then try to get any datasets before some other client
289 # creates the table. Chances are they're just validating
290 # configuration.
291 pass
293 # Determine whether checksums should be used - default to False
294 self.useChecksum = self.config.get("checksum", False)
296 # Determine whether we can fall back to configuration if a
297 # requested dataset is not known to registry
298 self.trustGetRequest = self.config.get("trust_get_request", False)
300 # Create a cache manager
301 self.cacheManager: AbstractDatastoreCacheManager
302 if "cached" in self.config:
303 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
304 else:
305 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
307 # Check existence and create directory structure if necessary
308 if not self.root.exists():
309 if "create" not in self.config or not self.config["create"]:
310 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
311 try:
312 self.root.mkdir()
313 except Exception as e:
314 raise ValueError(
315 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
316 ) from e
318 def __str__(self) -> str:
319 return str(self.root)
321 @property
322 def bridge(self) -> DatastoreRegistryBridge:
323 return self._bridge
325 @property
326 def roots(self) -> dict[str, ResourcePath | None]:
327 # Docstring inherited.
328 return {self.name: self.root}
330 def _artifact_exists(self, location: Location) -> bool:
331 """Check that an artifact exists in this datastore at the specified
332 location.
334 Parameters
335 ----------
336 location : `Location`
337 Expected location of the artifact associated with this datastore.
339 Returns
340 -------
341 exists : `bool`
342 `True` if the location can be found, `False` otherwise.
343 """
344 log.debug("Checking if resource exists: %s", location.uri)
345 return location.uri.exists()
347 def _delete_artifact(self, location: Location) -> None:
348 """Delete the artifact from the datastore.
350 Parameters
351 ----------
352 location : `Location`
353 Location of the artifact associated with this datastore.
354 """
355 if location.pathInStore.isabs():
356 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
358 try:
359 location.uri.remove()
360 except FileNotFoundError:
361 log.debug("File %s did not exist and so could not be deleted.", location.uri)
362 raise
363 except Exception as e:
364 log.critical("Failed to delete file: %s (%s)", location.uri, e)
365 raise
366 log.debug("Successfully deleted file: %s", location.uri)
368 def addStoredItemInfo(
369 self,
370 refs: Iterable[DatasetRef],
371 infos: Iterable[StoredFileInfo],
372 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
373 ) -> None:
374 # Docstring inherited from GenericBaseDatastore
375 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos, strict=True)]
376 match insert_mode:
377 case DatabaseInsertMode.INSERT:
378 self._table.insert(*records, transaction=self._transaction)
379 case DatabaseInsertMode.ENSURE:
380 self._table.ensure(*records, transaction=self._transaction)
381 case DatabaseInsertMode.REPLACE:
382 self._table.replace(*records, transaction=self._transaction)
383 case _:
384 raise ValueError(f"Unknown insert mode of '{insert_mode}'")
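# Usage sketch (illustrative; ``datastore``, ``refs`` and ``infos`` are assumed to
# be a configured FileDatastore and matching sequences of refs and file infos):
#
#     datastore.addStoredItemInfo(refs, infos)                                          # plain insert
#     datastore.addStoredItemInfo(refs, infos, insert_mode=DatabaseInsertMode.ENSURE)   # keep existing rows
#     datastore.addStoredItemInfo(refs, infos, insert_mode=DatabaseInsertMode.REPLACE)  # overwrite existing rows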
386 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]:
387 # Docstring inherited from GenericBaseDatastore
389 # Look for the dataset_id -- there might be multiple matches
390 # if we have disassembled the dataset.
391 records = self._table.fetch(dataset_id=ref.id)
392 return [StoredFileInfo.from_record(record) for record in records]
394 def _get_stored_records_associated_with_refs(
395 self, refs: Iterable[DatasetIdRef]
396 ) -> dict[DatasetId, list[StoredFileInfo]]:
397 """Retrieve all records associated with the provided refs.
399 Parameters
400 ----------
401 refs : iterable of `DatasetIdRef`
402 The refs for which records are to be retrieved.
404 Returns
405 -------
406 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
407 The matching records indexed by the ref ID. The number of entries
408 in the dict can be smaller than the number of requested refs.
409 """
410 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
412 # Uniqueness is dataset_id + component so can have multiple records
413 # per ref.
414 records_by_ref = defaultdict(list)
415 for record in records:
416 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
417 return records_by_ref
419 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
420 """Return paths and associated dataset refs.
422 Parameters
423 ----------
424 paths : `list` of `str` or `lsst.resources.ResourcePath`
425 All the paths to include in search.
427 Returns
428 -------
429 mapping : `dict` of [`str`, `set` [`DatasetId`]]
430 Mapping of each path to a set of associated database IDs.
431 """
432 records = self._table.fetch(path=[str(path) for path in paths])
433 result = defaultdict(set)
434 for row in records:
435 result[row["path"]].add(row["dataset_id"])
436 return result
438 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
439 """Return all dataset refs associated with the supplied path.
441 Parameters
442 ----------
443 pathInStore : `lsst.resources.ResourcePath`
444 Path of interest in the data store.
446 Returns
447 -------
448 ids : `set` of `DatasetId`
449 All `DatasetRef` IDs associated with this path.
450 """
451 records = list(self._table.fetch(path=str(pathInStore)))
452 ids = {r["dataset_id"] for r in records}
453 return ids
455 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
456 # Docstring inherited from GenericBaseDatastore
457 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
459 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]:
460 r"""Find all the `Location`\ s of the requested dataset in the
461 `Datastore` and the associated stored file information.
463 Parameters
464 ----------
465 ref : `DatasetRef`
466 Reference to the required `Dataset`.
468 Returns
469 -------
470 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
471 Location of the dataset within the datastore and
472 stored information about each file and its formatter.
473 """
474 # Get the file information (this will fail if no file)
475 records = self.getStoredItemsInfo(ref)
477 # Use the path to determine the location -- we need to take
478 # into account absolute URIs in the datastore record
479 return [(r.file_location(self.locationFactory), r) for r in records]
481 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
482 """Check that there is only one dataset associated with the
483 specified artifact.
485 Parameters
486 ----------
487 ref : `DatasetRef` or `FakeDatasetRef`
488 Dataset to be removed.
489 location : `Location`
490 The location of the artifact to be removed.
492 Returns
493 -------
494 can_remove : `bool`
495 `True` if the artifact can be safely removed.
496 """
497 # Can't ever delete absolute URIs.
498 if location.pathInStore.isabs():
499 return False
501 # Get all entries associated with this path
502 allRefs = self._registered_refs_per_artifact(location.pathInStore)
503 if not allRefs:
504 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
506 # Remove these refs from all the refs and if there is nothing left
507 # then we can delete
508 remainingRefs = allRefs - {ref.id}
510 if remainingRefs:
511 return False
512 return True
514 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
515 """Predict the location and related file information of the requested
516 dataset in this datastore.
518 Parameters
519 ----------
520 ref : `DatasetRef`
521 Reference to the required `Dataset`.
523 Returns
524 -------
525 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
526 Expected Location of the dataset within the datastore and
527 placeholder information about each file and its formatter.
529 Notes
530 -----
531 Uses the current configuration to determine how we would expect the
532 datastore files to have been written if we couldn't ask registry.
533 This is safe so long as there has been no change to datastore
534 configuration between writing the dataset and wanting to read it.
535 Will not work for files that have been ingested without using the
536 standard file template or default formatter.
537 """
538 # If we have a component ref we always need to ask the questions
539 # of the composite. If the composite is disassembled this routine
540 # should return all components. If the composite was not
541 # disassembled the composite is what is stored regardless of
542 # component request. Note that if the caller has disassembled
543 # a composite there is no way for this guess to know that
544 # without trying both the composite and component ref and seeing
545 # if there is something at the component Location even without
546 # disassembly being enabled.
547 if ref.datasetType.isComponent():
548 ref = ref.makeCompositeRef()
550 # See if the ref is a composite that should be disassembled
551 doDisassembly = self.composites.shouldBeDisassembled(ref)
553 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
555 if doDisassembly:
556 for component, componentStorage in ref.datasetType.storageClass.components.items():
557 compRef = ref.makeComponentRef(component)
558 location, formatter = self._determine_put_formatter_location(compRef)
559 all_info.append((location, formatter, componentStorage, component))
561 else:
562 # Always use the composite ref if no disassembly
563 location, formatter = self._determine_put_formatter_location(ref)
564 all_info.append((location, formatter, ref.datasetType.storageClass, None))
566 # Convert the list of tuples to have StoredFileInfo as second element
567 return [
568 (
569 location,
570 StoredFileInfo(
571 formatter=formatter,
572 path=location.pathInStore.path,
573 storageClass=storageClass,
574 component=component,
575 checksum=None,
576 file_size=-1,
577 dataset_id=ref.id,
578 ),
579 )
580 for location, formatter, storageClass, component in all_info
581 ]
583 def _prepare_for_get(
584 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
585 ) -> list[DatastoreFileGetInformation]:
586 """Check parameters for ``get`` and obtain formatter and
587 location.
589 Parameters
590 ----------
591 ref : `DatasetRef`
592 Reference to the required Dataset.
593 parameters : `dict`
594 `StorageClass`-specific parameters that specify, for example,
595 a slice of the dataset to be loaded.
597 Returns
598 -------
599 getInfo : `list` [`DatastoreFileGetInformation`]
600 Parameters needed to retrieve each file.
601 """
602 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
604 # The storage class we want to use eventually
605 refStorageClass = ref.datasetType.storageClass
607 # For trusted mode need to reset storage class.
608 ref = self._cast_storage_class(ref)
610 # Get file metadata and internal metadata
611 fileLocations = self._get_dataset_locations_info(ref)
612 if not fileLocations:
613 if not self.trustGetRequest:
614 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
615 # Assume the dataset is where we think it should be
616 fileLocations = self._get_expected_dataset_locations_info(ref)
618 if len(fileLocations) > 1:
619 disassembled = True
621 # If trust is involved it is possible that there will be
622 # components listed here that do not exist in the datastore.
623 # Explicitly check for file artifact existence and filter out any
624 # that are missing.
625 if self.trustGetRequest:
626 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
628 # For now complain only if we have no components at all. One
629 # component is probably a problem but we can punt that to the
630 # assembler.
631 if not fileLocations:
632 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
634 else:
635 disassembled = False
637 # Is this a component request?
638 refComponent = ref.datasetType.component()
640 fileGetInfo = []
641 for location, storedFileInfo in fileLocations:
642 # The storage class used to write the file
643 writeStorageClass = storedFileInfo.storageClass
645 # If this has been disassembled we need read to match the write
646 if disassembled:
647 readStorageClass = writeStorageClass
648 else:
649 readStorageClass = refStorageClass
651 formatter = get_instance_of(
652 storedFileInfo.formatter,
653 FileDescriptor(
654 location,
655 readStorageClass=readStorageClass,
656 storageClass=writeStorageClass,
657 parameters=parameters,
658 ),
659 ref.dataId,
660 )
662 formatterParams, notFormatterParams = formatter.segregateParameters()
664 # Of the remaining parameters, extract the ones supported by
665 # this StorageClass (for components not all will be handled)
666 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
668 # The ref itself could be a component if the dataset was
669 # disassembled by butler, or we disassembled in datastore and
670 # components came from the datastore records
671 component = storedFileInfo.component if storedFileInfo.component else refComponent
673 fileGetInfo.append(
674 DatastoreFileGetInformation(
675 location,
676 formatter,
677 storedFileInfo,
678 assemblerParams,
679 formatterParams,
680 component,
681 readStorageClass,
682 )
683 )
685 return fileGetInfo
687 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
688 """Check the arguments for ``put`` and obtain formatter and
689 location.
691 Parameters
692 ----------
693 inMemoryDataset : `object`
694 The dataset to store.
695 ref : `DatasetRef`
696 Reference to the associated Dataset.
698 Returns
699 -------
700 location : `Location`
701 The location to write the dataset.
702 formatter : `Formatter`
703 The `Formatter` to use to write the dataset.
705 Raises
706 ------
707 TypeError
708 Supplied object and storage class are inconsistent.
709 DatasetTypeNotSupportedError
710 The associated `DatasetType` is not handled by this datastore.
711 """
712 self._validate_put_parameters(inMemoryDataset, ref)
713 return self._determine_put_formatter_location(ref)
715 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
716 """Calculate the formatter and output location to use for put.
718 Parameters
719 ----------
720 ref : `DatasetRef`
721 Reference to the associated Dataset.
723 Returns
724 -------
725 location : `Location`
726 The location to write the dataset.
727 formatter : `Formatter`
728 The `Formatter` to use to write the dataset.
729 """
730 # Work out output file name
731 try:
732 template = self.templates.getTemplate(ref)
733 except KeyError as e:
734 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
736 # Validate the template to protect against different dataIds
737 # producing the same filename and causing overwrite confusion.
738 template.validateTemplate(ref)
740 location = self.locationFactory.fromPath(template.format(ref))
742 # Get the formatter based on the storage class
743 storageClass = ref.datasetType.storageClass
744 try:
745 formatter = self.formatterFactory.getFormatter(
746 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
747 )
748 except KeyError as e:
749 raise DatasetTypeNotSupportedError(
750 f"Unable to find formatter for {ref} in datastore {self.name}"
751 ) from e
753 # Now that we know the formatter, update the location
754 location = formatter.makeUpdatedLocation(location)
756 return location, formatter
758 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
759 # Docstring inherited from base class
760 if transfer != "auto":
761 return transfer
763 # See if the paths are within the datastore or not
764 inside = [self._pathInStore(d.path) is not None for d in datasets]
766 if all(inside):
767 transfer = None
768 elif not any(inside):
769 # Allow ResourcePath to use its own knowledge
770 transfer = "auto"
771 else:
772 # This can happen when importing from a datastore that
773 # has had some datasets ingested using "direct" mode.
774 # Also allow ResourcePath to sort it out, but warn about it.
777 log.warning(
778 "Some datasets are inside the datastore and some are outside. Using 'split' "
779 "transfer mode. This assumes that the files outside the datastore are "
780 "still accessible to the new butler since they will not be copied into "
781 "the target datastore."
782 )
783 transfer = "split"
785 return transfer
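# Illustrative summary of the "auto" resolution above (hypothetical datasets):
#
#     every dataset path inside the datastore root   -> transfer=None    (use files in place)
#     every dataset path outside the datastore root  -> transfer="auto"  (ResourcePath decides)
#     a mixture of inside and outside                -> transfer="split" (warning logged above)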
787 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
788 """Return path relative to datastore root.
790 Parameters
791 ----------
792 path : `lsst.resources.ResourcePathExpression`
793 Path to dataset. Can be an absolute URI. If relative, it is
794 assumed to be relative to the datastore root.
797 Returns
798 -------
799 inStore : `str`
800 Path relative to datastore root. Returns `None` if the file is
801 outside the root.
802 """
803 # Relative path will always be relative to datastore
804 pathUri = ResourcePath(path, forceAbsolute=False)
805 return pathUri.relative_to(self.root)
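# Illustrative sketch, assuming a hypothetical datastore root of "file:///repo":
#
#     self._pathInStore("a/b/c.fits")                -> "a/b/c.fits"
#     self._pathInStore("file:///repo/a/b/c.fits")   -> "a/b/c.fits"
#     self._pathInStore("file:///elsewhere/c.fits")  -> None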
807 def _standardizeIngestPath(
808 self, path: str | ResourcePath, *, transfer: str | None = None
809 ) -> str | ResourcePath:
810 """Standardize the path of a to-be-ingested file.
812 Parameters
813 ----------
814 path : `str` or `lsst.resources.ResourcePath`
815 Path of a file to be ingested. This parameter is not expected
816 to be all the types that can be used to construct a
817 `~lsst.resources.ResourcePath`.
818 transfer : `str`, optional
819 How (and whether) the dataset should be added to the datastore.
820 See `ingest` for details of transfer modes.
821 This implementation is provided only so
822 `NotImplementedError` can be raised if the mode is not supported;
823 actual transfers are deferred to `_extractIngestInfo`.
825 Returns
826 -------
827 path : `str` or `lsst.resources.ResourcePath`
828 New path in what the datastore considers standard form. If an
829 absolute URI was given that will be returned unchanged.
831 Notes
832 -----
833 Subclasses of `FileDatastore` can implement this method instead
834 of `_prepIngest`. It should not modify the data repository or given
835 file in any way.
837 Raises
838 ------
839 NotImplementedError
840 Raised if the datastore does not support the given transfer mode
841 (including the case where ingest is not supported at all).
842 FileNotFoundError
843 Raised if one of the given files does not exist.
844 """
845 if transfer not in (None, "direct", "split") + self.root.transferModes:
846 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
848 # A relative URI indicates relative to datastore root
849 srcUri = ResourcePath(path, forceAbsolute=False)
850 if not srcUri.isabs():
851 srcUri = self.root.join(path)
853 if not srcUri.exists():
854 raise FileNotFoundError(
855 f"Resource at {srcUri} does not exist; note that paths to ingest "
856 f"are assumed to be relative to {self.root} unless they are absolute."
857 )
859 if transfer is None:
860 relpath = srcUri.relative_to(self.root)
861 if not relpath:
862 raise RuntimeError(
863 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
864 )
866 # Return the relative path within the datastore for internal
867 # transfer
868 path = relpath
870 return path
872 def _extractIngestInfo(
873 self,
874 path: ResourcePathExpression,
875 ref: DatasetRef,
876 *,
877 formatter: Formatter | type[Formatter],
878 transfer: str | None = None,
879 record_validation_info: bool = True,
880 ) -> StoredFileInfo:
881 """Relocate (if necessary) and extract `StoredFileInfo` from a
882 to-be-ingested file.
884 Parameters
885 ----------
886 path : `lsst.resources.ResourcePathExpression`
887 URI or path of a file to be ingested.
888 ref : `DatasetRef`
889 Reference for the dataset being ingested. Guaranteed to have
890 ``dataset_id is not None``.
891 formatter : `type` or `Formatter`
892 `Formatter` subclass to use for this dataset or an instance.
893 transfer : `str`, optional
894 How (and whether) the dataset should be added to the datastore.
895 See `ingest` for details of transfer modes.
896 record_validation_info : `bool`, optional
897 If `True`, the default, the datastore can record validation
898 information associated with the file. If `False` the datastore
899 will not attempt to track any information such as checksums
900 or file sizes. This can be useful if such information is tracked
901 in an external system or if the file is to be compressed in place.
902 It is up to the datastore whether this parameter is relevant.
904 Returns
905 -------
906 info : `StoredFileInfo`
907 Internal datastore record for this file. This will be inserted by
908 the caller; the `_extractIngestInfo` is only responsible for
909 creating and populating the struct.
911 Raises
912 ------
913 FileNotFoundError
914 Raised if one of the given files does not exist.
915 FileExistsError
916 Raised if transfer is not `None` but the (internal) location the
917 file would be moved to is already occupied.
918 """
919 if self._transaction is None:
920 raise RuntimeError("Ingest called without transaction enabled")
922 # Create URI of the source path, do not need to force a relative
923 # path to absolute.
924 srcUri = ResourcePath(path, forceAbsolute=False)
926 # Track whether we have read the size of the source yet
927 have_sized = False
929 tgtLocation: Location | None
930 if transfer is None or transfer == "split":
931 # A relative path is assumed to be relative to the datastore
932 # in this context
933 if not srcUri.isabs():
934 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
935 else:
936 # Work out the path in the datastore from an absolute URI
937 # This is required to be within the datastore.
938 pathInStore = srcUri.relative_to(self.root)
939 if pathInStore is None and transfer is None:
940 raise RuntimeError(
941 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
942 )
943 if pathInStore:
944 tgtLocation = self.locationFactory.fromPath(pathInStore)
945 elif transfer == "split":
946 # Outside the datastore but treat that as a direct ingest
947 # instead.
948 tgtLocation = None
949 else:
950 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
951 elif transfer == "direct":
952 # Want to store the full URI to the resource directly in
953 # datastore. This is useful for referring to permanent archive
954 # storage for raw data.
955 # Trust that people know what they are doing.
956 tgtLocation = None
957 else:
958 # Work out the name we want this ingested file to have
959 # inside the datastore
960 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
961 if not tgtLocation.uri.dirname().exists():
962 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
963 tgtLocation.uri.dirname().mkdir()
965 # if we are transferring from a local file to a remote location
966 # it may be more efficient to get the size and checksum of the
967 # local file rather than the transferred one
968 if record_validation_info and srcUri.isLocal:
969 size = srcUri.size()
970 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
971 have_sized = True
973 # Transfer the resource to the destination.
974 # Allow overwrite of an existing file. This matches the behavior
975 # of datastore.put() in that it trusts that registry would not
976 # be asking to overwrite unless registry thought that the
977 # overwrite was allowed.
978 tgtLocation.uri.transfer_from(
979 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
980 )
982 if tgtLocation is None:
983 # This means we are using direct mode
984 targetUri = srcUri
985 targetPath = str(srcUri)
986 else:
987 targetUri = tgtLocation.uri
988 targetPath = tgtLocation.pathInStore.path
990 # the file should exist in the datastore now
991 if record_validation_info:
992 if not have_sized:
993 size = targetUri.size()
994 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
995 else:
996 # Not recording any file information.
997 size = -1
998 checksum = None
1000 return StoredFileInfo(
1001 formatter=formatter,
1002 path=targetPath,
1003 storageClass=ref.datasetType.storageClass,
1004 component=ref.datasetType.component(),
1005 file_size=size,
1006 checksum=checksum,
1007 dataset_id=ref.id,
1008 )
1010 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
1011 # Docstring inherited from Datastore._prepIngest.
1012 filtered = []
1013 for dataset in datasets:
1014 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1015 if not acceptable:
1016 continue
1017 else:
1018 dataset.refs = acceptable
1019 if dataset.formatter is None:
1020 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1021 else:
1022 assert isinstance(dataset.formatter, type | str)
1023 formatter_class = get_class_of(dataset.formatter)
1024 if not issubclass(formatter_class, Formatter):
1025 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1026 dataset.formatter = formatter_class
1027 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1028 filtered.append(dataset)
1029 return _IngestPrepData(filtered)
1031 @transactional
1032 def _finishIngest(
1033 self,
1034 prepData: Datastore.IngestPrepData,
1035 *,
1036 transfer: str | None = None,
1037 record_validation_info: bool = True,
1038 ) -> None:
1039 # Docstring inherited from Datastore._finishIngest.
1040 refsAndInfos = []
1041 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1042 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1043 # Do ingest as if the first dataset ref is associated with the file
1044 info = self._extractIngestInfo(
1045 dataset.path,
1046 dataset.refs[0],
1047 formatter=dataset.formatter,
1048 transfer=transfer,
1049 record_validation_info=record_validation_info,
1050 )
1051 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1053 # In direct mode we can allow repeated ingests of the same thing
1054 # if we are sure that the external dataset is immutable. We use
1055 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are
1056 # separated.
1057 refs_and_infos_replace = []
1058 refs_and_infos_insert = []
1059 if transfer == "direct":
1060 for entry in refsAndInfos:
1061 if entry[0].id.version == 5:
1062 refs_and_infos_replace.append(entry)
1063 else:
1064 refs_and_infos_insert.append(entry)
1065 else:
1066 refs_and_infos_insert = refsAndInfos
1068 if refs_and_infos_insert:
1069 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT)
1070 if refs_and_infos_replace:
1071 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE)
1073 def _calculate_ingested_datastore_name(
1074 self,
1075 srcUri: ResourcePath,
1076 ref: DatasetRef,
1077 formatter: Formatter | type[Formatter] | None = None,
1078 ) -> Location:
1079 """Given a source URI and a DatasetRef, determine the name the
1080 dataset will have inside datastore.
1082 Parameters
1083 ----------
1084 srcUri : `lsst.resources.ResourcePath`
1085 URI to the source dataset file.
1086 ref : `DatasetRef`
1087 Ref associated with the newly-ingested dataset artifact. This
1088 is used to determine the name within the datastore.
1089 formatter : `Formatter` or `type` [`Formatter`], optional
1090 Formatter to use for validation. Can be a class or an instance.
1091 No validation of the file extension is performed if the
1092 ``formatter`` is `None`. This can be used if the caller knows
1093 that the source URI and target URI will use the same formatter.
1095 Returns
1096 -------
1097 location : `Location`
1098 Target location for the newly-ingested dataset.
1099 """
1100 # Ingesting a file from outside the datastore.
1101 # This involves a new name.
1102 template = self.templates.getTemplate(ref)
1103 location = self.locationFactory.fromPath(template.format(ref))
1105 # Get the extension
1106 ext = srcUri.getExtension()
1108 # Update the destination to include that extension
1109 location.updateExtension(ext)
1111 # Ask the formatter to validate this extension
1112 if formatter is not None:
1113 formatter.validateExtension(location)
1115 return location
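# Illustrative sketch (hypothetical template output, source file and formatter;
# ``SomeFormatter`` is a stand-in): if the file template for ``ref`` expands to
# "raw/r/20231025/exposure", then
#
#     loc = self._calculate_ingested_datastore_name(
#         ResourcePath("/archive/exp_000123.fits"), ref, formatter=SomeFormatter
#     )
#
# gives a location whose path in the datastore is "raw/r/20231025/exposure.fits":
# the ".fits" extension is copied from the source URI and, because a formatter
# was supplied, validated against it.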
1117 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1118 """Write out in memory dataset to datastore.
1120 Parameters
1121 ----------
1122 inMemoryDataset : `object`
1123 Dataset to write to datastore.
1124 ref : `DatasetRef`
1125 Registry information associated with this dataset.
1127 Returns
1128 -------
1129 info : `StoredFileInfo`
1130 Information describing the artifact written to the datastore.
1131 """
1132 # May need to coerce the in memory dataset to the correct
1133 # python type, but first we need to make sure the storage class
1134 # reflects the one defined in the data repository.
1135 ref = self._cast_storage_class(ref)
1136 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1138 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1139 uri = location.uri
1141 if not uri.dirname().exists():
1142 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1143 uri.dirname().mkdir()
1145 if self._transaction is None:
1146 raise RuntimeError("Attempting to write artifact without transaction enabled")
1148 def _removeFileExists(uri: ResourcePath) -> None:
1149 """Remove a file and do not complain if it is not there.
1151 This is important since a formatter might fail before the file
1152 is written and we should not confuse people by writing spurious
1153 error messages to the log.
1154 """
1155 with contextlib.suppress(FileNotFoundError):
1156 uri.remove()
1158 # Register a callback to try to delete the uploaded data if
1159 # something fails below
1160 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1162 data_written = False
1164 # For remote URIs some datasets can be serialized directly
1165 # to bytes and sent to the remote datastore without writing a
1166 # file. If the dataset is intended to be saved to the cache
1167 # a file is always written and direct write to the remote
1168 # datastore is bypassed.
1169 if not uri.isLocal and not self.cacheManager.should_be_cached(ref):
1170 # Remote URI that is not cached so can write directly.
1171 try:
1172 serializedDataset = formatter.toBytes(inMemoryDataset)
1173 except NotImplementedError:
1174 # Fallback to the file writing option.
1175 pass
1176 except Exception as e:
1177 raise RuntimeError(
1178 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1179 ) from e
1180 else:
1181 log.debug("Writing bytes directly to %s", uri)
1182 uri.write(serializedDataset, overwrite=True)
1183 log.debug("Successfully wrote bytes directly to %s", uri)
1184 data_written = True
1186 if not data_written:
1187 # Did not write the bytes directly to object store so instead
1188 # write to temporary file. Always write to a temporary even if
1189 # using a local file system -- that gives us atomic writes.
1190 # If a process is killed as the file is being written we do not
1191 # want it to remain in the correct place but in corrupt state.
1192 # For local files write to the output directory not temporary dir.
1193 prefix = uri.dirname() if uri.isLocal else None
1194 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1195 # Need to configure the formatter to write to a different
1196 # location and that needs us to overwrite internals
1197 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1198 with formatter._updateLocation(Location(None, temporary_uri)):
1199 try:
1200 formatter.write(inMemoryDataset)
1201 except Exception as e:
1202 raise RuntimeError(
1203 f"Failed to serialize dataset {ref} of type"
1204 f" {type(inMemoryDataset)} to "
1205 f"temporary location {temporary_uri}"
1206 ) from e
1208 # Use move for a local file since that becomes an efficient
1209 # os.rename. For remote resources we use copy to allow the
1210 # file to be cached afterwards.
1211 transfer = "move" if uri.isLocal else "copy"
1213 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1215 if transfer == "copy":
1216 # Cache if required
1217 self.cacheManager.move_to_cache(temporary_uri, ref)
1219 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1221 # URI is needed to resolve what ingest case are we dealing with
1222 return self._extractIngestInfo(uri, ref, formatter=formatter)
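# Summary of the write paths above (added for clarity):
#
#     remote URI, not destined for the cache, formatter.toBytes() implemented
#         -> serialize to bytes and upload directly
#     otherwise
#         -> write to a temporary file, then transfer with "move" (local target)
#            or "copy" (remote target); after a "copy" the temporary file may be
#            moved into the local cache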
1224 def _read_artifact_into_memory(
1225 self,
1226 getInfo: DatastoreFileGetInformation,
1227 ref: DatasetRef,
1228 isComponent: bool = False,
1229 cache_ref: DatasetRef | None = None,
1230 ) -> Any:
1231 """Read the artifact from datastore into in memory object.
1233 Parameters
1234 ----------
1235 getInfo : `DatastoreFileGetInformation`
1236 Information about the artifact within the datastore.
1237 ref : `DatasetRef`
1238 The registry information associated with this artifact.
1239 isComponent : `bool`
1240 Flag to indicate if a component is being read from this artifact.
1241 cache_ref : `DatasetRef`, optional
1242 The DatasetRef to use when looking up the file in the cache.
1243 This ref must have the same ID as the supplied ref but can
1244 be a parent ref or component ref to indicate to the cache whether
1245 a composite file is being requested from the cache or a component
1246 file. Without this the cache will default to the supplied ref but
1247 it can get confused with read-only derived components for
1248 disassembled composites.
1250 Returns
1251 -------
1252 inMemoryDataset : `object`
1253 The artifact as a python object.
1254 """
1255 location = getInfo.location
1256 uri = location.uri
1257 log.debug("Accessing data from %s", uri)
1259 if cache_ref is None:
1260 cache_ref = ref
1261 if cache_ref.id != ref.id:
1262 raise ValueError(
1263 "The supplied cache dataset ref refers to a different dataset than expected:"
1264 f" {ref.id} != {cache_ref.id}"
1265 )
1267 # Cannot recalculate checksum but can compare size as a quick check
1268 # Do not do this if the size is negative since that indicates
1269 # we do not know.
1270 recorded_size = getInfo.info.file_size
1271 resource_size = uri.size()
1272 if recorded_size >= 0 and resource_size != recorded_size:
1273 raise RuntimeError(
1274 "Integrity failure in Datastore. "
1275 f"Size of file {uri} ({resource_size}) "
1276 f"does not match size recorded in registry of {recorded_size}"
1277 )
1279 # For the general case we have choices for how to proceed.
1280 # 1. Always use a local file (downloading the remote resource to a
1281 # temporary file if needed).
1282 # 2. Use a threshold size and read into memory and use bytes.
1283 # Use both for now with an arbitrary hand off size.
1284 # This allows small datasets to be downloaded from remote object
1285 # stores without requiring a temporary file.
1287 formatter = getInfo.formatter
1288 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1289 if resource_size <= nbytes_max and formatter.can_read_bytes():
1290 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1291 if cached_file is not None:
1292 desired_uri = cached_file
1293 msg = f" (cached version of {uri})"
1294 else:
1295 desired_uri = uri
1296 msg = ""
1297 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1298 serializedDataset = desired_uri.read()
1299 log.debug(
1300 "Deserializing %s from %d bytes from location %s with formatter %s",
1301 f"component {getInfo.component}" if isComponent else "",
1302 len(serializedDataset),
1303 uri,
1304 formatter.name(),
1305 )
1306 try:
1307 result = formatter.fromBytes(
1308 serializedDataset, component=getInfo.component if isComponent else None
1309 )
1310 except Exception as e:
1311 raise ValueError(
1312 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1313 f" ({ref.datasetType.name} from {uri}): {e}"
1314 ) from e
1315 else:
1316 # Read from file.
1318 # Have to update the Location associated with the formatter
1319 # because formatter.read does not allow an override.
1320 # This could be improved.
1321 location_updated = False
1322 msg = ""
1324 # First check in cache for local version.
1325 # The cache will only be relevant for remote resources but
1326 # no harm in always asking. Context manager ensures that cache
1327 # file is not deleted during cache expiration.
1328 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1329 if cached_file is not None:
1330 msg = f"(via cache read of remote file {uri})"
1331 uri = cached_file
1332 location_updated = True
1334 with uri.as_local() as local_uri:
1335 can_be_cached = False
1336 if uri != local_uri:
1337 # URI was remote and file was downloaded
1338 cache_msg = ""
1339 location_updated = True
1341 if self.cacheManager.should_be_cached(cache_ref):
1342 # In this scenario we want to ask if the downloaded
1343 # file should be cached but we should not cache
1344 # it until after we've used it (to ensure it can't
1345 # be expired whilst we are using it).
1346 can_be_cached = True
1348 # Say that it is "likely" to be cached because
1349 # if the formatter read fails we will not be
1350 # caching this file.
1351 cache_msg = " and likely cached"
1353 msg = f"(via download to local file{cache_msg})"
1355 # Calculate the (possibly) new location for the formatter
1356 # to use.
1357 newLocation = Location(*local_uri.split()) if location_updated else None
1359 log.debug(
1360 "Reading%s from location %s %s with formatter %s",
1361 f" component {getInfo.component}" if isComponent else "",
1362 uri,
1363 msg,
1364 formatter.name(),
1365 )
1366 try:
1367 with (
1368 formatter._updateLocation(newLocation),
1369 time_this(
1370 log,
1371 msg="Reading%s from location %s %s with formatter %s",
1372 args=(
1373 f" component {getInfo.component}" if isComponent else "",
1374 uri,
1375 msg,
1376 formatter.name(),
1377 ),
1378 ),
1379 ):
1380 result = formatter.read(component=getInfo.component if isComponent else None)
1381 except Exception as e:
1382 raise ValueError(
1383 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1384 f" ({ref.datasetType.name} from {uri}): {e}"
1385 ) from e
1387 # File was read successfully so can move to cache
1388 if can_be_cached:
1389 self.cacheManager.move_to_cache(local_uri, cache_ref)
1391 return self._post_process_get(
1392 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
1393 )
1395 def knows(self, ref: DatasetRef) -> bool:
1396 """Check if the dataset is known to the datastore.
1398 Does not check for existence of any artifact.
1400 Parameters
1401 ----------
1402 ref : `DatasetRef`
1403 Reference to the required dataset.
1405 Returns
1406 -------
1407 exists : `bool`
1408 `True` if the dataset is known to the datastore.
1409 """
1410 fileLocations = self._get_dataset_locations_info(ref)
1411 if fileLocations:
1412 return True
1413 return False
1415 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1416 # Docstring inherited from the base class.
1418 # The records themselves. Could be missing some entries.
1419 records = self._get_stored_records_associated_with_refs(refs)
1421 return {ref: ref.id in records for ref in refs}
1423 def _process_mexists_records(
1424 self,
1425 id_to_ref: dict[DatasetId, DatasetRef],
1426 records: dict[DatasetId, list[StoredFileInfo]],
1427 all_required: bool,
1428 artifact_existence: dict[ResourcePath, bool] | None = None,
1429 ) -> dict[DatasetRef, bool]:
1430 """Check given records for existence.
1432 Helper function for `mexists()`.
1434 Parameters
1435 ----------
1436 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1437 Mapping of the dataset ID to the dataset ref itself.
1438 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1439 Records as generally returned by
1440 ``_get_stored_records_associated_with_refs``.
1441 all_required : `bool`
1442 Flag to indicate whether existence requires all artifacts
1443 associated with a dataset ID to exist or not for existence.
1444 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1445 Optional mapping of datastore artifact to existence. Updated by
1446 this method with details of all artifacts tested. Can be `None`
1447 if the caller is not interested.
1449 Returns
1450 -------
1451 existence : `dict` of [`DatasetRef`, `bool`]
1452 Mapping from dataset to boolean indicating existence.
1453 """
1454 # The URIs to be checked and a mapping of those URIs to
1455 # the dataset ID.
1456 uris_to_check: list[ResourcePath] = []
1457 location_map: dict[ResourcePath, DatasetId] = {}
1459 location_factory = self.locationFactory
1461 uri_existence: dict[ResourcePath, bool] = {}
1462 for ref_id, infos in records.items():
1463 # Key is the dataset ID, value is a list of StoredFileInfo
1464 uris = [info.file_location(location_factory).uri for info in infos]
1465 location_map.update({uri: ref_id for uri in uris})
1467 # Check the local cache directly for a dataset corresponding
1468 # to the remote URI.
1469 if self.cacheManager.file_count > 0:
1470 ref = id_to_ref[ref_id]
1471 for uri, storedFileInfo in zip(uris, infos, strict=True):
1472 check_ref = ref
1473 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1474 check_ref = ref.makeComponentRef(component)
1475 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1476 # Proxy for URI existence.
1477 uri_existence[uri] = True
1478 else:
1479 uris_to_check.append(uri)
1480 else:
1481 # Check all of them.
1482 uris_to_check.extend(uris)
1484 if artifact_existence is not None:
1485 # If a URI has already been checked remove it from the list
1486 # and immediately add the status to the output dict.
1487 filtered_uris_to_check = []
1488 for uri in uris_to_check:
1489 if uri in artifact_existence:
1490 uri_existence[uri] = artifact_existence[uri]
1491 else:
1492 filtered_uris_to_check.append(uri)
1493 uris_to_check = filtered_uris_to_check
1495 # Results.
1496 dataset_existence: dict[DatasetRef, bool] = {}
1498 uri_existence.update(ResourcePath.mexists(uris_to_check))
1499 for uri, exists in uri_existence.items():
1500 dataset_id = location_map[uri]
1501 ref = id_to_ref[dataset_id]
1503 # Disassembled composite needs to check all locations.
1504 # all_required indicates whether all need to exist or not.
1505 if ref in dataset_existence:
1506 if all_required:
1507 exists = dataset_existence[ref] and exists
1508 else:
1509 exists = dataset_existence[ref] or exists
1510 dataset_existence[ref] = exists
1512 if artifact_existence is not None:
1513 artifact_existence.update(uri_existence)
1515 return dataset_existence
1517 def mexists(
1518 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1519 ) -> dict[DatasetRef, bool]:
1520 """Check the existence of multiple datasets at once.
1522 Parameters
1523 ----------
1524 refs : iterable of `DatasetRef`
1525 The datasets to be checked.
1526 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1527 Optional mapping of datastore artifact to existence. Updated by
1528 this method with details of all artifacts tested. Can be `None`
1529 if the caller is not interested.
1531 Returns
1532 -------
1533 existence : `dict` of [`DatasetRef`, `bool`]
1534 Mapping from dataset to boolean indicating existence.
1536 Notes
1537 -----
1538 To minimize potentially costly remote existence checks, the local
1539 cache is checked as a proxy for existence. If a file for this
1540 `DatasetRef` does exist no check is done for the actual URI. This
1541 could result in possibly unexpected behavior if the dataset itself
1542 has been removed from the datastore by another process whilst it is
1543 still in the cache.
1544 """
1545 chunk_size = 10_000
1546 dataset_existence: dict[DatasetRef, bool] = {}
1547 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1548 n_found_total = 0
1549 n_checked = 0
1550 n_chunks = 0
1551 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1552 chunk_result = self._mexists(chunk, artifact_existence)
1554 # The log message level and content depend on how many
1555 # datasets we are processing.
1556 n_results = len(chunk_result)
1558 # Use verbose logging to ensure that messages can be seen
1559 # easily if many refs are being checked.
1560 log_threshold = VERBOSE
1561 n_checked += n_results
1563 # This sum can take some time so only do it if we know the
1564 # result is going to be used.
1565 n_found = 0
1566 if log.isEnabledFor(log_threshold):
1567 # Can treat the booleans as 0, 1 integers and sum them.
1568 n_found = sum(chunk_result.values())
1569 n_found_total += n_found
1571 # We are deliberately not trying to count the number of refs
1572 # provided in case it's in the millions. This means there is a
1573 # situation where the number of refs exactly matches the chunk
1574 # size and we will switch to the multi-chunk path even though
1575 # we only have a single chunk.
1576 if n_results < chunk_size and n_chunks == 0:
1577 # Single chunk will be processed so we can provide more detail.
1578 if n_results == 1:
1579 ref = list(chunk_result)[0]
1580 # Use debug logging to be consistent with `exists()`.
1581 log.debug(
1582 "Calling mexists() with single ref that does%s exist (%s).",
1583 "" if chunk_result[ref] else " not",
1584 ref,
1585 )
1586 else:
1587 # Single chunk but multiple files. Summarize.
1588 log.log(
1589 log_threshold,
1590 "Number of datasets found in datastore: %d out of %d datasets checked.",
1591 n_found,
1592 n_checked,
1593 )
1595 else:
1596 # Use incremental verbose logging when we have multiple chunks.
1597 log.log(
1598 log_threshold,
1599 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1600 "(running total from all chunks so far: %d found out of %d checked)",
1601 n_chunks,
1602 n_found,
1603 n_results,
1604 n_found_total,
1605 n_checked,
1606 )
1607 dataset_existence.update(chunk_result)
1608 n_chunks += 1
1610 return dataset_existence
1612 def _mexists(
1613 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1614 ) -> dict[DatasetRef, bool]:
1615 """Check the existence of multiple datasets at once.
1617 Parameters
1618 ----------
1619 refs : sequence of `DatasetRef`
1620 The datasets to be checked.
1621 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1622 Optional mapping of datastore artifact to existence. Updated by
1623 this method with details of all artifacts tested. Can be `None`
1624 if the caller is not interested.
1626 Returns
1627 -------
1628 existence : `dict` [`DatasetRef`, `bool`]
1629 Mapping from dataset to boolean indicating existence.
1630 """
1631 # Make a mapping from refs with the internal storage class to the given
1632 # refs that may have a different one. We'll use the internal refs
1633 # throughout this method and convert back at the very end.
1634 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1636 # Need a mapping of dataset_id to (internal) dataset ref since some
1637 # internal APIs work with dataset_id.
1638 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1640 # Set of all IDs we are checking for.
1641 requested_ids = set(id_to_ref.keys())
1643 # The records themselves. Could be missing some entries.
1644 records = self._get_stored_records_associated_with_refs(id_to_ref.values())
1646 dataset_existence = self._process_mexists_records(
1647 id_to_ref, records, True, artifact_existence=artifact_existence
1648 )
1650 # Set of IDs that have been handled.
1651 handled_ids = {ref.id for ref in dataset_existence}
1653 missing_ids = requested_ids - handled_ids
1654 if missing_ids:
1655 dataset_existence.update(
1656 self._mexists_check_expected(
1657 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1658 )
1659 )
1661 return {
1662 internal_ref_to_input_ref[internal_ref]: existence
1663 for internal_ref, existence in dataset_existence.items()
1664 }
1666 def _mexists_check_expected(
1667 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1668 ) -> dict[DatasetRef, bool]:
1669 """Check existence of refs that are not known to datastore.
1671 Parameters
1672 ----------
1673 refs : sequence of `DatasetRef`
1674 The datasets to be checked. These are assumed not to be known
1675 to datastore.
1676 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1677 Optional mapping of datastore artifact to existence. Updated by
1678 this method with details of all artifacts tested. Can be `None`
1679 if the caller is not interested.
1681 Returns
1682 -------
1683 existence : `dict` [`DatasetRef`, `bool`]
1684 Mapping from dataset to boolean indicating existence.
1685 """
1686 dataset_existence: dict[DatasetRef, bool] = {}
1687 if not self.trustGetRequest:
1688 # Must assume these do not exist
1689 for ref in refs:
1690 dataset_existence[ref] = False
1691 else:
1692 log.debug(
1693 "%d datasets were not known to datastore during initial existence check.",
1694 len(refs),
1695 )
1697 # Construct data structure identical to that returned
1698 # by _get_stored_records_associated_with_refs() but using
1699 # guessed names.
1700 records = {}
1701 id_to_ref = {}
1702 for missing_ref in refs:
1703 expected = self._get_expected_dataset_locations_info(missing_ref)
1704 dataset_id = missing_ref.id
1705 records[dataset_id] = [info for _, info in expected]
1706 id_to_ref[dataset_id] = missing_ref
1708 dataset_existence.update(
1709 self._process_mexists_records(
1710 id_to_ref,
1711 records,
1712 False,
1713 artifact_existence=artifact_existence,
1714 )
1715 )
1717 return dataset_existence
1719 def exists(self, ref: DatasetRef) -> bool:
1720 """Check if the dataset exists in the datastore.
1722 Parameters
1723 ----------
1724 ref : `DatasetRef`
1725 Reference to the required dataset.
1727 Returns
1728 -------
1729 exists : `bool`
1730 `True` if the entity exists in the `Datastore`.
1732 Notes
1733 -----
1734 The local cache is checked as a proxy for existence in the remote
1735 object store. It is possible that another process on a different
1736 compute node could remove the file from the object store even
1737 though it is present in the local cache.
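Examples
--------
Illustrative sketch only; ``datastore`` and ``ref`` are placeholder
names for an existing `FileDatastore` and a resolved `DatasetRef`.
>>> datastore.exists(ref)  # doctest: +SKIP
True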
1738 """
1739 ref = self._cast_storage_class(ref)
1740 fileLocations = self._get_dataset_locations_info(ref)
1742 # If we are being asked to trust that the registry might not be
1743 # correct, we ask for the expected locations and check them explicitly.
1744 if not fileLocations:
1745 if not self.trustGetRequest:
1746 return False
1748 # First check the cache. If it is not found we must check
1749 # the datastore itself. Assume that any component in the cache
1750 # means that the dataset does exist somewhere.
1751 if self.cacheManager.known_to_cache(ref):
1752 return True
1754 # When we are guessing a dataset location we can not check
1755 # for the existence of every component since we can not
1756 # know if every component was written. Instead we check
1757 # for the existence of any of the expected locations.
1758 for location, _ in self._get_expected_dataset_locations_info(ref):
1759 if self._artifact_exists(location):
1760 return True
1761 return False
1763 # All listed artifacts must exist.
1764 for location, storedFileInfo in fileLocations:
1765 # Checking in cache needs the component ref.
1766 check_ref = ref
1767 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1768 check_ref = ref.makeComponentRef(component)
1769 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1770 continue
1772 if not self._artifact_exists(location):
1773 return False
1775 return True
1777 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1778 """Return URIs associated with dataset.
1780 Parameters
1781 ----------
1782 ref : `DatasetRef`
1783 Reference to the required dataset.
1784 predict : `bool`, optional
1785 If the datastore does not know about the dataset, should it
1786 return a predicted URI or not?
1788 Returns
1789 -------
1790 uris : `DatasetRefURIs`
1791 The URI to the primary artifact associated with this dataset (if
1792 the dataset was disassembled within the datastore this may be
1793 `None`), and the URIs to any components associated with the dataset
1794 artifact (can be empty if there are no components).
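Examples
--------
Illustrative sketch only; ``datastore`` and ``ref`` are placeholders.
The ``primaryURI`` and ``componentURIs`` attributes are part of
`DatasetRefURIs`.
>>> uris = datastore.getURIs(ref)  # doctest: +SKIP
>>> primary, components = uris.primaryURI, uris.componentURIs  # doctest: +SKIP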
1795 """
1796 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1797 return many[ref]
1799 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1800 """URI to the Dataset.
1802 Parameters
1803 ----------
1804 ref : `DatasetRef`
1805 Reference to the required Dataset.
1806 predict : `bool`
1807 If `True`, allow URIs to be returned of datasets that have not
1808 been written.
1810 Returns
1811 -------
1812 uri : `lsst.resources.ResourcePath`
1813 URI pointing to the dataset within the datastore. If the
1814 dataset does not exist in the datastore, and if ``predict`` is
1815 `True`, the URI will be a prediction and will include a URI
1816 fragment "#predicted".
1817 If the datastore does not have entities that relate well
1818 to the concept of a URI, the returned URI will be
1819 descriptive. The returned URI is not guaranteed to be obtainable.
1821 Raises
1822 ------
1823 FileNotFoundError
1824 Raised if a URI has been requested for a dataset that does not
1825 exist and guessing is not allowed.
1826 RuntimeError
1827 Raised if a request is made for a single URI but multiple URIs
1828 are associated with this dataset.
1830 Notes
1831 -----
1832 When a predicted URI is requested an attempt will be made to form
1833 a reasonable URI based on file templates and the expected formatter.
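Examples
--------
Illustrative sketch only; ``datastore`` and ``ref`` are placeholders.
>>> uri = datastore.getURI(ref)  # doctest: +SKIP
>>> predicted = datastore.getURI(ref, predict=True)  # doctest: +SKIP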
1834 """
1835 primary, components = self.getURIs(ref, predict)
1836 if primary is None or components:
1837 raise RuntimeError(
1838 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1839 )
1840 return primary
1842 def _predict_URIs(
1843 self,
1844 ref: DatasetRef,
1845 ) -> DatasetRefURIs:
1846 """Predict the URIs of a dataset ref.
1848 Parameters
1849 ----------
1850 ref : `DatasetRef`
1851 Reference to the required Dataset.
1853 Returns
1854 -------
1855 uris : `DatasetRefURIs`
1856 Primary and component URIs. URIs will contain a URI fragment
1857 "#predicted".
1858 """
1859 uris = DatasetRefURIs()
1861 if self.composites.shouldBeDisassembled(ref):
1862 for component, _ in ref.datasetType.storageClass.components.items():
1863 comp_ref = ref.makeComponentRef(component)
1864 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1866 # Add the "#predicted" URI fragment to indicate this is a
1867 # guess
1868 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1870 else:
1871 location, _ = self._determine_put_formatter_location(ref)
1873 # Add the "#predicted" URI fragment to indicate this is a guess
1874 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1876 return uris
1878 def getManyURIs(
1879 self,
1880 refs: Iterable[DatasetRef],
1881 predict: bool = False,
1882 allow_missing: bool = False,
1883 ) -> dict[DatasetRef, DatasetRefURIs]:
1884 # Docstring inherited
1886 uris: dict[DatasetRef, DatasetRefURIs] = {}
1888 records = self._get_stored_records_associated_with_refs(refs)
1889 records_keys = records.keys()
1891 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1892 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1894 # Have to handle trustGetRequest mode by checking for the existence
1895 # of the missing refs on disk.
1896 if missing_refs:
1897 dataset_existence = self._mexists_check_expected(missing_refs, None)
1898 really_missing = set()
1899 not_missing = set()
1900 for ref, exists in dataset_existence.items():
1901 if exists:
1902 not_missing.add(ref)
1903 else:
1904 really_missing.add(ref)
1906 if not_missing:
1907 # Need to recalculate the missing/existing split.
1908 existing_refs = existing_refs + tuple(not_missing)
1909 missing_refs = tuple(really_missing)
1911 for ref in missing_refs:
1912 # if this has never been written then we have to guess
1913 if not predict:
1914 if not allow_missing:
1915 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1916 else:
1917 uris[ref] = self._predict_URIs(ref)
1919 for ref in existing_refs:
1920 file_infos = records[ref.id]
1921 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1922 uris[ref] = self._locations_to_URI(ref, file_locations)
1924 return uris
1926 def _locations_to_URI(
1927 self,
1928 ref: DatasetRef,
1929 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1930 ) -> DatasetRefURIs:
1931 """Convert one or more file locations associated with a DatasetRef
1932 to a DatasetRefURIs.
1934 Parameters
1935 ----------
1936 ref : `DatasetRef`
1937 Reference to the dataset.
1938 file_locations : sequence of `tuple` [`Location`, `StoredFileInfo`]
1939 Each item in the sequence is the location of the dataset within the
1940 datastore and stored information about the file and its formatter.
1941 If there is only one item in the sequence then it is treated as the
1942 primary URI. If there is more than one item then they are treated
1943 as component URIs. If there are no items then an error is raised
1944 unless ``self.trustGetRequest`` is `True`.
1946 Returns
1947 -------
1948 uris : `DatasetRefURIs`
1949 Represents the primary URI or component URIs described by the
1950 inputs.
1952 Raises
1953 ------
1954 RuntimeError
1955 If no file locations are passed in and ``self.trustGetRequest`` is
1956 `False`.
1957 FileNotFoundError
1958 If a passed-in URI does not exist, and ``self.trustGetRequest``
1959 is `False`.
1960 RuntimeError
1961 If a passed-in `StoredFileInfo`'s ``component`` is `None` (this is
1962 unexpected).
1963 """
1964 guessing = False
1965 uris = DatasetRefURIs()
1967 if not file_locations:
1968 if not self.trustGetRequest:
1969 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1970 file_locations = self._get_expected_dataset_locations_info(ref)
1971 guessing = True
1973 if len(file_locations) == 1:
1974 # No disassembly so this is the primary URI
1975 uris.primaryURI = file_locations[0][0].uri
1976 if guessing and not uris.primaryURI.exists():
1977 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1978 else:
1979 for location, file_info in file_locations:
1980 if file_info.component is None:
1981 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1982 if guessing and not location.uri.exists():
1983 # If we are trusting then it is entirely possible for
1984 # some components to be missing. In that case we skip
1985 # to the next component.
1986 if self.trustGetRequest:
1987 continue
1988 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1989 uris.componentURIs[file_info.component] = location.uri
1991 return uris
1993 def retrieveArtifacts(
1994 self,
1995 refs: Iterable[DatasetRef],
1996 destination: ResourcePath,
1997 transfer: str = "auto",
1998 preserve_path: bool = True,
1999 overwrite: bool = False,
2000 ) -> list[ResourcePath]:
2001 """Retrieve the file artifacts associated with the supplied refs.
2003 Parameters
2004 ----------
2005 refs : iterable of `DatasetRef`
2006 The datasets for which file artifacts are to be retrieved.
2007 A single ref can result in multiple files. The refs must
2008 be resolved.
2009 destination : `lsst.resources.ResourcePath`
2010 Location to write the file artifacts.
2011 transfer : `str`, optional
2012 Method to use to transfer the artifacts. Must be one of the options
2013 supported by `lsst.resources.ResourcePath.transfer_from()`.
2014 "move" is not allowed.
2015 preserve_path : `bool`, optional
2016 If `True` the full path of the file artifact within the datastore
2017 is preserved. If `False` the final file component of the path
2018 is used.
2019 overwrite : `bool`, optional
2020 If `True` allow transfers to overwrite existing files at the
2021 destination.
2023 Returns
2024 -------
2025 targets : `list` of `lsst.resources.ResourcePath`
2026 URIs of file artifacts in destination location. Order is not
2027 preserved.
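Examples
--------
Illustrative sketch only; ``datastore`` and ``refs`` are placeholders
and the destination path is hypothetical.
>>> from lsst.resources import ResourcePath
>>> dest = ResourcePath("/tmp/artifacts/", forceDirectory=True)
>>> copied = datastore.retrieveArtifacts(refs, dest, transfer="copy")  # doctest: +SKIP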
2028 """
2029 if not destination.isdir():
2030 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
2032 if transfer == "move":
2033 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
2035 # Source -> Destination
2036 # This also helps filter out duplicate DatasetRef in the request
2037 # that will map to the same underlying file transfer.
2038 to_transfer: dict[ResourcePath, ResourcePath] = {}
2040 for ref in refs:
2041 locations = self._get_dataset_locations_info(ref)
2042 for location, _ in locations:
2043 source_uri = location.uri
2044 target_path: ResourcePathExpression
2045 if preserve_path:
2046 target_path = location.pathInStore
2047 if target_path.isabs():
2048 # This is an absolute path to an external file.
2049 # Use the full path.
2050 target_path = target_path.relativeToPathRoot
2051 else:
2052 target_path = source_uri.basename()
2053 target_uri = destination.join(target_path)
2054 to_transfer[source_uri] = target_uri
2056 # In theory can now parallelize the transfer
2057 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
2058 for source_uri, target_uri in to_transfer.items():
2059 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
2061 return list(to_transfer.values())
2063 def get(
2064 self,
2065 ref: DatasetRef,
2066 parameters: Mapping[str, Any] | None = None,
2067 storageClass: StorageClass | str | None = None,
2068 ) -> Any:
2069 """Load an InMemoryDataset from the store.
2071 Parameters
2072 ----------
2073 ref : `DatasetRef`
2074 Reference to the required Dataset.
2075 parameters : `dict`, optional
2076 `StorageClass`-specific parameters that specify, for example,
2077 a slice of the dataset to be loaded.
2078 storageClass : `StorageClass` or `str`, optional
2079 The storage class to be used to override the Python type
2080 returned by this method. By default the returned type matches
2081 the dataset type definition for this dataset. Specifying a
2082 read `StorageClass` can force a different type to be returned.
2083 This type must be compatible with the original type.
2085 Returns
2086 -------
2087 inMemoryDataset : `object`
2088 Requested dataset or slice thereof as an InMemoryDataset.
2090 Raises
2091 ------
2092 FileNotFoundError
2093 Requested dataset can not be retrieved.
2094 TypeError
2095 Return value from formatter has unexpected type.
2096 ValueError
2097 Formatter failed to process the dataset.
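Examples
--------
Illustrative sketch only; ``datastore`` and ``ref`` are placeholders and
"SomeCompatibleStorageClass" stands in for a storage class compatible
with the dataset type.
>>> dataset = datastore.get(ref)  # doctest: +SKIP
>>> converted = datastore.get(ref, storageClass="SomeCompatibleStorageClass")  # doctest: +SKIP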
2098 """
2099 # Supplied storage class for the component being read is either
2100 # from the ref itself or an override if we want to force
2101 # type conversion.
2102 if storageClass is not None:
2103 ref = ref.overrideStorageClass(storageClass)
2104 refStorageClass = ref.datasetType.storageClass
2106 allGetInfo = self._prepare_for_get(ref, parameters)
2107 refComponent = ref.datasetType.component()
2109 # Create mapping from component name to related info
2110 allComponents = {i.component: i for i in allGetInfo}
2112 # By definition the dataset is disassembled if we have more
2113 # than one record for it.
2114 isDisassembled = len(allGetInfo) > 1
2116 # Look for the special case where we are disassembled but the
2117 # component is a derived component that was not written during
2118 # disassembly. For this scenario we need to check that the
2119 # component requested is listed as a derived component for the
2120 # composite storage class
2121 isDisassembledReadOnlyComponent = False
2122 if isDisassembled and refComponent:
2123 # The composite storage class should be accessible through
2124 # the component dataset type
2125 compositeStorageClass = ref.datasetType.parentStorageClass
2127 # In the unlikely scenario where the composite storage
2128 # class is not known, we can only assume that this is a
2129 # normal component. If that assumption is wrong then the
2130 # branch below that reads a persisted component will fail
2131 # so there is no need to complain here.
2132 if compositeStorageClass is not None:
2133 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2135 if isDisassembled and not refComponent:
2136 # This was a disassembled dataset spread over multiple files
2137 # and we need to put them all back together again.
2138 # Read into memory and then assemble
2140 # Check that the supplied parameters are suitable for the type read
2141 refStorageClass.validateParameters(parameters)
2143 # We want to keep track of all the parameters that were not used
2144 # by formatters. We assume that if any of the component formatters
2145 # use a parameter that we do not need to apply it again in the
2146 # assembler.
2147 usedParams = set()
2149 components: dict[str, Any] = {}
2150 for getInfo in allGetInfo:
2151 # assemblerParams are parameters not understood by the
2152 # associated formatter.
2153 usedParams.update(set(getInfo.formatterParams))
2155 component = getInfo.component
2157 if component is None:
2158 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2160 # We do not want the formatter to think it's reading
2161 # a component though because it is really reading a
2162 # standalone dataset -- always tell reader it is not a
2163 # component.
2164 components[component] = self._read_artifact_into_memory(
2165 getInfo, ref.makeComponentRef(component), isComponent=False
2166 )
2168 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2170 # Any unused parameters will have to be passed to the assembler
2171 if parameters:
2172 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2173 else:
2174 unusedParams = {}
2176 # Process parameters
2177 return ref.datasetType.storageClass.delegate().handleParameters(
2178 inMemoryDataset, parameters=unusedParams
2179 )
2181 elif isDisassembledReadOnlyComponent:
2182 compositeStorageClass = ref.datasetType.parentStorageClass
2183 if compositeStorageClass is None:
2184 raise RuntimeError(
2185 f"Unable to retrieve derived component '{refComponent}' since"
2186 "no composite storage class is available."
2187 )
2189 if refComponent is None:
2190 # Mainly for mypy
2191 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2193 # Assume that every derived component can be calculated by
2194 # forwarding the request to a single read/write component.
2195 # Rather than guessing which rw component is the right one by
2196 # scanning each for a derived component of the same name,
2197 # we ask the storage class delegate directly which one is best to
2198 # use.
2199 compositeDelegate = compositeStorageClass.delegate()
2200 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2201 refComponent, set(allComponents)
2202 )
2204 # Select the relevant component
2205 rwInfo = allComponents[forwardedComponent]
2207 # For now assume that read parameters are validated against
2208 # the real component and not the requested component
2209 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2210 forwardedStorageClass.validateParameters(parameters)
2212 # The reference to use for the caching must refer to the forwarded
2213 # component and not the derived component.
2214 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2216 # Unfortunately the FileDescriptor inside the formatter will have
2217 # the wrong write storage class so we need to create a new one
2218 # given the immutability constraint.
2219 writeStorageClass = rwInfo.info.storageClass
2221 # We may need to put some thought into parameters for read
2222 # components but for now forward them on as is
2223 readFormatter = type(rwInfo.formatter)(
2224 FileDescriptor(
2225 rwInfo.location,
2226 readStorageClass=refStorageClass,
2227 storageClass=writeStorageClass,
2228 parameters=parameters,
2229 ),
2230 ref.dataId,
2231 )
2233 # The assembler can not receive any parameter requests for a
2234 # derived component at this time since the assembler will
2235 # see the storage class of the derived component and those
2236 # parameters will have to be handled by the formatter on the
2237 # forwarded storage class.
2238 assemblerParams: dict[str, Any] = {}
2240 # Need to create a new info that specifies the derived
2241 # component and associated storage class
2242 readInfo = DatastoreFileGetInformation(
2243 rwInfo.location,
2244 readFormatter,
2245 rwInfo.info,
2246 assemblerParams,
2247 {},
2248 refComponent,
2249 refStorageClass,
2250 )
2252 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2254 else:
2255 # Single file request or component from that composite file
2256 for lookup in (refComponent, None):
2257 if lookup in allComponents:
2258 getInfo = allComponents[lookup]
2259 break
2260 else:
2261 raise FileNotFoundError(
2262 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2263 )
2265 # Do not need the component itself if already disassembled
2266 if isDisassembled:
2267 isComponent = False
2268 else:
2269 isComponent = getInfo.component is not None
2271 # For a component read of a composite we want the cache to
2272 # be looking at the composite ref itself.
2273 cache_ref = ref.makeCompositeRef() if isComponent else ref
2275 # For a disassembled component we can validate parameters against
2276 # the component storage class directly
2277 if isDisassembled:
2278 refStorageClass.validateParameters(parameters)
2279 else:
2280 # For an assembled composite this could be a derived
2281 # component derived from a real component. The validity
2282 # of the parameters is not clear. For now validate against
2283 # the composite storage class
2284 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2286 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
2288 @transactional
2289 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2290 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2292 Parameters
2293 ----------
2294 inMemoryDataset : `object`
2295 The dataset to store.
2296 ref : `DatasetRef`
2297 Reference to the associated Dataset.
2299 Raises
2300 ------
2301 TypeError
2302 Supplied object and storage class are inconsistent.
2303 DatasetTypeNotSupportedError
2304 The associated `DatasetType` is not handled by this datastore.
2306 Notes
2307 -----
2308 If the datastore is configured to reject certain dataset types it
2309 is possible that the put will fail and raise a
2310 `DatasetTypeNotSupportedError`. The main use case for this is to
2311 allow `ChainedDatastore` to put to multiple datastores without
2312 requiring that every datastore accepts the dataset.
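Examples
--------
Illustrative sketch only; ``datastore``, ``in_memory_dataset`` and
``ref`` are placeholders.
>>> datastore.put(in_memory_dataset, ref)  # doctest: +SKIP
>>> datastore.exists(ref)  # doctest: +SKIP
True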
2313 """
2314 doDisassembly = self.composites.shouldBeDisassembled(ref)
2317 artifacts = []
2318 if doDisassembly:
2319 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2320 if components is None:
2321 raise RuntimeError(
2322 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2323 f"with storage class {ref.datasetType.storageClass.name} "
2324 "is configured to be disassembled, but cannot be."
2325 )
2326 for component, componentInfo in components.items():
2327 # Don't recurse because we want to take advantage of
2328 # bulk insert -- we need a new DatasetRef that refers to the
2329 # same dataset_id but has the component DatasetType.
2330 # The parent DatasetType does not describe its components,
2331 # so we construct the component ref ourselves.
2332 compRef = ref.makeComponentRef(component)
2333 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2334 artifacts.append((compRef, storedInfo))
2335 else:
2336 # Write the entire thing out
2337 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2338 artifacts.append((ref, storedInfo))
2340 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT)
2342 @transactional
2343 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2344 # At this point can safely remove these datasets from the cache
2345 # to avoid confusion later on. If they are not trashed later
2346 # the cache will simply be refilled.
2347 self.cacheManager.remove_from_cache(ref)
2349 # If we are in trust mode there will be nothing to move to
2350 # the trash table and we will have to try to delete the file
2351 # immediately.
2352 if self.trustGetRequest:
2353 # Try to keep the logic below for a single file trash.
2354 if isinstance(ref, DatasetRef):
2355 refs = {ref}
2356 else:
2357 # Will recreate ref at the end of this branch.
2358 refs = set(ref)
2360 # Determine which datasets are known to datastore directly.
2361 id_to_ref = {ref.id: ref for ref in refs}
2362 existing_ids = self._get_stored_records_associated_with_refs(refs)
2363 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2365 missing = refs - existing_refs
2366 if missing:
2367 # Do an explicit existence check on these refs.
2368 # We only care about the artifacts at this point and not
2369 # the dataset existence.
2370 artifact_existence: dict[ResourcePath, bool] = {}
2371 _ = self.mexists(missing, artifact_existence)
2372 uris = [uri for uri, exists in artifact_existence.items() if exists]
2374 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2375 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2376 for uri in uris:
2377 try:
2378 uri.remove()
2379 except Exception as e:
2380 if ignore_errors:
2381 log.debug("Artifact %s could not be removed: %s", uri, e)
2382 continue
2383 raise
2385 # There is no point asking the code below to remove refs we
2386 # know are missing so update it with the list of existing
2387 # records. Try to retain one vs many logic.
2388 if not existing_refs:
2389 # Nothing more to do since none of the datasets were
2390 # known to the datastore record table.
2391 return
2392 ref = list(existing_refs)
2393 if len(ref) == 1:
2394 ref = ref[0]
2396 # Get file metadata and internal metadata
2397 if not isinstance(ref, DatasetRef):
2398 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2399 # Assumed to be an iterable of refs so bulk mode enabled.
2400 try:
2401 self.bridge.moveToTrash(ref, transaction=self._transaction)
2402 except Exception as e:
2403 if ignore_errors:
2404 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2405 else:
2406 raise
2407 return
2409 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2411 fileLocations = self._get_dataset_locations_info(ref)
2413 if not fileLocations:
2414 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2415 if ignore_errors:
2416 log.warning(err_msg)
2417 return
2418 else:
2419 raise FileNotFoundError(err_msg)
2421 for location, _ in fileLocations:
2422 if not self._artifact_exists(location):
2423 err_msg = (
2424 f"Dataset is known to datastore {self.name} but "
2425 f"associated artifact ({location.uri}) is missing"
2426 )
2427 if ignore_errors:
2428 log.warning(err_msg)
2429 return
2430 else:
2431 raise FileNotFoundError(err_msg)
2433 # Mark dataset as trashed
2434 try:
2435 self.bridge.moveToTrash([ref], transaction=self._transaction)
2436 except Exception as e:
2437 if ignore_errors:
2438 log.warning(
2439 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2440 "but encountered an error: %s",
2441 ref,
2442 self.name,
2443 e,
2444 )
2445 pass
2446 else:
2447 raise
2449 @transactional
2450 def emptyTrash(self, ignore_errors: bool = True) -> None:
2451 """Remove all datasets from the trash.
2453 Parameters
2454 ----------
2455 ignore_errors : `bool`
2456 If `True` return without error even if something went wrong.
2457 Problems could occur if another process is simultaneously trying
2458 to delete.
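Examples
--------
Illustrative sketch only; ``datastore`` and ``refs`` are placeholders.
Datasets are first moved to the trash and only deleted when the trash
is emptied.
>>> datastore.trash(refs)  # doctest: +SKIP
>>> datastore.emptyTrash()  # doctest: +SKIP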
2459 """
2460 log.debug("Emptying trash in datastore %s", self.name)
2462 # Context manager will empty trash iff we finish it without raising.
2463 # It will also automatically delete the relevant rows from the
2464 # trash table and the records table.
2465 with self.bridge.emptyTrash(
2466 self._table, record_class=StoredFileInfo, record_column="path"
2467 ) as trash_data:
2468 # Removing the artifacts themselves requires that the files are
2469 # not also associated with refs that are not to be trashed.
2470 # Therefore need to do a query with the file paths themselves
2471 # and return all the refs associated with them. Can only delete
2472 # a file if the refs to be trashed are the only refs associated
2473 # with the file.
2474 # This requires multiple copies of the trashed items
2475 trashed, artifacts_to_keep = trash_data
2477 if artifacts_to_keep is None:
2478 # The bridge is not helping us so have to work it out
2479 # ourselves. This is not going to be as efficient.
2480 trashed = list(trashed)
2482 # The instance check is for mypy since up to this point it
2483 # does not know the type of info.
2484 path_map = self._refs_associated_with_artifacts(
2485 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2486 )
2488 for ref, info in trashed:
2489 # Mypy needs to know this is not the base class
2490 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2492 path_map[info.path].remove(ref.id)
2493 if not path_map[info.path]:
2494 del path_map[info.path]
2496 artifacts_to_keep = set(path_map)
2498 for ref, info in trashed:
2499 # Should not happen for this implementation but need
2500 # to keep mypy happy.
2501 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2503 # Mypy needs to know this is not the base class
2504 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2506 if info.path in artifacts_to_keep:
2507 # This is a multi-dataset artifact and we are not
2508 # removing all associated refs.
2509 continue
2511 # Only trashed refs still known to datastore will be returned.
2512 location = info.file_location(self.locationFactory)
2514 # Point of no return for this artifact
2515 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2516 try:
2517 self._delete_artifact(location)
2518 except FileNotFoundError:
2519 # If the file itself has been deleted there is nothing
2520 # we can do about it. It is possible that trash has
2521 # been run in parallel in another process or someone
2522 # decided to delete the file. It is unlikely to come
2523 # back and so we should still continue with the removal
2524 # of the entry from the trash table. It is also possible
2525 # we removed it in a previous iteration if it was
2526 # a multi-dataset artifact. The delete artifact method
2527 # will log a debug message in this scenario.
2528 # Distinguishing a file that was missing before the trash
2529 # started from one already removed earlier in this trash
2530 # run is not worth the potential memory cost of tracking it.
2532 pass
2533 except Exception as e:
2534 if ignore_errors:
2535 # Use a debug message here even though it's not
2536 # a good situation. In some cases this can be
2537 # caused by a race between user A and user B
2538 # and neither of them has permissions for the
2539 # other's files. Butler does not know about users
2540 # and trash has no idea what collections these
2541 # files were in (without guessing from a path).
2542 log.debug(
2543 "Encountered error removing artifact %s from datastore %s: %s",
2544 location.uri,
2545 self.name,
2546 e,
2547 )
2548 else:
2549 raise
2551 @transactional
2552 def transfer_from(
2553 self,
2554 source_datastore: Datastore,
2555 refs: Iterable[DatasetRef],
2556 transfer: str = "auto",
2557 artifact_existence: dict[ResourcePath, bool] | None = None,
2558 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2559 # Docstring inherited
2560 if type(self) is not type(source_datastore):
2561 raise TypeError(
2562 f"Datastore mismatch between this datastore ({type(self)}) and the "
2563 f"source datastore ({type(source_datastore)})."
2564 )
2566 # Be explicit for mypy
2567 if not isinstance(source_datastore, FileDatastore):
2568 raise TypeError(
2569 "Can only transfer to a FileDatastore from another FileDatastore, not"
2570 f" {type(source_datastore)}"
2571 )
2573 # Stop early if "direct" transfer mode is requested. That would
2574 # require that the URI inside the source datastore be stored
2575 # directly in the target datastore, which seems unlikely to be useful
2576 # since at any moment the source datastore could delete the file.
2577 if transfer in ("direct", "split"):
2578 raise ValueError(
2579 f"Can not transfer from a source datastore using {transfer} mode since"
2580 " those files are controlled by the other datastore."
2581 )
2583 # Empty existence lookup if none given.
2584 if artifact_existence is None:
2585 artifact_existence = {}
2587 # We will go through the list multiple times so must convert
2588 # generators to lists.
2589 refs = list(refs)
2591 # In order to handle disassembled composites the code works
2592 # at the records level since it can assume that internal APIs
2593 # can be used.
2594 # - If the record already exists in the destination this is assumed
2595 # to be okay.
2596 # - If there is no record but the source and destination URIs are
2597 # identical no transfer is done but the record is added.
2598 # - If the source record refers to an absolute URI currently assume
2599 # that that URI should remain absolute and will be visible to the
2600 # destination butler. May need to have a flag to indicate whether
2601 # the dataset should be transferred. This will only happen if
2602 # the detached Butler has had a local ingest.
2604 # What we really want is all the records in the source datastore
2605 # associated with these refs. Or derived ones if they don't exist
2606 # in the source.
2607 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2609 # The source dataset_ids are the keys in these records
2610 source_ids = set(source_records)
2611 log.debug("Number of datastore records found in source: %d", len(source_ids))
2613 requested_ids = {ref.id for ref in refs}
2614 missing_ids = requested_ids - source_ids
2616 # Missing IDs can be okay if that datastore has allowed
2617 # gets based on file existence. Should we transfer what we can
2618 # or complain about it and warn?
2619 if missing_ids and not source_datastore.trustGetRequest:
2620 raise ValueError(
2621 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2622 )
2624 # Need to map these missing IDs to a DatasetRef so we can guess
2625 # the details.
2626 if missing_ids:
2627 log.info(
2628 "Number of expected datasets missing from source datastore records: %d out of %d",
2629 len(missing_ids),
2630 len(requested_ids),
2631 )
2632 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2634 # This should be chunked in case we end up having to check
2635 # the file store since we need some log output to show
2636 # progress.
2637 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2638 records = {}
2639 for missing in missing_ids_chunk:
2640 # Ask the source datastore where the missing artifacts
2641 # should be. An execution butler might not know about the
2642 # artifacts even if they are there.
2643 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2644 records[missing] = [info for _, info in expected]
2646 # Call the mexists helper method in case we have not already
2647 # checked these artifacts and artifact_existence is still
2648 # empty. This allows us to benefit from parallelism.
2649 # datastore.mexists() itself does not give us access to the
2650 # derived datastore records.
2651 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2652 ref_exists = source_datastore._process_mexists_records(
2653 id_to_ref, records, False, artifact_existence=artifact_existence
2654 )
2656 # Now go through the records and propagate the ones that exist.
2657 location_factory = source_datastore.locationFactory
2658 for missing, record_list in records.items():
2659 # Skip completely if the ref does not exist.
2660 ref = id_to_ref[missing]
2661 if not ref_exists[ref]:
2662 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2663 continue
2664 # Check for file artifact to decide which parts of a
2665 # disassembled composite do exist. If there is only a
2666 # single record we don't even need to look because it can't
2667 # be a composite and must exist.
2668 if len(record_list) == 1:
2669 dataset_records = record_list
2670 else:
2671 dataset_records = [
2672 record
2673 for record in record_list
2674 if artifact_existence[record.file_location(location_factory).uri]
2675 ]
2676 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2678 # Rely on source_records being a defaultdict.
2679 source_records[missing].extend(dataset_records)
2681 # See if we already have these records
2682 target_records = self._get_stored_records_associated_with_refs(refs)
2684 # The artifacts to register
2685 artifacts = []
2687 # Refs that already exist
2688 already_present = []
2690 # Refs that were rejected by this datastore.
2691 rejected = set()
2693 # Refs that were transferred successfully.
2694 accepted = set()
2696 # Record each time we have done a "direct" transfer.
2697 direct_transfers = []
2699 # Now can transfer the artifacts
2700 for ref in refs:
2701 if not self.constraints.isAcceptable(ref):
2702 # This datastore should not be accepting this dataset.
2703 rejected.add(ref)
2704 continue
2706 accepted.add(ref)
2708 if ref.id in target_records:
2709 # Already have an artifact for this.
2710 already_present.append(ref)
2711 continue
2713 # mypy needs to know these are always resolved refs
2714 for info in source_records[ref.id]:
2715 source_location = info.file_location(source_datastore.locationFactory)
2716 target_location = info.file_location(self.locationFactory)
2717 if source_location == target_location and not source_location.pathInStore.isabs():
2718 # Artifact is already in the target location.
2719 # (which is how execution butler currently runs)
2720 pass
2721 else:
2722 if target_location.pathInStore.isabs():
2723 # Just because we can see the artifact when running
2724 # the transfer doesn't mean it will be generally
2725 # accessible to a user of this butler. Need to decide
2726 # what to do about an absolute path.
2727 if transfer == "auto":
2728 # For "auto" transfers we allow the absolute URI
2729 # to be recorded in the target datastore.
2730 direct_transfers.append(source_location)
2731 else:
2732 # The user is explicitly requesting a transfer
2733 # even for an absolute URI. This requires us to
2734 # calculate the target path.
2735 template_ref = ref
2736 if info.component:
2737 template_ref = ref.makeComponentRef(info.component)
2738 target_location = self._calculate_ingested_datastore_name(
2739 source_location.uri,
2740 template_ref,
2741 )
2743 info = info.update(path=target_location.pathInStore.path)
2745 # Need to transfer it to the new location.
2746 # Assume we should always overwrite. If the artifact
2747 # is there this might indicate that a previous transfer
2748 # was interrupted but was not able to be rolled back
2749 # completely (eg pre-emption) so follow Datastore default
2750 # and overwrite.
2751 target_location.uri.transfer_from(
2752 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2753 )
2755 artifacts.append((ref, info))
2757 if direct_transfers:
2758 log.info(
2759 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2760 len(direct_transfers),
2761 "" if len(direct_transfers) == 1 else "s",
2762 )
2764 # We are overwriting previous datasets that may have already
2765 # existed. We therefore should ensure that we force the
2766 # datastore records to agree. Note that this can potentially lead
2767 # to difficulties if the dataset has previously been ingested
2768 # disassembled and is somehow now assembled, or vice versa.
2769 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE)
2771 if already_present:
2772 n_skipped = len(already_present)
2773 log.info(
2774 "Skipped transfer of %d dataset%s already present in datastore",
2775 n_skipped,
2776 "" if n_skipped == 1 else "s",
2777 )
2779 return accepted, rejected
2781 @transactional
2782 def forget(self, refs: Iterable[DatasetRef]) -> None:
2783 # Docstring inherited.
2784 refs = list(refs)
2785 self.bridge.forget(refs)
2786 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2788 def validateConfiguration(
2789 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2790 ) -> None:
2791 """Validate some of the configuration for this datastore.
2793 Parameters
2794 ----------
2795 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2796 Entities to test against this configuration. Can be differing
2797 types.
2798 logFailures : `bool`, optional
2799 If `True`, output a log message for every validation error
2800 detected.
2802 Raises
2803 ------
2804 DatastoreValidationError
2805 Raised if there is a validation problem with a configuration.
2806 All the problems are reported in a single exception.
2808 Notes
2809 -----
2810 This method checks that all the supplied entities have valid file
2811 templates and also have formatters defined.
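Examples
--------
Illustrative sketch only; ``datastore`` and ``dataset_type`` are
placeholders. A `DatastoreValidationError` is raised if any entity
has no usable file template or formatter.
>>> datastore.validateConfiguration([dataset_type], logFailures=True)  # doctest: +SKIP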
2812 """
2813 templateFailed = None
2814 try:
2815 self.templates.validateTemplates(entities, logFailures=logFailures)
2816 except FileTemplateValidationError as e:
2817 templateFailed = str(e)
2819 formatterFailed = []
2820 for entity in entities:
2821 try:
2822 self.formatterFactory.getFormatterClass(entity)
2823 except KeyError as e:
2824 formatterFailed.append(str(e))
2825 if logFailures:
2826 log.critical("Formatter failure: %s", e)
2828 if templateFailed or formatterFailed:
2829 messages = []
2830 if templateFailed:
2831 messages.append(templateFailed)
2832 if formatterFailed:
2833 messages.append(",".join(formatterFailed))
2834 msg = ";\n".join(messages)
2835 raise DatastoreValidationError(msg)
2837 def getLookupKeys(self) -> set[LookupKey]:
2838 # Docstring is inherited from base class
2839 return (
2840 self.templates.getLookupKeys()
2841 | self.formatterFactory.getLookupKeys()
2842 | self.constraints.getLookupKeys()
2843 )
2845 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2846 # Docstring is inherited from base class
2847 # The key can be valid in either formatters or templates so we can
2848 # only check the template if it exists
2849 if lookupKey in self.templates:
2850 try:
2851 self.templates[lookupKey].validateTemplate(entity)
2852 except FileTemplateValidationError as e:
2853 raise DatastoreValidationError(e) from e
2855 def export(
2856 self,
2857 refs: Iterable[DatasetRef],
2858 *,
2859 directory: ResourcePathExpression | None = None,
2860 transfer: str | None = "auto",
2861 ) -> Iterable[FileDataset]:
2862 # Docstring inherited from Datastore.export.
2863 if transfer == "auto" and directory is None:
2864 transfer = None
2866 if transfer is not None and directory is None:
2867 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2869 if transfer == "move":
2870 raise TypeError("Can not export by moving files out of datastore.")
2871 elif transfer == "direct":
2872 # For an export, treat this as equivalent to None. We do not
2873 # want an import to risk using absolute URIs to datasets owned
2874 # by another datastore.
2875 log.info("Treating 'direct' transfer mode as in-place export.")
2876 transfer = None
2878 # Force the directory to be a URI object
2879 directoryUri: ResourcePath | None = None
2880 if directory is not None:
2881 directoryUri = ResourcePath(directory, forceDirectory=True)
2883 if transfer is not None and directoryUri is not None and not directoryUri.exists():
2884 # mypy needs the second test
2885 raise FileNotFoundError(f"Export location {directory} does not exist")
2887 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2888 for ref in progress.wrap(refs, "Exporting dataset files"):
2889 fileLocations = self._get_dataset_locations_info(ref)
2890 if not fileLocations:
2891 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2892 # For now we can not export disassembled datasets
2893 if len(fileLocations) > 1:
2894 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2895 location, storedFileInfo = fileLocations[0]
2897 pathInStore = location.pathInStore.path
2898 if transfer is None:
2899 # TODO: do we also need to return the readStorageClass somehow?
2900 # We will use the path in store directly. If this is an
2901 # absolute URI, preserve it.
2902 if location.pathInStore.isabs():
2903 pathInStore = str(location.uri)
2904 elif transfer == "direct":
2905 # Use full URIs to the remote store in the export
2906 pathInStore = str(location.uri)
2907 else:
2908 # mypy needs help
2909 assert directoryUri is not None, "directoryUri must be defined to get here"
2910 storeUri = ResourcePath(location.uri)
2912 # if the datastore has an absolute URI to a resource, we
2913 # have two options:
2914 # 1. Keep the absolute URI in the exported YAML
2915 # 2. Allocate a new name in the local datastore and transfer
2916 # it.
2917 # For now go with option 2
2918 if location.pathInStore.isabs():
2919 template = self.templates.getTemplate(ref)
2920 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2921 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2923 exportUri = directoryUri.join(pathInStore)
2924 exportUri.transfer_from(storeUri, transfer=transfer)
2926 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2928 @staticmethod
2929 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2930 """Compute the checksum of the supplied file.
2932 Parameters
2933 ----------
2934 uri : `lsst.resources.ResourcePath`
2935 Name of resource to calculate checksum from.
2936 algorithm : `str`, optional
2937 Name of algorithm to use. Must be one of the algorithms supported
2938 by :py:mod:`hashlib`.
2939 block_size : `int`, optional
2940 Number of bytes to read from file at one time.
2942 Returns
2943 -------
2944 hexdigest : `str`
2945 Hex digest of the file.
2947 Notes
2948 -----
2949 Currently returns `None` if the URI is for a remote resource.
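Examples
--------
Illustrative sketch only; the file path is hypothetical.
>>> from lsst.resources import ResourcePath
>>> FileDatastore.computeChecksum(ResourcePath("/tmp/example.fits"))  # doctest: +SKIP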
2950 """
2951 if algorithm not in hashlib.algorithms_guaranteed:
2952 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2954 if not uri.isLocal:
2955 return None
2957 hasher = hashlib.new(algorithm)
2959 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f:
2960 for chunk in iter(lambda: f.read(block_size), b""):
2961 hasher.update(chunk)
2963 return hasher.hexdigest()
2965 def needs_expanded_data_ids(
2966 self,
2967 transfer: str | None,
2968 entity: DatasetRef | DatasetType | StorageClass | None = None,
2969 ) -> bool:
2970 # Docstring inherited.
2971 # This _could_ also use entity to inspect whether the filename template
2972 # involves placeholders other than the required dimensions for its
2973 # dataset type, but that's not necessary for correctness; it just
2974 # enables more optimizations (perhaps only in theory).
2975 return transfer not in ("direct", None)
2977 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2978 # Docstring inherited from the base class.
2979 record_data = data.get(self.name)
2980 if not record_data:
2981 return
2983 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records)
2985 # TODO: Verify that there are no unexpected table names in the dict?
2986 unpacked_records = []
2987 for dataset_data in record_data.records.values():
2988 records = dataset_data.get(self._table.name)
2989 if records:
2990 for info in records:
2991 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2992 unpacked_records.append(info.to_record())
2993 if unpacked_records:
2994 self._table.insert(*unpacked_records, transaction=self._transaction)
2996 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2997 # Docstring inherited from the base class.
2998 exported_refs = list(self._bridge.check(refs))
2999 ids = {ref.id for ref in exported_refs}
3000 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
3001 for row in self._table.fetch(dataset_id=ids):
3002 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
3003 dataset_records = records.setdefault(info.dataset_id, {})
3004 dataset_records.setdefault(self._table.name, []).append(info)
3006 record_data = DatastoreRecordData(records=records)
3007 return {self.name: record_data}
3009 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
3010 # Docstring inherited from the base class.
3011 self._retrieve_dataset_method = method
3013 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
3014 """Update dataset reference to use the storage class from registry."""
3015 if self._retrieve_dataset_method is None:
3016 # We could raise an exception here but unit tests do not define
3017 # this method.
3018 return ref
3019 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
3020 if dataset_type is not None:
3021 ref = ref.overrideStorageClass(dataset_type.storageClass)
3022 return ref