Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%
991 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Generic file-based datastore code."""
30from __future__ import annotations
32__all__ = ("FileDatastore",)
34import contextlib
35import hashlib
36import logging
37from collections import defaultdict
38from collections.abc import Callable, Iterable, Mapping, Sequence
39from dataclasses import dataclass
40from typing import TYPE_CHECKING, Any, ClassVar
42from lsst.daf.butler import (
43 CompositesMap,
44 Config,
45 DatasetId,
46 DatasetRef,
47 DatasetRefURIs,
48 DatasetType,
49 DatasetTypeNotSupportedError,
50 Datastore,
51 DatastoreCacheManager,
52 DatastoreConfig,
53 DatastoreDisabledCacheManager,
54 DatastoreRecordData,
55 DatastoreValidationError,
56 FileDataset,
57 FileDescriptor,
58 FileTemplates,
59 FileTemplateValidationError,
60 Formatter,
61 FormatterFactory,
62 Location,
63 LocationFactory,
64 Progress,
65 StorageClass,
66 StoredDatastoreItemInfo,
67 StoredFileInfo,
68 ddl,
69)
70from lsst.daf.butler.core.repoRelocation import replaceRoot
71from lsst.daf.butler.core.utils import transactional
72from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
73from lsst.resources import ResourcePath, ResourcePathExpression
74from lsst.utils.introspection import get_class_of, get_instance_of
75from lsst.utils.iteration import chunk_iterable
77# For VERBOSE logging usage.
78from lsst.utils.logging import VERBOSE, getLogger
79from lsst.utils.timer import time_this
80from sqlalchemy import BigInteger, String
82from ..registry.interfaces import DatabaseInsertMode, FakeDatasetRef
83from .genericDatastore import GenericBaseDatastore
85if TYPE_CHECKING:
86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
89log = getLogger(__name__)
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `~collections.abc.Iterable` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
101 def __init__(self, datasets: Iterable[FileDataset]):
102 super().__init__(ref for dataset in datasets for ref in dataset.refs)
103 self.datasets = datasets
106@dataclass(frozen=True)
107class DatastoreFileGetInformation:
108 """Collection of useful parameters needed to retrieve a file from
109 a Datastore.
110 """
112 location: Location
113 """The location from which to read the dataset."""
115 formatter: Formatter
116 """The `Formatter` to use to deserialize the dataset."""
118 info: StoredFileInfo
119 """Stored information about this file and its formatter."""
121 assemblerParams: Mapping[str, Any]
122 """Parameters to use for post-processing the retrieved dataset."""
124 formatterParams: Mapping[str, Any]
125 """Parameters that were understood by the associated formatter."""
127 component: str | None
128 """The component to be retrieved (can be `None`)."""
130 readStorageClass: StorageClass
131 """The `StorageClass` of the dataset being read."""
134class FileDatastore(GenericBaseDatastore):
135 """Generic Datastore for file-based implementations.
137 Should always be sub-classed since key abstract methods are missing.
139 Parameters
140 ----------
141 config : `DatastoreConfig` or `str`
142 Configuration as either a `Config` object or URI to file.
143 bridgeManager : `DatastoreRegistryBridgeManager`
144 Object that manages the interface between `Registry` and datastores.
145 butlerRoot : `str`, optional
146 New datastore root to use to override the configuration value.
148 Raises
149 ------
150 ValueError
151 If root location does not exist and ``create`` is `False` in the
152 configuration.
153 """
155 defaultConfigFile: ClassVar[str | None] = None
156 """Path to configuration defaults. Accessed within the ``config`` resource
157 or relative to a search path. Can be None if no defaults specified.
158 """
160 root: ResourcePath
161 """Root directory URI of this `Datastore`."""
163 locationFactory: LocationFactory
164 """Factory for creating locations relative to the datastore root."""
166 formatterFactory: FormatterFactory
167 """Factory for creating instances of formatters."""
169 templates: FileTemplates
170 """File templates that can be used by this `Datastore`."""
172 composites: CompositesMap
173 """Determines whether a dataset should be disassembled on put."""
175 defaultConfigFile = "datastores/fileDatastore.yaml"
176 """Path to configuration defaults. Accessed within the ``config`` resource
177 or relative to a search path. Can be None if no defaults specified.
178 """
180 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
181 """Callable that is used in trusted mode to retrieve registry definition
182 of a named dataset type.
183 """
185 @classmethod
186 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
187 """Set any filesystem-dependent config options for this Datastore to
188 be appropriate for a new empty repository with the given root.
190 Parameters
191 ----------
192 root : `str`
193 URI to the root of the data repository.
194 config : `Config`
195 A `Config` to update. Only the subset understood by
196 this component will be updated. Will not expand
197 defaults.
198 full : `Config`
199 A complete config with all defaults expanded that can be
200 converted to a `DatastoreConfig`. Read-only and will not be
201 modified by this method.
202 Repository-specific options that should not be obtained
203 from defaults when Butler instances are constructed
204 should be copied from ``full`` to ``config``.
205 overwrite : `bool`, optional
206 If `False`, do not modify a value in ``config`` if the value
207 already exists. Default is always to overwrite with the provided
208 ``root``.
210 Notes
211 -----
212 If a keyword is explicitly defined in the supplied ``config`` it
213 will not be overridden by this method if ``overwrite`` is `False`.
214 This allows explicit values set in external configs to be retained.
215 """
216 Config.updateParameters(
217 DatastoreConfig,
218 config,
219 full,
220 toUpdate={"root": root},
221 toCopy=("cls", ("records", "table")),
222 overwrite=overwrite,
223 )
225 @classmethod
226 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
227 return ddl.TableSpec(
228 fields=[
229 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
230 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
231 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
232 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
233 # Use empty string to indicate no component
234 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
235 # TODO: should checksum be Base64Bytes instead?
236 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
237 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
238 ],
239 unique=frozenset(),
240 indexes=[ddl.IndexSpec("path")],
241 )
243 def __init__(
244 self,
245 config: DatastoreConfig | ResourcePathExpression,
246 bridgeManager: DatastoreRegistryBridgeManager,
247 butlerRoot: str | None = None,
248 ):
249 super().__init__(config, bridgeManager)
250 if "root" not in self.config:
251 raise ValueError("No root directory specified in configuration")
253 self._bridgeManager = bridgeManager
255 # Name ourselves either using an explicit name or a name
256 # derived from the (unexpanded) root
257 if "name" in self.config:
258 self.name = self.config["name"]
259 else:
260 # We use the unexpanded root in the name to indicate that this
261 # datastore can be moved without having to update registry.
262 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
264 # Support repository relocation in config
265 # Existence of self.root is checked in subclass
266 self.root = ResourcePath(
267 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
268 )
270 self.locationFactory = LocationFactory(self.root)
271 self.formatterFactory = FormatterFactory()
273 # Now associate formatters with storage classes
274 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
276 # Read the file naming templates
277 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
279 # See if composites should be disassembled
280 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
282 tableName = self.config["records", "table"]
283 try:
284 # Storage of paths and formatters, keyed by dataset_id
285 self._table = bridgeManager.opaque.register(
286 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
287 )
288 # Interface to Registry.
289 self._bridge = bridgeManager.register(self.name)
290 except ReadOnlyDatabaseError:
291 # If the database is read only and we just tried and failed to
292 # create a table, it means someone is trying to create a read-only
293 # butler client for an empty repo. That should be okay, as long
294 # as they then try to get any datasets before some other client
295 # creates the table. Chances are they'rejust validating
296 # configuration.
297 pass
299 # Determine whether checksums should be used - default to False
300 self.useChecksum = self.config.get("checksum", False)
302 # Determine whether we can fall back to configuration if a
303 # requested dataset is not known to registry
304 self.trustGetRequest = self.config.get("trust_get_request", False)
306 # Create a cache manager
307 self.cacheManager: AbstractDatastoreCacheManager
308 if "cached" in self.config:
309 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
310 else:
311 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
313 # Check existence and create directory structure if necessary
314 if not self.root.exists():
315 if "create" not in self.config or not self.config["create"]:
316 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
317 try:
318 self.root.mkdir()
319 except Exception as e:
320 raise ValueError(
321 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
322 ) from e
324 def __str__(self) -> str:
325 return str(self.root)
327 @property
328 def bridge(self) -> DatastoreRegistryBridge:
329 return self._bridge
331 @property
332 def roots(self) -> dict[str, ResourcePath | None]:
333 # Docstring inherited.
334 return {self.name: self.root}
336 def _artifact_exists(self, location: Location) -> bool:
337 """Check that an artifact exists in this datastore at the specified
338 location.
340 Parameters
341 ----------
342 location : `Location`
343 Expected location of the artifact associated with this datastore.
345 Returns
346 -------
347 exists : `bool`
348 True if the location can be found, false otherwise.
349 """
350 log.debug("Checking if resource exists: %s", location.uri)
351 return location.uri.exists()
353 def _delete_artifact(self, location: Location) -> None:
354 """Delete the artifact from the datastore.
356 Parameters
357 ----------
358 location : `Location`
359 Location of the artifact associated with this datastore.
360 """
361 if location.pathInStore.isabs():
362 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
364 try:
365 location.uri.remove()
366 except FileNotFoundError:
367 log.debug("File %s did not exist and so could not be deleted.", location.uri)
368 raise
369 except Exception as e:
370 log.critical("Failed to delete file: %s (%s)", location.uri, e)
371 raise
372 log.debug("Successfully deleted file: %s", location.uri)
374 def addStoredItemInfo(
375 self,
376 refs: Iterable[DatasetRef],
377 infos: Iterable[StoredFileInfo],
378 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
379 ) -> None:
380 # Docstring inherited from GenericBaseDatastore
381 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos, strict=True)]
382 match insert_mode:
383 case DatabaseInsertMode.INSERT:
384 self._table.insert(*records, transaction=self._transaction)
385 case DatabaseInsertMode.ENSURE:
386 self._table.ensure(*records, transaction=self._transaction)
387 case DatabaseInsertMode.REPLACE:
388 self._table.replace(*records, transaction=self._transaction)
389 case _:
390 raise ValueError(f"Unknown insert mode of '{insert_mode}'")
392 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]:
393 # Docstring inherited from GenericBaseDatastore
395 # Look for the dataset_id -- there might be multiple matches
396 # if we have disassembled the dataset.
397 records = self._table.fetch(dataset_id=ref.id)
398 return [StoredFileInfo.from_record(record) for record in records]
400 def _get_stored_records_associated_with_refs(
401 self, refs: Iterable[DatasetIdRef]
402 ) -> dict[DatasetId, list[StoredFileInfo]]:
403 """Retrieve all records associated with the provided refs.
405 Parameters
406 ----------
407 refs : iterable of `DatasetIdRef`
408 The refs for which records are to be retrieved.
410 Returns
411 -------
412 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
413 The matching records indexed by the ref ID. The number of entries
414 in the dict can be smaller than the number of requested refs.
415 """
416 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
418 # Uniqueness is dataset_id + component so can have multiple records
419 # per ref.
420 records_by_ref = defaultdict(list)
421 for record in records:
422 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
423 return records_by_ref
425 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
426 """Return paths and associated dataset refs.
428 Parameters
429 ----------
430 paths : `list` of `str` or `lsst.resources.ResourcePath`
431 All the paths to include in search.
433 Returns
434 -------
435 mapping : `dict` of [`str`, `set` [`DatasetId`]]
436 Mapping of each path to a set of associated database IDs.
437 """
438 records = self._table.fetch(path=[str(path) for path in paths])
439 result = defaultdict(set)
440 for row in records:
441 result[row["path"]].add(row["dataset_id"])
442 return result
444 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
445 """Return all dataset refs associated with the supplied path.
447 Parameters
448 ----------
449 pathInStore : `lsst.resources.ResourcePath`
450 Path of interest in the data store.
452 Returns
453 -------
454 ids : `set` of `int`
455 All `DatasetRef` IDs associated with this path.
456 """
457 records = list(self._table.fetch(path=str(pathInStore)))
458 ids = {r["dataset_id"] for r in records}
459 return ids
461 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
462 # Docstring inherited from GenericBaseDatastore
463 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
465 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]:
466 r"""Find all the `Location`\ s of the requested dataset in the
467 `Datastore` and the associated stored file information.
469 Parameters
470 ----------
471 ref : `DatasetRef`
472 Reference to the required `Dataset`.
474 Returns
475 -------
476 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
477 Location of the dataset within the datastore and
478 stored information about each file and its formatter.
479 """
480 # Get the file information (this will fail if no file)
481 records = self.getStoredItemsInfo(ref)
483 # Use the path to determine the location -- we need to take
484 # into account absolute URIs in the datastore record
485 return [(r.file_location(self.locationFactory), r) for r in records]
487 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
488 """Check that there is only one dataset associated with the
489 specified artifact.
491 Parameters
492 ----------
493 ref : `DatasetRef` or `FakeDatasetRef`
494 Dataset to be removed.
495 location : `Location`
496 The location of the artifact to be removed.
498 Returns
499 -------
500 can_remove : `Bool`
501 True if the artifact can be safely removed.
502 """
503 # Can't ever delete absolute URIs.
504 if location.pathInStore.isabs():
505 return False
507 # Get all entries associated with this path
508 allRefs = self._registered_refs_per_artifact(location.pathInStore)
509 if not allRefs:
510 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
512 # Remove these refs from all the refs and if there is nothing left
513 # then we can delete
514 remainingRefs = allRefs - {ref.id}
516 if remainingRefs:
517 return False
518 return True
520 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
521 """Predict the location and related file information of the requested
522 dataset in this datastore.
524 Parameters
525 ----------
526 ref : `DatasetRef`
527 Reference to the required `Dataset`.
529 Returns
530 -------
531 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
532 Expected Location of the dataset within the datastore and
533 placeholder information about each file and its formatter.
535 Notes
536 -----
537 Uses the current configuration to determine how we would expect the
538 datastore files to have been written if we couldn't ask registry.
539 This is safe so long as there has been no change to datastore
540 configuration between writing the dataset and wanting to read it.
541 Will not work for files that have been ingested without using the
542 standard file template or default formatter.
543 """
544 # If we have a component ref we always need to ask the questions
545 # of the composite. If the composite is disassembled this routine
546 # should return all components. If the composite was not
547 # disassembled the composite is what is stored regardless of
548 # component request. Note that if the caller has disassembled
549 # a composite there is no way for this guess to know that
550 # without trying both the composite and component ref and seeing
551 # if there is something at the component Location even without
552 # disassembly being enabled.
553 if ref.datasetType.isComponent():
554 ref = ref.makeCompositeRef()
556 # See if the ref is a composite that should be disassembled
557 doDisassembly = self.composites.shouldBeDisassembled(ref)
559 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
561 if doDisassembly:
562 for component, componentStorage in ref.datasetType.storageClass.components.items():
563 compRef = ref.makeComponentRef(component)
564 location, formatter = self._determine_put_formatter_location(compRef)
565 all_info.append((location, formatter, componentStorage, component))
567 else:
568 # Always use the composite ref if no disassembly
569 location, formatter = self._determine_put_formatter_location(ref)
570 all_info.append((location, formatter, ref.datasetType.storageClass, None))
572 # Convert the list of tuples to have StoredFileInfo as second element
573 return [
574 (
575 location,
576 StoredFileInfo(
577 formatter=formatter,
578 path=location.pathInStore.path,
579 storageClass=storageClass,
580 component=component,
581 checksum=None,
582 file_size=-1,
583 dataset_id=ref.id,
584 ),
585 )
586 for location, formatter, storageClass, component in all_info
587 ]
589 def _prepare_for_get(
590 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
591 ) -> list[DatastoreFileGetInformation]:
592 """Check parameters for ``get`` and obtain formatter and
593 location.
595 Parameters
596 ----------
597 ref : `DatasetRef`
598 Reference to the required Dataset.
599 parameters : `dict`
600 `StorageClass`-specific parameters that specify, for example,
601 a slice of the dataset to be loaded.
603 Returns
604 -------
605 getInfo : `list` [`DatastoreFileGetInformation`]
606 Parameters needed to retrieve each file.
607 """
608 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
610 # The storage class we want to use eventually
611 refStorageClass = ref.datasetType.storageClass
613 # For trusted mode need to reset storage class.
614 ref = self._cast_storage_class(ref)
616 # Get file metadata and internal metadata
617 fileLocations = self._get_dataset_locations_info(ref)
618 if not fileLocations:
619 if not self.trustGetRequest:
620 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
621 # Assume the dataset is where we think it should be
622 fileLocations = self._get_expected_dataset_locations_info(ref)
624 if len(fileLocations) > 1:
625 disassembled = True
627 # If trust is involved it is possible that there will be
628 # components listed here that do not exist in the datastore.
629 # Explicitly check for file artifact existence and filter out any
630 # that are missing.
631 if self.trustGetRequest:
632 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
634 # For now complain only if we have no components at all. One
635 # component is probably a problem but we can punt that to the
636 # assembler.
637 if not fileLocations:
638 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
640 else:
641 disassembled = False
643 # Is this a component request?
644 refComponent = ref.datasetType.component()
646 fileGetInfo = []
647 for location, storedFileInfo in fileLocations:
648 # The storage class used to write the file
649 writeStorageClass = storedFileInfo.storageClass
651 # If this has been disassembled we need read to match the write
652 if disassembled:
653 readStorageClass = writeStorageClass
654 else:
655 readStorageClass = refStorageClass
657 formatter = get_instance_of(
658 storedFileInfo.formatter,
659 FileDescriptor(
660 location,
661 readStorageClass=readStorageClass,
662 storageClass=writeStorageClass,
663 parameters=parameters,
664 ),
665 ref.dataId,
666 )
668 formatterParams, notFormatterParams = formatter.segregateParameters()
670 # Of the remaining parameters, extract the ones supported by
671 # this StorageClass (for components not all will be handled)
672 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
674 # The ref itself could be a component if the dataset was
675 # disassembled by butler, or we disassembled in datastore and
676 # components came from the datastore records
677 component = storedFileInfo.component if storedFileInfo.component else refComponent
679 fileGetInfo.append(
680 DatastoreFileGetInformation(
681 location,
682 formatter,
683 storedFileInfo,
684 assemblerParams,
685 formatterParams,
686 component,
687 readStorageClass,
688 )
689 )
691 return fileGetInfo
693 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
694 """Check the arguments for ``put`` and obtain formatter and
695 location.
697 Parameters
698 ----------
699 inMemoryDataset : `object`
700 The dataset to store.
701 ref : `DatasetRef`
702 Reference to the associated Dataset.
704 Returns
705 -------
706 location : `Location`
707 The location to write the dataset.
708 formatter : `Formatter`
709 The `Formatter` to use to write the dataset.
711 Raises
712 ------
713 TypeError
714 Supplied object and storage class are inconsistent.
715 DatasetTypeNotSupportedError
716 The associated `DatasetType` is not handled by this datastore.
717 """
718 self._validate_put_parameters(inMemoryDataset, ref)
719 return self._determine_put_formatter_location(ref)
721 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
722 """Calculate the formatter and output location to use for put.
724 Parameters
725 ----------
726 ref : `DatasetRef`
727 Reference to the associated Dataset.
729 Returns
730 -------
731 location : `Location`
732 The location to write the dataset.
733 formatter : `Formatter`
734 The `Formatter` to use to write the dataset.
735 """
736 # Work out output file name
737 try:
738 template = self.templates.getTemplate(ref)
739 except KeyError as e:
740 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
742 # Validate the template to protect against filenames from different
743 # dataIds returning the same and causing overwrite confusion.
744 template.validateTemplate(ref)
746 location = self.locationFactory.fromPath(template.format(ref))
748 # Get the formatter based on the storage class
749 storageClass = ref.datasetType.storageClass
750 try:
751 formatter = self.formatterFactory.getFormatter(
752 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
753 )
754 except KeyError as e:
755 raise DatasetTypeNotSupportedError(
756 f"Unable to find formatter for {ref} in datastore {self.name}"
757 ) from e
759 # Now that we know the formatter, update the location
760 location = formatter.makeUpdatedLocation(location)
762 return location, formatter
764 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
765 # Docstring inherited from base class
766 if transfer != "auto":
767 return transfer
769 # See if the paths are within the datastore or not
770 inside = [self._pathInStore(d.path) is not None for d in datasets]
772 if all(inside):
773 transfer = None
774 elif not any(inside):
775 # Allow ResourcePath to use its own knowledge
776 transfer = "auto"
777 else:
778 # This can happen when importing from a datastore that
779 # has had some datasets ingested using "direct" mode.
780 # Also allow ResourcePath to sort it out but warn about it.
781 # This can happen if you are importing from a datastore
782 # that had some direct transfer datasets.
783 log.warning(
784 "Some datasets are inside the datastore and some are outside. Using 'split' "
785 "transfer mode. This assumes that the files outside the datastore are "
786 "still accessible to the new butler since they will not be copied into "
787 "the target datastore."
788 )
789 transfer = "split"
791 return transfer
793 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
794 """Return path relative to datastore root.
796 Parameters
797 ----------
798 path : `lsst.resources.ResourcePathExpression`
799 Path to dataset. Can be absolute URI. If relative assumed to
800 be relative to the datastore. Returns path in datastore
801 or raises an exception if the path it outside.
803 Returns
804 -------
805 inStore : `str`
806 Path relative to datastore root. Returns `None` if the file is
807 outside the root.
808 """
809 # Relative path will always be relative to datastore
810 pathUri = ResourcePath(path, forceAbsolute=False)
811 return pathUri.relative_to(self.root)
813 def _standardizeIngestPath(
814 self, path: str | ResourcePath, *, transfer: str | None = None
815 ) -> str | ResourcePath:
816 """Standardize the path of a to-be-ingested file.
818 Parameters
819 ----------
820 path : `str` or `lsst.resources.ResourcePath`
821 Path of a file to be ingested. This parameter is not expected
822 to be all the types that can be used to construct a
823 `~lsst.resources.ResourcePath`.
824 transfer : `str`, optional
825 How (and whether) the dataset should be added to the datastore.
826 See `ingest` for details of transfer modes.
827 This implementation is provided only so
828 `NotImplementedError` can be raised if the mode is not supported;
829 actual transfers are deferred to `_extractIngestInfo`.
831 Returns
832 -------
833 path : `str` or `lsst.resources.ResourcePath`
834 New path in what the datastore considers standard form. If an
835 absolute URI was given that will be returned unchanged.
837 Notes
838 -----
839 Subclasses of `FileDatastore` can implement this method instead
840 of `_prepIngest`. It should not modify the data repository or given
841 file in any way.
843 Raises
844 ------
845 NotImplementedError
846 Raised if the datastore does not support the given transfer mode
847 (including the case where ingest is not supported at all).
848 FileNotFoundError
849 Raised if one of the given files does not exist.
850 """
851 if transfer not in (None, "direct", "split") + self.root.transferModes:
852 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
854 # A relative URI indicates relative to datastore root
855 srcUri = ResourcePath(path, forceAbsolute=False)
856 if not srcUri.isabs():
857 srcUri = self.root.join(path)
859 if not srcUri.exists():
860 raise FileNotFoundError(
861 f"Resource at {srcUri} does not exist; note that paths to ingest "
862 f"are assumed to be relative to {self.root} unless they are absolute."
863 )
865 if transfer is None:
866 relpath = srcUri.relative_to(self.root)
867 if not relpath:
868 raise RuntimeError(
869 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
870 )
872 # Return the relative path within the datastore for internal
873 # transfer
874 path = relpath
876 return path
878 def _extractIngestInfo(
879 self,
880 path: ResourcePathExpression,
881 ref: DatasetRef,
882 *,
883 formatter: Formatter | type[Formatter],
884 transfer: str | None = None,
885 record_validation_info: bool = True,
886 ) -> StoredFileInfo:
887 """Relocate (if necessary) and extract `StoredFileInfo` from a
888 to-be-ingested file.
890 Parameters
891 ----------
892 path : `lsst.resources.ResourcePathExpression`
893 URI or path of a file to be ingested.
894 ref : `DatasetRef`
895 Reference for the dataset being ingested. Guaranteed to have
896 ``dataset_id not None`.
897 formatter : `type` or `Formatter`
898 `Formatter` subclass to use for this dataset or an instance.
899 transfer : `str`, optional
900 How (and whether) the dataset should be added to the datastore.
901 See `ingest` for details of transfer modes.
902 record_validation_info : `bool`, optional
903 If `True`, the default, the datastore can record validation
904 information associated with the file. If `False` the datastore
905 will not attempt to track any information such as checksums
906 or file sizes. This can be useful if such information is tracked
907 in an external system or if the file is to be compressed in place.
908 It is up to the datastore whether this parameter is relevant.
910 Returns
911 -------
912 info : `StoredFileInfo`
913 Internal datastore record for this file. This will be inserted by
914 the caller; the `_extractIngestInfo` is only responsible for
915 creating and populating the struct.
917 Raises
918 ------
919 FileNotFoundError
920 Raised if one of the given files does not exist.
921 FileExistsError
922 Raised if transfer is not `None` but the (internal) location the
923 file would be moved to is already occupied.
924 """
925 if self._transaction is None:
926 raise RuntimeError("Ingest called without transaction enabled")
928 # Create URI of the source path, do not need to force a relative
929 # path to absolute.
930 srcUri = ResourcePath(path, forceAbsolute=False)
932 # Track whether we have read the size of the source yet
933 have_sized = False
935 tgtLocation: Location | None
936 if transfer is None or transfer == "split":
937 # A relative path is assumed to be relative to the datastore
938 # in this context
939 if not srcUri.isabs():
940 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
941 else:
942 # Work out the path in the datastore from an absolute URI
943 # This is required to be within the datastore.
944 pathInStore = srcUri.relative_to(self.root)
945 if pathInStore is None and transfer is None:
946 raise RuntimeError(
947 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
948 )
949 if pathInStore:
950 tgtLocation = self.locationFactory.fromPath(pathInStore)
951 elif transfer == "split":
952 # Outside the datastore but treat that as a direct ingest
953 # instead.
954 tgtLocation = None
955 else:
956 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
957 elif transfer == "direct":
958 # Want to store the full URI to the resource directly in
959 # datastore. This is useful for referring to permanent archive
960 # storage for raw data.
961 # Trust that people know what they are doing.
962 tgtLocation = None
963 else:
964 # Work out the name we want this ingested file to have
965 # inside the datastore
966 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
967 if not tgtLocation.uri.dirname().exists():
968 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
969 tgtLocation.uri.dirname().mkdir()
971 # if we are transferring from a local file to a remote location
972 # it may be more efficient to get the size and checksum of the
973 # local file rather than the transferred one
974 if record_validation_info and srcUri.isLocal:
975 size = srcUri.size()
976 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
977 have_sized = True
979 # Transfer the resource to the destination.
980 # Allow overwrite of an existing file. This matches the behavior
981 # of datastore.put() in that it trusts that registry would not
982 # be asking to overwrite unless registry thought that the
983 # overwrite was allowed.
984 tgtLocation.uri.transfer_from(
985 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
986 )
988 if tgtLocation is None:
989 # This means we are using direct mode
990 targetUri = srcUri
991 targetPath = str(srcUri)
992 else:
993 targetUri = tgtLocation.uri
994 targetPath = tgtLocation.pathInStore.path
996 # the file should exist in the datastore now
997 if record_validation_info:
998 if not have_sized:
999 size = targetUri.size()
1000 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
1001 else:
1002 # Not recording any file information.
1003 size = -1
1004 checksum = None
1006 return StoredFileInfo(
1007 formatter=formatter,
1008 path=targetPath,
1009 storageClass=ref.datasetType.storageClass,
1010 component=ref.datasetType.component(),
1011 file_size=size,
1012 checksum=checksum,
1013 dataset_id=ref.id,
1014 )
1016 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
1017 # Docstring inherited from Datastore._prepIngest.
1018 filtered = []
1019 for dataset in datasets:
1020 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1021 if not acceptable:
1022 continue
1023 else:
1024 dataset.refs = acceptable
1025 if dataset.formatter is None:
1026 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1027 else:
1028 assert isinstance(dataset.formatter, type | str)
1029 formatter_class = get_class_of(dataset.formatter)
1030 if not issubclass(formatter_class, Formatter):
1031 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1032 dataset.formatter = formatter_class
1033 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1034 filtered.append(dataset)
1035 return _IngestPrepData(filtered)
1037 @transactional
1038 def _finishIngest(
1039 self,
1040 prepData: Datastore.IngestPrepData,
1041 *,
1042 transfer: str | None = None,
1043 record_validation_info: bool = True,
1044 ) -> None:
1045 # Docstring inherited from Datastore._finishIngest.
1046 refsAndInfos = []
1047 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1048 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1049 # Do ingest as if the first dataset ref is associated with the file
1050 info = self._extractIngestInfo(
1051 dataset.path,
1052 dataset.refs[0],
1053 formatter=dataset.formatter,
1054 transfer=transfer,
1055 record_validation_info=record_validation_info,
1056 )
1057 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1059 # In direct mode we can allow repeated ingests of the same thing
1060 # if we are sure that the external dataset is immutable. We use
1061 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are
1062 # separated.
1063 refs_and_infos_replace = []
1064 refs_and_infos_insert = []
1065 if transfer == "direct":
1066 for entry in refsAndInfos:
1067 if entry[0].id.version == 5:
1068 refs_and_infos_replace.append(entry)
1069 else:
1070 refs_and_infos_insert.append(entry)
1071 else:
1072 refs_and_infos_insert = refsAndInfos
1074 if refs_and_infos_insert:
1075 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT)
1076 if refs_and_infos_replace:
1077 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE)
1079 def _calculate_ingested_datastore_name(
1080 self,
1081 srcUri: ResourcePath,
1082 ref: DatasetRef,
1083 formatter: Formatter | type[Formatter] | None = None,
1084 ) -> Location:
1085 """Given a source URI and a DatasetRef, determine the name the
1086 dataset will have inside datastore.
1088 Parameters
1089 ----------
1090 srcUri : `lsst.resources.ResourcePath`
1091 URI to the source dataset file.
1092 ref : `DatasetRef`
1093 Ref associated with the newly-ingested dataset artifact. This
1094 is used to determine the name within the datastore.
1095 formatter : `Formatter` or Formatter class.
1096 Formatter to use for validation. Can be a class or an instance.
1097 No validation of the file extension is performed if the
1098 ``formatter`` is `None`. This can be used if the caller knows
1099 that the source URI and target URI will use the same formatter.
1101 Returns
1102 -------
1103 location : `Location`
1104 Target location for the newly-ingested dataset.
1105 """
1106 # Ingesting a file from outside the datastore.
1107 # This involves a new name.
1108 template = self.templates.getTemplate(ref)
1109 location = self.locationFactory.fromPath(template.format(ref))
1111 # Get the extension
1112 ext = srcUri.getExtension()
1114 # Update the destination to include that extension
1115 location.updateExtension(ext)
1117 # Ask the formatter to validate this extension
1118 if formatter is not None:
1119 formatter.validateExtension(location)
1121 return location
1123 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1124 """Write out in memory dataset to datastore.
1126 Parameters
1127 ----------
1128 inMemoryDataset : `object`
1129 Dataset to write to datastore.
1130 ref : `DatasetRef`
1131 Registry information associated with this dataset.
1133 Returns
1134 -------
1135 info : `StoredFileInfo`
1136 Information describing the artifact written to the datastore.
1137 """
1138 # May need to coerce the in memory dataset to the correct
1139 # python type, but first we need to make sure the storage class
1140 # reflects the one defined in the data repository.
1141 ref = self._cast_storage_class(ref)
1142 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1144 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1145 uri = location.uri
1147 if not uri.dirname().exists():
1148 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1149 uri.dirname().mkdir()
1151 if self._transaction is None:
1152 raise RuntimeError("Attempting to write artifact without transaction enabled")
1154 def _removeFileExists(uri: ResourcePath) -> None:
1155 """Remove a file and do not complain if it is not there.
1157 This is important since a formatter might fail before the file
1158 is written and we should not confuse people by writing spurious
1159 error messages to the log.
1160 """
1161 with contextlib.suppress(FileNotFoundError):
1162 uri.remove()
1164 # Register a callback to try to delete the uploaded data if
1165 # something fails below
1166 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1168 data_written = False
1170 # For remote URIs some datasets can be serialized directly
1171 # to bytes and sent to the remote datastore without writing a
1172 # file. If the dataset is intended to be saved to the cache
1173 # a file is always written and direct write to the remote
1174 # datastore is bypassed.
1175 if not uri.isLocal and not self.cacheManager.should_be_cached(ref):
1176 # Remote URI that is not cached so can write directly.
1177 try:
1178 serializedDataset = formatter.toBytes(inMemoryDataset)
1179 except NotImplementedError:
1180 # Fallback to the file writing option.
1181 pass
1182 except Exception as e:
1183 raise RuntimeError(
1184 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1185 ) from e
1186 else:
1187 log.debug("Writing bytes directly to %s", uri)
1188 uri.write(serializedDataset, overwrite=True)
1189 log.debug("Successfully wrote bytes directly to %s", uri)
1190 data_written = True
1192 if not data_written:
1193 # Did not write the bytes directly to object store so instead
1194 # write to temporary file. Always write to a temporary even if
1195 # using a local file system -- that gives us atomic writes.
1196 # If a process is killed as the file is being written we do not
1197 # want it to remain in the correct place but in corrupt state.
1198 # For local files write to the output directory not temporary dir.
1199 prefix = uri.dirname() if uri.isLocal else None
1200 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1201 # Need to configure the formatter to write to a different
1202 # location and that needs us to overwrite internals
1203 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1204 with formatter._updateLocation(Location(None, temporary_uri)):
1205 try:
1206 formatter.write(inMemoryDataset)
1207 except Exception as e:
1208 raise RuntimeError(
1209 f"Failed to serialize dataset {ref} of type"
1210 f" {type(inMemoryDataset)} to "
1211 f"temporary location {temporary_uri}"
1212 ) from e
1214 # Use move for a local file since that becomes an efficient
1215 # os.rename. For remote resources we use copy to allow the
1216 # file to be cached afterwards.
1217 transfer = "move" if uri.isLocal else "copy"
1219 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1221 if transfer == "copy":
1222 # Cache if required
1223 self.cacheManager.move_to_cache(temporary_uri, ref)
1225 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1227 # URI is needed to resolve what ingest case are we dealing with
1228 return self._extractIngestInfo(uri, ref, formatter=formatter)
1230 def _read_artifact_into_memory(
1231 self,
1232 getInfo: DatastoreFileGetInformation,
1233 ref: DatasetRef,
1234 isComponent: bool = False,
1235 cache_ref: DatasetRef | None = None,
1236 ) -> Any:
1237 """Read the artifact from datastore into in memory object.
1239 Parameters
1240 ----------
1241 getInfo : `DatastoreFileGetInformation`
1242 Information about the artifact within the datastore.
1243 ref : `DatasetRef`
1244 The registry information associated with this artifact.
1245 isComponent : `bool`
1246 Flag to indicate if a component is being read from this artifact.
1247 cache_ref : `DatasetRef`, optional
1248 The DatasetRef to use when looking up the file in the cache.
1249 This ref must have the same ID as the supplied ref but can
1250 be a parent ref or component ref to indicate to the cache whether
1251 a composite file is being requested from the cache or a component
1252 file. Without this the cache will default to the supplied ref but
1253 it can get confused with read-only derived components for
1254 disassembled composites.
1256 Returns
1257 -------
1258 inMemoryDataset : `object`
1259 The artifact as a python object.
1260 """
1261 location = getInfo.location
1262 uri = location.uri
1263 log.debug("Accessing data from %s", uri)
1265 if cache_ref is None:
1266 cache_ref = ref
1267 if cache_ref.id != ref.id:
1268 raise ValueError(
1269 "The supplied cache dataset ref refers to a different dataset than expected:"
1270 f" {ref.id} != {cache_ref.id}"
1271 )
1273 # Cannot recalculate checksum but can compare size as a quick check
1274 # Do not do this if the size is negative since that indicates
1275 # we do not know.
1276 recorded_size = getInfo.info.file_size
1277 resource_size = uri.size()
1278 if recorded_size >= 0 and resource_size != recorded_size:
1279 raise RuntimeError(
1280 "Integrity failure in Datastore. "
1281 f"Size of file {uri} ({resource_size}) "
1282 f"does not match size recorded in registry of {recorded_size}"
1283 )
1285 # For the general case we have choices for how to proceed.
1286 # 1. Always use a local file (downloading the remote resource to a
1287 # temporary file if needed).
1288 # 2. Use a threshold size and read into memory and use bytes.
1289 # Use both for now with an arbitrary hand off size.
1290 # This allows small datasets to be downloaded from remote object
1291 # stores without requiring a temporary file.
1293 formatter = getInfo.formatter
1294 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1295 if resource_size <= nbytes_max and formatter.can_read_bytes():
1296 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1297 if cached_file is not None:
1298 desired_uri = cached_file
1299 msg = f" (cached version of {uri})"
1300 else:
1301 desired_uri = uri
1302 msg = ""
1303 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1304 serializedDataset = desired_uri.read()
1305 log.debug(
1306 "Deserializing %s from %d bytes from location %s with formatter %s",
1307 f"component {getInfo.component}" if isComponent else "",
1308 len(serializedDataset),
1309 uri,
1310 formatter.name(),
1311 )
1312 try:
1313 result = formatter.fromBytes(
1314 serializedDataset, component=getInfo.component if isComponent else None
1315 )
1316 except Exception as e:
1317 raise ValueError(
1318 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1319 f" ({ref.datasetType.name} from {uri}): {e}"
1320 ) from e
1321 else:
1322 # Read from file.
1324 # Have to update the Location associated with the formatter
1325 # because formatter.read does not allow an override.
1326 # This could be improved.
1327 location_updated = False
1328 msg = ""
1330 # First check in cache for local version.
1331 # The cache will only be relevant for remote resources but
1332 # no harm in always asking. Context manager ensures that cache
1333 # file is not deleted during cache expiration.
1334 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1335 if cached_file is not None:
1336 msg = f"(via cache read of remote file {uri})"
1337 uri = cached_file
1338 location_updated = True
1340 with uri.as_local() as local_uri:
1341 can_be_cached = False
1342 if uri != local_uri:
1343 # URI was remote and file was downloaded
1344 cache_msg = ""
1345 location_updated = True
1347 if self.cacheManager.should_be_cached(cache_ref):
1348 # In this scenario we want to ask if the downloaded
1349 # file should be cached but we should not cache
1350 # it until after we've used it (to ensure it can't
1351 # be expired whilst we are using it).
1352 can_be_cached = True
1354 # Say that it is "likely" to be cached because
1355 # if the formatter read fails we will not be
1356 # caching this file.
1357 cache_msg = " and likely cached"
1359 msg = f"(via download to local file{cache_msg})"
1361 # Calculate the (possibly) new location for the formatter
1362 # to use.
1363 newLocation = Location(*local_uri.split()) if location_updated else None
1365 log.debug(
1366 "Reading%s from location %s %s with formatter %s",
1367 f" component {getInfo.component}" if isComponent else "",
1368 uri,
1369 msg,
1370 formatter.name(),
1371 )
1372 try:
1373 with (
1374 formatter._updateLocation(newLocation),
1375 time_this(
1376 log,
1377 msg="Reading%s from location %s %s with formatter %s",
1378 args=(
1379 f" component {getInfo.component}" if isComponent else "",
1380 uri,
1381 msg,
1382 formatter.name(),
1383 ),
1384 ),
1385 ):
1386 result = formatter.read(component=getInfo.component if isComponent else None)
1387 except Exception as e:
1388 raise ValueError(
1389 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1390 f" ({ref.datasetType.name} from {uri}): {e}"
1391 ) from e
1393 # File was read successfully so can move to cache
1394 if can_be_cached:
1395 self.cacheManager.move_to_cache(local_uri, cache_ref)
1397 return self._post_process_get(
1398 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
1399 )
1401 def knows(self, ref: DatasetRef) -> bool:
1402 """Check if the dataset is known to the datastore.
1404 Does not check for existence of any artifact.
1406 Parameters
1407 ----------
1408 ref : `DatasetRef`
1409 Reference to the required dataset.
1411 Returns
1412 -------
1413 exists : `bool`
1414 `True` if the dataset is known to the datastore.
1415 """
1416 fileLocations = self._get_dataset_locations_info(ref)
1417 if fileLocations:
1418 return True
1419 return False
1421 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1422 # Docstring inherited from the base class.
1424 # The records themselves. Could be missing some entries.
1425 records = self._get_stored_records_associated_with_refs(refs)
1427 return {ref: ref.id in records for ref in refs}
1429 def _process_mexists_records(
1430 self,
1431 id_to_ref: dict[DatasetId, DatasetRef],
1432 records: dict[DatasetId, list[StoredFileInfo]],
1433 all_required: bool,
1434 artifact_existence: dict[ResourcePath, bool] | None = None,
1435 ) -> dict[DatasetRef, bool]:
1436 """Check given records for existence.
1438 Helper function for `mexists()`.
1440 Parameters
1441 ----------
1442 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1443 Mapping of the dataset ID to the dataset ref itself.
1444 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1445 Records as generally returned by
1446 ``_get_stored_records_associated_with_refs``.
1447 all_required : `bool`
1448 Flag to indicate whether existence requires all artifacts
1449 associated with a dataset ID to exist or not for existence.
1450 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1451 Optional mapping of datastore artifact to existence. Updated by
1452 this method with details of all artifacts tested. Can be `None`
1453 if the caller is not interested.
1455 Returns
1456 -------
1457 existence : `dict` of [`DatasetRef`, `bool`]
1458 Mapping from dataset to boolean indicating existence.
1459 """
1460 # The URIs to be checked and a mapping of those URIs to
1461 # the dataset ID.
1462 uris_to_check: list[ResourcePath] = []
1463 location_map: dict[ResourcePath, DatasetId] = {}
1465 location_factory = self.locationFactory
1467 uri_existence: dict[ResourcePath, bool] = {}
1468 for ref_id, infos in records.items():
1469 # Key is the dataset Id, value is list of StoredItemInfo
1470 uris = [info.file_location(location_factory).uri for info in infos]
1471 location_map.update({uri: ref_id for uri in uris})
1473 # Check the local cache directly for a dataset corresponding
1474 # to the remote URI.
1475 if self.cacheManager.file_count > 0:
1476 ref = id_to_ref[ref_id]
1477 for uri, storedFileInfo in zip(uris, infos, strict=True):
1478 check_ref = ref
1479 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1480 check_ref = ref.makeComponentRef(component)
1481 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1482 # Proxy for URI existence.
1483 uri_existence[uri] = True
1484 else:
1485 uris_to_check.append(uri)
1486 else:
1487 # Check all of them.
1488 uris_to_check.extend(uris)
1490 if artifact_existence is not None:
1491 # If a URI has already been checked remove it from the list
1492 # and immediately add the status to the output dict.
1493 filtered_uris_to_check = []
1494 for uri in uris_to_check:
1495 if uri in artifact_existence:
1496 uri_existence[uri] = artifact_existence[uri]
1497 else:
1498 filtered_uris_to_check.append(uri)
1499 uris_to_check = filtered_uris_to_check
1501 # Results.
1502 dataset_existence: dict[DatasetRef, bool] = {}
1504 uri_existence.update(ResourcePath.mexists(uris_to_check))
1505 for uri, exists in uri_existence.items():
1506 dataset_id = location_map[uri]
1507 ref = id_to_ref[dataset_id]
1509 # Disassembled composite needs to check all locations.
1510 # all_required indicates whether all need to exist or not.
1511 if ref in dataset_existence:
1512 if all_required:
1513 exists = dataset_existence[ref] and exists
1514 else:
1515 exists = dataset_existence[ref] or exists
1516 dataset_existence[ref] = exists
1518 if artifact_existence is not None:
1519 artifact_existence.update(uri_existence)
1521 return dataset_existence
1523 def mexists(
1524 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1525 ) -> dict[DatasetRef, bool]:
1526 """Check the existence of multiple datasets at once.
1528 Parameters
1529 ----------
1530 refs : iterable of `DatasetRef`
1531 The datasets to be checked.
1532 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1533 Optional mapping of datastore artifact to existence. Updated by
1534 this method with details of all artifacts tested. Can be `None`
1535 if the caller is not interested.
1537 Returns
1538 -------
1539 existence : `dict` of [`DatasetRef`, `bool`]
1540 Mapping from dataset to boolean indicating existence.
1542 Notes
1543 -----
1544 To minimize potentially costly remote existence checks, the local
1545 cache is checked as a proxy for existence. If a file for this
1546 `DatasetRef` does exist no check is done for the actual URI. This
1547 could result in possibly unexpected behavior if the dataset itself
1548 has been removed from the datastore by another process whilst it is
1549 still in the cache.
1550 """
1551 chunk_size = 10_000
1552 dataset_existence: dict[DatasetRef, bool] = {}
1553 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1554 n_found_total = 0
1555 n_checked = 0
1556 n_chunks = 0
1557 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1558 chunk_result = self._mexists(chunk, artifact_existence)
1560 # The log message level and content depend on how many
1561 # datasets we are processing.
1562 n_results = len(chunk_result)
1564 # Use verbose logging to ensure that messages can be seen
1565 # easily if many refs are being checked.
1566 log_threshold = VERBOSE
1567 n_checked += n_results
1569 # This sum can take some time so only do it if we know the
1570 # result is going to be used.
1571 n_found = 0
1572 if log.isEnabledFor(log_threshold):
1573 # Can treat the booleans as 0, 1 integers and sum them.
1574 n_found = sum(chunk_result.values())
1575 n_found_total += n_found
1577 # We are deliberately not trying to count the number of refs
1578 # provided in case it's in the millions. This means there is a
1579 # situation where the number of refs exactly matches the chunk
1580 # size and we will switch to the multi-chunk path even though
1581 # we only have a single chunk.
1582 if n_results < chunk_size and n_chunks == 0:
1583 # Single chunk will be processed so we can provide more detail.
1584 if n_results == 1:
1585 ref = list(chunk_result)[0]
1586 # Use debug logging to be consistent with `exists()`.
1587 log.debug(
1588 "Calling mexists() with single ref that does%s exist (%s).",
1589 "" if chunk_result[ref] else " not",
1590 ref,
1591 )
1592 else:
1593 # Single chunk but multiple files. Summarize.
1594 log.log(
1595 log_threshold,
1596 "Number of datasets found in datastore: %d out of %d datasets checked.",
1597 n_found,
1598 n_checked,
1599 )
1601 else:
1602 # Use incremental verbose logging when we have multiple chunks.
1603 log.log(
1604 log_threshold,
1605 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1606 "(running total from all chunks so far: %d found out of %d checked)",
1607 n_chunks,
1608 n_found,
1609 n_results,
1610 n_found_total,
1611 n_checked,
1612 )
1613 dataset_existence.update(chunk_result)
1614 n_chunks += 1
1616 return dataset_existence
1618 def _mexists(
1619 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1620 ) -> dict[DatasetRef, bool]:
1621 """Check the existence of multiple datasets at once.
1623 Parameters
1624 ----------
1625 refs : iterable of `DatasetRef`
1626 The datasets to be checked.
1627 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1628 Optional mapping of datastore artifact to existence. Updated by
1629 this method with details of all artifacts tested. Can be `None`
1630 if the caller is not interested.
1632 Returns
1633 -------
1634 existence : `dict` of [`DatasetRef`, `bool`]
1635 Mapping from dataset to boolean indicating existence.
1636 """
1637 # Make a mapping from refs with the internal storage class to the given
1638 # refs that may have a different one. We'll use the internal refs
1639 # throughout this method and convert back at the very end.
1640 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1642 # Need a mapping of dataset_id to (internal) dataset ref since some
1643 # internal APIs work with dataset_id.
1644 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1646 # Set of all IDs we are checking for.
1647 requested_ids = set(id_to_ref.keys())
1649 # The records themselves. Could be missing some entries.
1650 records = self._get_stored_records_associated_with_refs(id_to_ref.values())
1652 dataset_existence = self._process_mexists_records(
1653 id_to_ref, records, True, artifact_existence=artifact_existence
1654 )
1656 # Set of IDs that have been handled.
1657 handled_ids = {ref.id for ref in dataset_existence}
1659 missing_ids = requested_ids - handled_ids
1660 if missing_ids:
1661 dataset_existence.update(
1662 self._mexists_check_expected(
1663 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1664 )
1665 )
1667 return {
1668 internal_ref_to_input_ref[internal_ref]: existence
1669 for internal_ref, existence in dataset_existence.items()
1670 }
1672 def _mexists_check_expected(
1673 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1674 ) -> dict[DatasetRef, bool]:
1675 """Check existence of refs that are not known to datastore.
1677 Parameters
1678 ----------
1679 refs : iterable of `DatasetRef`
1680 The datasets to be checked. These are assumed not to be known
1681 to datastore.
1682 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1683 Optional mapping of datastore artifact to existence. Updated by
1684 this method with details of all artifacts tested. Can be `None`
1685 if the caller is not interested.
1687 Returns
1688 -------
1689 existence : `dict` of [`DatasetRef`, `bool`]
1690 Mapping from dataset to boolean indicating existence.
1691 """
1692 dataset_existence: dict[DatasetRef, bool] = {}
1693 if not self.trustGetRequest:
1694 # Must assume these do not exist
1695 for ref in refs:
1696 dataset_existence[ref] = False
1697 else:
1698 log.debug(
1699 "%d datasets were not known to datastore during initial existence check.",
1700 len(refs),
1701 )
1703 # Construct data structure identical to that returned
1704 # by _get_stored_records_associated_with_refs() but using
1705 # guessed names.
1706 records = {}
1707 id_to_ref = {}
1708 for missing_ref in refs:
1709 expected = self._get_expected_dataset_locations_info(missing_ref)
1710 dataset_id = missing_ref.id
1711 records[dataset_id] = [info for _, info in expected]
1712 id_to_ref[dataset_id] = missing_ref
1714 dataset_existence.update(
1715 self._process_mexists_records(
1716 id_to_ref,
1717 records,
1718 False,
1719 artifact_existence=artifact_existence,
1720 )
1721 )
1723 return dataset_existence
1725 def exists(self, ref: DatasetRef) -> bool:
1726 """Check if the dataset exists in the datastore.
1728 Parameters
1729 ----------
1730 ref : `DatasetRef`
1731 Reference to the required dataset.
1733 Returns
1734 -------
1735 exists : `bool`
1736 `True` if the entity exists in the `Datastore`.
1738 Notes
1739 -----
1740 The local cache is checked as a proxy for existence in the remote
1741 object store. It is possible that another process on a different
1742 compute node could remove the file from the object store even
1743 though it is present in the local cache.
1744 """
1745 ref = self._cast_storage_class(ref)
1746 fileLocations = self._get_dataset_locations_info(ref)
1748 # if we are being asked to trust that registry might not be correct
1749 # we ask for the expected locations and check them explicitly
1750 if not fileLocations:
1751 if not self.trustGetRequest:
1752 return False
1754 # First check the cache. If it is not found we must check
1755 # the datastore itself. Assume that any component in the cache
1756 # means that the dataset does exist somewhere.
1757 if self.cacheManager.known_to_cache(ref):
1758 return True
1760 # When we are guessing a dataset location we can not check
1761 # for the existence of every component since we can not
1762 # know if every component was written. Instead we check
1763 # for the existence of any of the expected locations.
1764 for location, _ in self._get_expected_dataset_locations_info(ref):
1765 if self._artifact_exists(location):
1766 return True
1767 return False
1769 # All listed artifacts must exist.
1770 for location, storedFileInfo in fileLocations:
1771 # Checking in cache needs the component ref.
1772 check_ref = ref
1773 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1774 check_ref = ref.makeComponentRef(component)
1775 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1776 continue
1778 if not self._artifact_exists(location):
1779 return False
1781 return True
1783 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1784 """Return URIs associated with dataset.
1786 Parameters
1787 ----------
1788 ref : `DatasetRef`
1789 Reference to the required dataset.
1790 predict : `bool`, optional
1791 If the datastore does not know about the dataset, should it
1792 return a predicted URI or not?
1794 Returns
1795 -------
1796 uris : `DatasetRefURIs`
1797 The URI to the primary artifact associated with this dataset (if
1798 the dataset was disassembled within the datastore this may be
1799 `None`), and the URIs to any components associated with the dataset
1800 artifact. (can be empty if there are no components).
1801 """
1802 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1803 return many[ref]
1805 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1806 """URI to the Dataset.
1808 Parameters
1809 ----------
1810 ref : `DatasetRef`
1811 Reference to the required Dataset.
1812 predict : `bool`
1813 If `True`, allow URIs to be returned of datasets that have not
1814 been written.
1816 Returns
1817 -------
1818 uri : `str`
1819 URI pointing to the dataset within the datastore. If the
1820 dataset does not exist in the datastore, and if ``predict`` is
1821 `True`, the URI will be a prediction and will include a URI
1822 fragment "#predicted".
1823 If the datastore does not have entities that relate well
1824 to the concept of a URI the returned URI will be
1825 descriptive. The returned URI is not guaranteed to be obtainable.
1827 Raises
1828 ------
1829 FileNotFoundError
1830 Raised if a URI has been requested for a dataset that does not
1831 exist and guessing is not allowed.
1832 RuntimeError
1833 Raised if a request is made for a single URI but multiple URIs
1834 are associated with this dataset.
1836 Notes
1837 -----
1838 When a predicted URI is requested an attempt will be made to form
1839 a reasonable URI based on file templates and the expected formatter.
1840 """
1841 primary, components = self.getURIs(ref, predict)
1842 if primary is None or components:
1843 raise RuntimeError(
1844 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1845 )
1846 return primary
1848 def _predict_URIs(
1849 self,
1850 ref: DatasetRef,
1851 ) -> DatasetRefURIs:
1852 """Predict the URIs of a dataset ref.
1854 Parameters
1855 ----------
1856 ref : `DatasetRef`
1857 Reference to the required Dataset.
1859 Returns
1860 -------
1861 URI : DatasetRefUris
1862 Primary and component URIs. URIs will contain a URI fragment
1863 "#predicted".
1864 """
1865 uris = DatasetRefURIs()
1867 if self.composites.shouldBeDisassembled(ref):
1868 for component, _ in ref.datasetType.storageClass.components.items():
1869 comp_ref = ref.makeComponentRef(component)
1870 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1872 # Add the "#predicted" URI fragment to indicate this is a
1873 # guess
1874 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1876 else:
1877 location, _ = self._determine_put_formatter_location(ref)
1879 # Add the "#predicted" URI fragment to indicate this is a guess
1880 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1882 return uris
1884 def getManyURIs(
1885 self,
1886 refs: Iterable[DatasetRef],
1887 predict: bool = False,
1888 allow_missing: bool = False,
1889 ) -> dict[DatasetRef, DatasetRefURIs]:
1890 # Docstring inherited
1892 uris: dict[DatasetRef, DatasetRefURIs] = {}
1894 records = self._get_stored_records_associated_with_refs(refs)
1895 records_keys = records.keys()
1897 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1898 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1900 # Have to handle trustGetRequest mode by checking for the existence
1901 # of the missing refs on disk.
1902 if missing_refs:
1903 dataset_existence = self._mexists_check_expected(missing_refs, None)
1904 really_missing = set()
1905 not_missing = set()
1906 for ref, exists in dataset_existence.items():
1907 if exists:
1908 not_missing.add(ref)
1909 else:
1910 really_missing.add(ref)
1912 if not_missing:
1913 # Need to recalculate the missing/existing split.
1914 existing_refs = existing_refs + tuple(not_missing)
1915 missing_refs = tuple(really_missing)
1917 for ref in missing_refs:
1918 # if this has never been written then we have to guess
1919 if not predict:
1920 if not allow_missing:
1921 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1922 else:
1923 uris[ref] = self._predict_URIs(ref)
1925 for ref in existing_refs:
1926 file_infos = records[ref.id]
1927 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1928 uris[ref] = self._locations_to_URI(ref, file_locations)
1930 return uris
1932 def _locations_to_URI(
1933 self,
1934 ref: DatasetRef,
1935 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1936 ) -> DatasetRefURIs:
1937 """Convert one or more file locations associated with a DatasetRef
1938 to a DatasetRefURIs.
1940 Parameters
1941 ----------
1942 ref : `DatasetRef`
1943 Reference to the dataset.
1944 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
1945 Each item in the sequence is the location of the dataset within the
1946 datastore and stored information about the file and its formatter.
1947 If there is only one item in the sequence then it is treated as the
1948 primary URI. If there is more than one item then they are treated
1949 as component URIs. If there are no items then an error is raised
1950 unless ``self.trustGetRequest`` is `True`.
1952 Returns
1953 -------
1954 uris: DatasetRefURIs
1955 Represents the primary URI or component URIs described by the
1956 inputs.
1958 Raises
1959 ------
1960 RuntimeError
1961 If no file locations are passed in and ``self.trustGetRequest`` is
1962 `False`.
1963 FileNotFoundError
1964 If the a passed-in URI does not exist, and ``self.trustGetRequest``
1965 is `False`.
1966 RuntimeError
1967 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is
1968 unexpected).
1969 """
1970 guessing = False
1971 uris = DatasetRefURIs()
1973 if not file_locations:
1974 if not self.trustGetRequest:
1975 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1976 file_locations = self._get_expected_dataset_locations_info(ref)
1977 guessing = True
1979 if len(file_locations) == 1:
1980 # No disassembly so this is the primary URI
1981 uris.primaryURI = file_locations[0][0].uri
1982 if guessing and not uris.primaryURI.exists():
1983 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1984 else:
1985 for location, file_info in file_locations:
1986 if file_info.component is None:
1987 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1988 if guessing and not location.uri.exists():
1989 # If we are trusting then it is entirely possible for
1990 # some components to be missing. In that case we skip
1991 # to the next component.
1992 if self.trustGetRequest:
1993 continue
1994 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1995 uris.componentURIs[file_info.component] = location.uri
1997 return uris
1999 def retrieveArtifacts(
2000 self,
2001 refs: Iterable[DatasetRef],
2002 destination: ResourcePath,
2003 transfer: str = "auto",
2004 preserve_path: bool = True,
2005 overwrite: bool = False,
2006 ) -> list[ResourcePath]:
2007 """Retrieve the file artifacts associated with the supplied refs.
2009 Parameters
2010 ----------
2011 refs : iterable of `DatasetRef`
2012 The datasets for which file artifacts are to be retrieved.
2013 A single ref can result in multiple files. The refs must
2014 be resolved.
2015 destination : `lsst.resources.ResourcePath`
2016 Location to write the file artifacts.
2017 transfer : `str`, optional
2018 Method to use to transfer the artifacts. Must be one of the options
2019 supported by `lsst.resources.ResourcePath.transfer_from()`.
2020 "move" is not allowed.
2021 preserve_path : `bool`, optional
2022 If `True` the full path of the file artifact within the datastore
2023 is preserved. If `False` the final file component of the path
2024 is used.
2025 overwrite : `bool`, optional
2026 If `True` allow transfers to overwrite existing files at the
2027 destination.
2029 Returns
2030 -------
2031 targets : `list` of `lsst.resources.ResourcePath`
2032 URIs of file artifacts in destination location. Order is not
2033 preserved.
2034 """
2035 if not destination.isdir():
2036 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
2038 if transfer == "move":
2039 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
2041 # Source -> Destination
2042 # This also helps filter out duplicate DatasetRef in the request
2043 # that will map to the same underlying file transfer.
2044 to_transfer: dict[ResourcePath, ResourcePath] = {}
2046 for ref in refs:
2047 locations = self._get_dataset_locations_info(ref)
2048 for location, _ in locations:
2049 source_uri = location.uri
2050 target_path: ResourcePathExpression
2051 if preserve_path:
2052 target_path = location.pathInStore
2053 if target_path.isabs():
2054 # This is an absolute path to an external file.
2055 # Use the full path.
2056 target_path = target_path.relativeToPathRoot
2057 else:
2058 target_path = source_uri.basename()
2059 target_uri = destination.join(target_path)
2060 to_transfer[source_uri] = target_uri
2062 # In theory can now parallelize the transfer
2063 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
2064 for source_uri, target_uri in to_transfer.items():
2065 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
2067 return list(to_transfer.values())
2069 def get(
2070 self,
2071 ref: DatasetRef,
2072 parameters: Mapping[str, Any] | None = None,
2073 storageClass: StorageClass | str | None = None,
2074 ) -> Any:
2075 """Load an InMemoryDataset from the store.
2077 Parameters
2078 ----------
2079 ref : `DatasetRef`
2080 Reference to the required Dataset.
2081 parameters : `dict`
2082 `StorageClass`-specific parameters that specify, for example,
2083 a slice of the dataset to be loaded.
2084 storageClass : `StorageClass` or `str`, optional
2085 The storage class to be used to override the Python type
2086 returned by this method. By default the returned type matches
2087 the dataset type definition for this dataset. Specifying a
2088 read `StorageClass` can force a different type to be returned.
2089 This type must be compatible with the original type.
2091 Returns
2092 -------
2093 inMemoryDataset : `object`
2094 Requested dataset or slice thereof as an InMemoryDataset.
2096 Raises
2097 ------
2098 FileNotFoundError
2099 Requested dataset can not be retrieved.
2100 TypeError
2101 Return value from formatter has unexpected type.
2102 ValueError
2103 Formatter failed to process the dataset.
2104 """
2105 # Supplied storage class for the component being read is either
2106 # from the ref itself or some an override if we want to force
2107 # type conversion.
2108 if storageClass is not None:
2109 ref = ref.overrideStorageClass(storageClass)
2110 refStorageClass = ref.datasetType.storageClass
2112 allGetInfo = self._prepare_for_get(ref, parameters)
2113 refComponent = ref.datasetType.component()
2115 # Create mapping from component name to related info
2116 allComponents = {i.component: i for i in allGetInfo}
2118 # By definition the dataset is disassembled if we have more
2119 # than one record for it.
2120 isDisassembled = len(allGetInfo) > 1
2122 # Look for the special case where we are disassembled but the
2123 # component is a derived component that was not written during
2124 # disassembly. For this scenario we need to check that the
2125 # component requested is listed as a derived component for the
2126 # composite storage class
2127 isDisassembledReadOnlyComponent = False
2128 if isDisassembled and refComponent:
2129 # The composite storage class should be accessible through
2130 # the component dataset type
2131 compositeStorageClass = ref.datasetType.parentStorageClass
2133 # In the unlikely scenario where the composite storage
2134 # class is not known, we can only assume that this is a
2135 # normal component. If that assumption is wrong then the
2136 # branch below that reads a persisted component will fail
2137 # so there is no need to complain here.
2138 if compositeStorageClass is not None:
2139 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2141 if isDisassembled and not refComponent:
2142 # This was a disassembled dataset spread over multiple files
2143 # and we need to put them all back together again.
2144 # Read into memory and then assemble
2146 # Check that the supplied parameters are suitable for the type read
2147 refStorageClass.validateParameters(parameters)
2149 # We want to keep track of all the parameters that were not used
2150 # by formatters. We assume that if any of the component formatters
2151 # use a parameter that we do not need to apply it again in the
2152 # assembler.
2153 usedParams = set()
2155 components: dict[str, Any] = {}
2156 for getInfo in allGetInfo:
2157 # assemblerParams are parameters not understood by the
2158 # associated formatter.
2159 usedParams.update(set(getInfo.formatterParams))
2161 component = getInfo.component
2163 if component is None:
2164 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2166 # We do not want the formatter to think it's reading
2167 # a component though because it is really reading a
2168 # standalone dataset -- always tell reader it is not a
2169 # component.
2170 components[component] = self._read_artifact_into_memory(
2171 getInfo, ref.makeComponentRef(component), isComponent=False
2172 )
2174 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2176 # Any unused parameters will have to be passed to the assembler
2177 if parameters:
2178 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2179 else:
2180 unusedParams = {}
2182 # Process parameters
2183 return ref.datasetType.storageClass.delegate().handleParameters(
2184 inMemoryDataset, parameters=unusedParams
2185 )
2187 elif isDisassembledReadOnlyComponent:
2188 compositeStorageClass = ref.datasetType.parentStorageClass
2189 if compositeStorageClass is None:
2190 raise RuntimeError(
2191 f"Unable to retrieve derived component '{refComponent}' since"
2192 "no composite storage class is available."
2193 )
2195 if refComponent is None:
2196 # Mainly for mypy
2197 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2199 # Assume that every derived component can be calculated by
2200 # forwarding the request to a single read/write component.
2201 # Rather than guessing which rw component is the right one by
2202 # scanning each for a derived component of the same name,
2203 # we ask the storage class delegate directly which one is best to
2204 # use.
2205 compositeDelegate = compositeStorageClass.delegate()
2206 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2207 refComponent, set(allComponents)
2208 )
2210 # Select the relevant component
2211 rwInfo = allComponents[forwardedComponent]
2213 # For now assume that read parameters are validated against
2214 # the real component and not the requested component
2215 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2216 forwardedStorageClass.validateParameters(parameters)
2218 # The reference to use for the caching must refer to the forwarded
2219 # component and not the derived component.
2220 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2222 # Unfortunately the FileDescriptor inside the formatter will have
2223 # the wrong write storage class so we need to create a new one
2224 # given the immutability constraint.
2225 writeStorageClass = rwInfo.info.storageClass
2227 # We may need to put some thought into parameters for read
2228 # components but for now forward them on as is
2229 readFormatter = type(rwInfo.formatter)(
2230 FileDescriptor(
2231 rwInfo.location,
2232 readStorageClass=refStorageClass,
2233 storageClass=writeStorageClass,
2234 parameters=parameters,
2235 ),
2236 ref.dataId,
2237 )
2239 # The assembler can not receive any parameter requests for a
2240 # derived component at this time since the assembler will
2241 # see the storage class of the derived component and those
2242 # parameters will have to be handled by the formatter on the
2243 # forwarded storage class.
2244 assemblerParams: dict[str, Any] = {}
2246 # Need to created a new info that specifies the derived
2247 # component and associated storage class
2248 readInfo = DatastoreFileGetInformation(
2249 rwInfo.location,
2250 readFormatter,
2251 rwInfo.info,
2252 assemblerParams,
2253 {},
2254 refComponent,
2255 refStorageClass,
2256 )
2258 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2260 else:
2261 # Single file request or component from that composite file
2262 for lookup in (refComponent, None):
2263 if lookup in allComponents:
2264 getInfo = allComponents[lookup]
2265 break
2266 else:
2267 raise FileNotFoundError(
2268 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2269 )
2271 # Do not need the component itself if already disassembled
2272 if isDisassembled:
2273 isComponent = False
2274 else:
2275 isComponent = getInfo.component is not None
2277 # For a component read of a composite we want the cache to
2278 # be looking at the composite ref itself.
2279 cache_ref = ref.makeCompositeRef() if isComponent else ref
2281 # For a disassembled component we can validate parametersagainst
2282 # the component storage class directly
2283 if isDisassembled:
2284 refStorageClass.validateParameters(parameters)
2285 else:
2286 # For an assembled composite this could be a derived
2287 # component derived from a real component. The validity
2288 # of the parameters is not clear. For now validate against
2289 # the composite storage class
2290 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2292 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
2294 @transactional
2295 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2296 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2298 Parameters
2299 ----------
2300 inMemoryDataset : `object`
2301 The dataset to store.
2302 ref : `DatasetRef`
2303 Reference to the associated Dataset.
2305 Raises
2306 ------
2307 TypeError
2308 Supplied object and storage class are inconsistent.
2309 DatasetTypeNotSupportedError
2310 The associated `DatasetType` is not handled by this datastore.
2312 Notes
2313 -----
2314 If the datastore is configured to reject certain dataset types it
2315 is possible that the put will fail and raise a
2316 `DatasetTypeNotSupportedError`. The main use case for this is to
2317 allow `ChainedDatastore` to put to multiple datastores without
2318 requiring that every datastore accepts the dataset.
2319 """
2320 doDisassembly = self.composites.shouldBeDisassembled(ref)
2321 # doDisassembly = True
2323 artifacts = []
2324 if doDisassembly:
2325 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2326 if components is None:
2327 raise RuntimeError(
2328 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2329 f"with storage class {ref.datasetType.storageClass.name} "
2330 "is configured to be disassembled, but cannot be."
2331 )
2332 for component, componentInfo in components.items():
2333 # Don't recurse because we want to take advantage of
2334 # bulk insert -- need a new DatasetRef that refers to the
2335 # same dataset_id but has the component DatasetType
2336 # DatasetType does not refer to the types of components
2337 # So we construct one ourselves.
2338 compRef = ref.makeComponentRef(component)
2339 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2340 artifacts.append((compRef, storedInfo))
2341 else:
2342 # Write the entire thing out
2343 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2344 artifacts.append((ref, storedInfo))
2346 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT)
2348 @transactional
2349 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2350 # At this point can safely remove these datasets from the cache
2351 # to avoid confusion later on. If they are not trashed later
2352 # the cache will simply be refilled.
2353 self.cacheManager.remove_from_cache(ref)
2355 # If we are in trust mode there will be nothing to move to
2356 # the trash table and we will have to try to delete the file
2357 # immediately.
2358 if self.trustGetRequest:
2359 # Try to keep the logic below for a single file trash.
2360 if isinstance(ref, DatasetRef):
2361 refs = {ref}
2362 else:
2363 # Will recreate ref at the end of this branch.
2364 refs = set(ref)
2366 # Determine which datasets are known to datastore directly.
2367 id_to_ref = {ref.id: ref for ref in refs}
2368 existing_ids = self._get_stored_records_associated_with_refs(refs)
2369 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2371 missing = refs - existing_refs
2372 if missing:
2373 # Do an explicit existence check on these refs.
2374 # We only care about the artifacts at this point and not
2375 # the dataset existence.
2376 artifact_existence: dict[ResourcePath, bool] = {}
2377 _ = self.mexists(missing, artifact_existence)
2378 uris = [uri for uri, exists in artifact_existence.items() if exists]
2380 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2381 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2382 for uri in uris:
2383 try:
2384 uri.remove()
2385 except Exception as e:
2386 if ignore_errors:
2387 log.debug("Artifact %s could not be removed: %s", uri, e)
2388 continue
2389 raise
2391 # There is no point asking the code below to remove refs we
2392 # know are missing so update it with the list of existing
2393 # records. Try to retain one vs many logic.
2394 if not existing_refs:
2395 # Nothing more to do since none of the datasets were
2396 # known to the datastore record table.
2397 return
2398 ref = list(existing_refs)
2399 if len(ref) == 1:
2400 ref = ref[0]
2402 # Get file metadata and internal metadata
2403 if not isinstance(ref, DatasetRef):
2404 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2405 # Assumed to be an iterable of refs so bulk mode enabled.
2406 try:
2407 self.bridge.moveToTrash(ref, transaction=self._transaction)
2408 except Exception as e:
2409 if ignore_errors:
2410 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2411 else:
2412 raise
2413 return
2415 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2417 fileLocations = self._get_dataset_locations_info(ref)
2419 if not fileLocations:
2420 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2421 if ignore_errors:
2422 log.warning(err_msg)
2423 return
2424 else:
2425 raise FileNotFoundError(err_msg)
2427 for location, _ in fileLocations:
2428 if not self._artifact_exists(location):
2429 err_msg = (
2430 f"Dataset is known to datastore {self.name} but "
2431 f"associated artifact ({location.uri}) is missing"
2432 )
2433 if ignore_errors:
2434 log.warning(err_msg)
2435 return
2436 else:
2437 raise FileNotFoundError(err_msg)
2439 # Mark dataset as trashed
2440 try:
2441 self.bridge.moveToTrash([ref], transaction=self._transaction)
2442 except Exception as e:
2443 if ignore_errors:
2444 log.warning(
2445 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2446 "but encountered an error: %s",
2447 ref,
2448 self.name,
2449 e,
2450 )
2451 pass
2452 else:
2453 raise
2455 @transactional
2456 def emptyTrash(self, ignore_errors: bool = True) -> None:
2457 """Remove all datasets from the trash.
2459 Parameters
2460 ----------
2461 ignore_errors : `bool`
2462 If `True` return without error even if something went wrong.
2463 Problems could occur if another process is simultaneously trying
2464 to delete.
2465 """
2466 log.debug("Emptying trash in datastore %s", self.name)
2468 # Context manager will empty trash iff we finish it without raising.
2469 # It will also automatically delete the relevant rows from the
2470 # trash table and the records table.
2471 with self.bridge.emptyTrash(
2472 self._table, record_class=StoredFileInfo, record_column="path"
2473 ) as trash_data:
2474 # Removing the artifacts themselves requires that the files are
2475 # not also associated with refs that are not to be trashed.
2476 # Therefore need to do a query with the file paths themselves
2477 # and return all the refs associated with them. Can only delete
2478 # a file if the refs to be trashed are the only refs associated
2479 # with the file.
2480 # This requires multiple copies of the trashed items
2481 trashed, artifacts_to_keep = trash_data
2483 if artifacts_to_keep is None:
2484 # The bridge is not helping us so have to work it out
2485 # ourselves. This is not going to be as efficient.
2486 trashed = list(trashed)
2488 # The instance check is for mypy since up to this point it
2489 # does not know the type of info.
2490 path_map = self._refs_associated_with_artifacts(
2491 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2492 )
2494 for ref, info in trashed:
2495 # Mypy needs to know this is not the base class
2496 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2498 path_map[info.path].remove(ref.id)
2499 if not path_map[info.path]:
2500 del path_map[info.path]
2502 artifacts_to_keep = set(path_map)
2504 for ref, info in trashed:
2505 # Should not happen for this implementation but need
2506 # to keep mypy happy.
2507 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2509 # Mypy needs to know this is not the base class
2510 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2512 if info.path in artifacts_to_keep:
2513 # This is a multi-dataset artifact and we are not
2514 # removing all associated refs.
2515 continue
2517 # Only trashed refs still known to datastore will be returned.
2518 location = info.file_location(self.locationFactory)
2520 # Point of no return for this artifact
2521 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2522 try:
2523 self._delete_artifact(location)
2524 except FileNotFoundError:
2525 # If the file itself has been deleted there is nothing
2526 # we can do about it. It is possible that trash has
2527 # been run in parallel in another process or someone
2528 # decided to delete the file. It is unlikely to come
2529 # back and so we should still continue with the removal
2530 # of the entry from the trash table. It is also possible
2531 # we removed it in a previous iteration if it was
2532 # a multi-dataset artifact. The delete artifact method
2533 # will log a debug message in this scenario.
2534 # Distinguishing file missing before trash started and
2535 # file already removed previously as part of this trash
2536 # is not worth the distinction with regards to potential
2537 # memory cost.
2538 pass
2539 except Exception as e:
2540 if ignore_errors:
2541 # Use a debug message here even though it's not
2542 # a good situation. In some cases this can be
2543 # caused by a race between user A and user B
2544 # and neither of them has permissions for the
2545 # other's files. Butler does not know about users
2546 # and trash has no idea what collections these
2547 # files were in (without guessing from a path).
2548 log.debug(
2549 "Encountered error removing artifact %s from datastore %s: %s",
2550 location.uri,
2551 self.name,
2552 e,
2553 )
2554 else:
2555 raise
2557 @transactional
2558 def transfer_from(
2559 self,
2560 source_datastore: Datastore,
2561 refs: Iterable[DatasetRef],
2562 transfer: str = "auto",
2563 artifact_existence: dict[ResourcePath, bool] | None = None,
2564 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2565 # Docstring inherited
2566 if type(self) is not type(source_datastore):
2567 raise TypeError(
2568 f"Datastore mismatch between this datastore ({type(self)}) and the "
2569 f"source datastore ({type(source_datastore)})."
2570 )
2572 # Be explicit for mypy
2573 if not isinstance(source_datastore, FileDatastore):
2574 raise TypeError(
2575 "Can only transfer to a FileDatastore from another FileDatastore, not"
2576 f" {type(source_datastore)}"
2577 )
2579 # Stop early if "direct" transfer mode is requested. That would
2580 # require that the URI inside the source datastore should be stored
2581 # directly in the target datastore, which seems unlikely to be useful
2582 # since at any moment the source datastore could delete the file.
2583 if transfer in ("direct", "split"):
2584 raise ValueError(
2585 f"Can not transfer from a source datastore using {transfer} mode since"
2586 " those files are controlled by the other datastore."
2587 )
2589 # Empty existence lookup if none given.
2590 if artifact_existence is None:
2591 artifact_existence = {}
2593 # We will go through the list multiple times so must convert
2594 # generators to lists.
2595 refs = list(refs)
2597 # In order to handle disassembled composites the code works
2598 # at the records level since it can assume that internal APIs
2599 # can be used.
2600 # - If the record already exists in the destination this is assumed
2601 # to be okay.
2602 # - If there is no record but the source and destination URIs are
2603 # identical no transfer is done but the record is added.
2604 # - If the source record refers to an absolute URI currently assume
2605 # that that URI should remain absolute and will be visible to the
2606 # destination butler. May need to have a flag to indicate whether
2607 # the dataset should be transferred. This will only happen if
2608 # the detached Butler has had a local ingest.
2610 # What we really want is all the records in the source datastore
2611 # associated with these refs. Or derived ones if they don't exist
2612 # in the source.
2613 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2615 # The source dataset_ids are the keys in these records
2616 source_ids = set(source_records)
2617 log.debug("Number of datastore records found in source: %d", len(source_ids))
2619 requested_ids = {ref.id for ref in refs}
2620 missing_ids = requested_ids - source_ids
2622 # Missing IDs can be okay if that datastore has allowed
2623 # gets based on file existence. Should we transfer what we can
2624 # or complain about it and warn?
2625 if missing_ids and not source_datastore.trustGetRequest:
2626 raise ValueError(
2627 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2628 )
2630 # Need to map these missing IDs to a DatasetRef so we can guess
2631 # the details.
2632 if missing_ids:
2633 log.info(
2634 "Number of expected datasets missing from source datastore records: %d out of %d",
2635 len(missing_ids),
2636 len(requested_ids),
2637 )
2638 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2640 # This should be chunked in case we end up having to check
2641 # the file store since we need some log output to show
2642 # progress.
2643 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2644 records = {}
2645 for missing in missing_ids_chunk:
2646 # Ask the source datastore where the missing artifacts
2647 # should be. An execution butler might not know about the
2648 # artifacts even if they are there.
2649 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2650 records[missing] = [info for _, info in expected]
2652 # Call the mexist helper method in case we have not already
2653 # checked these artifacts such that artifact_existence is
2654 # empty. This allows us to benefit from parallelism.
2655 # datastore.mexists() itself does not give us access to the
2656 # derived datastore record.
2657 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2658 ref_exists = source_datastore._process_mexists_records(
2659 id_to_ref, records, False, artifact_existence=artifact_existence
2660 )
2662 # Now go through the records and propagate the ones that exist.
2663 location_factory = source_datastore.locationFactory
2664 for missing, record_list in records.items():
2665 # Skip completely if the ref does not exist.
2666 ref = id_to_ref[missing]
2667 if not ref_exists[ref]:
2668 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2669 continue
2670 # Check for file artifact to decide which parts of a
2671 # disassembled composite do exist. If there is only a
2672 # single record we don't even need to look because it can't
2673 # be a composite and must exist.
2674 if len(record_list) == 1:
2675 dataset_records = record_list
2676 else:
2677 dataset_records = [
2678 record
2679 for record in record_list
2680 if artifact_existence[record.file_location(location_factory).uri]
2681 ]
2682 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2684 # Rely on source_records being a defaultdict.
2685 source_records[missing].extend(dataset_records)
2687 # See if we already have these records
2688 target_records = self._get_stored_records_associated_with_refs(refs)
2690 # The artifacts to register
2691 artifacts = []
2693 # Refs that already exist
2694 already_present = []
2696 # Refs that were rejected by this datastore.
2697 rejected = set()
2699 # Refs that were transferred successfully.
2700 accepted = set()
2702 # Record each time we have done a "direct" transfer.
2703 direct_transfers = []
2705 # Now can transfer the artifacts
2706 for ref in refs:
2707 if not self.constraints.isAcceptable(ref):
2708 # This datastore should not be accepting this dataset.
2709 rejected.add(ref)
2710 continue
2712 accepted.add(ref)
2714 if ref.id in target_records:
2715 # Already have an artifact for this.
2716 already_present.append(ref)
2717 continue
2719 # mypy needs to know these are always resolved refs
2720 for info in source_records[ref.id]:
2721 source_location = info.file_location(source_datastore.locationFactory)
2722 target_location = info.file_location(self.locationFactory)
2723 if source_location == target_location and not source_location.pathInStore.isabs():
2724 # Artifact is already in the target location.
2725 # (which is how execution butler currently runs)
2726 pass
2727 else:
2728 if target_location.pathInStore.isabs():
2729 # Just because we can see the artifact when running
2730 # the transfer doesn't mean it will be generally
2731 # accessible to a user of this butler. Need to decide
2732 # what to do about an absolute path.
2733 if transfer == "auto":
2734 # For "auto" transfers we allow the absolute URI
2735 # to be recorded in the target datastore.
2736 direct_transfers.append(source_location)
2737 else:
2738 # The user is explicitly requesting a transfer
2739 # even for an absolute URI. This requires us to
2740 # calculate the target path.
2741 template_ref = ref
2742 if info.component:
2743 template_ref = ref.makeComponentRef(info.component)
2744 target_location = self._calculate_ingested_datastore_name(
2745 source_location.uri,
2746 template_ref,
2747 )
2749 info = info.update(path=target_location.pathInStore.path)
2751 # Need to transfer it to the new location.
2752 # Assume we should always overwrite. If the artifact
2753 # is there this might indicate that a previous transfer
2754 # was interrupted but was not able to be rolled back
2755 # completely (eg pre-emption) so follow Datastore default
2756 # and overwrite.
2757 target_location.uri.transfer_from(
2758 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2759 )
2761 artifacts.append((ref, info))
2763 if direct_transfers:
2764 log.info(
2765 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2766 len(direct_transfers),
2767 "" if len(direct_transfers) == 1 else "s",
2768 )
2770 # We are overwriting previous datasets that may have already
2771 # existed. We therefore should ensure that we force the
2772 # datastore records to agree. Note that this can potentially lead
2773 # to difficulties if the dataset has previously been ingested
2774 # disassembled and is somehow now assembled, or vice versa.
2775 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE)
2777 if already_present:
2778 n_skipped = len(already_present)
2779 log.info(
2780 "Skipped transfer of %d dataset%s already present in datastore",
2781 n_skipped,
2782 "" if n_skipped == 1 else "s",
2783 )
2785 return accepted, rejected
2787 @transactional
2788 def forget(self, refs: Iterable[DatasetRef]) -> None:
2789 # Docstring inherited.
2790 refs = list(refs)
2791 self.bridge.forget(refs)
2792 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2794 def validateConfiguration(
2795 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2796 ) -> None:
2797 """Validate some of the configuration for this datastore.
2799 Parameters
2800 ----------
2801 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2802 Entities to test against this configuration. Can be differing
2803 types.
2804 logFailures : `bool`, optional
2805 If `True`, output a log message for every validation error
2806 detected.
2808 Raises
2809 ------
2810 DatastoreValidationError
2811 Raised if there is a validation problem with a configuration.
2812 All the problems are reported in a single exception.
2814 Notes
2815 -----
2816 This method checks that all the supplied entities have valid file
2817 templates and also have formatters defined.
2818 """
2819 templateFailed = None
2820 try:
2821 self.templates.validateTemplates(entities, logFailures=logFailures)
2822 except FileTemplateValidationError as e:
2823 templateFailed = str(e)
2825 formatterFailed = []
2826 for entity in entities:
2827 try:
2828 self.formatterFactory.getFormatterClass(entity)
2829 except KeyError as e:
2830 formatterFailed.append(str(e))
2831 if logFailures:
2832 log.critical("Formatter failure: %s", e)
2834 if templateFailed or formatterFailed:
2835 messages = []
2836 if templateFailed:
2837 messages.append(templateFailed)
2838 if formatterFailed:
2839 messages.append(",".join(formatterFailed))
2840 msg = ";\n".join(messages)
2841 raise DatastoreValidationError(msg)
2843 def getLookupKeys(self) -> set[LookupKey]:
2844 # Docstring is inherited from base class
2845 return (
2846 self.templates.getLookupKeys()
2847 | self.formatterFactory.getLookupKeys()
2848 | self.constraints.getLookupKeys()
2849 )
2851 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2852 # Docstring is inherited from base class
2853 # The key can be valid in either formatters or templates so we can
2854 # only check the template if it exists
2855 if lookupKey in self.templates:
2856 try:
2857 self.templates[lookupKey].validateTemplate(entity)
2858 except FileTemplateValidationError as e:
2859 raise DatastoreValidationError(e) from e
2861 def export(
2862 self,
2863 refs: Iterable[DatasetRef],
2864 *,
2865 directory: ResourcePathExpression | None = None,
2866 transfer: str | None = "auto",
2867 ) -> Iterable[FileDataset]:
2868 # Docstring inherited from Datastore.export.
2869 if transfer == "auto" and directory is None:
2870 transfer = None
2872 if transfer is not None and directory is None:
2873 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2875 if transfer == "move":
2876 raise TypeError("Can not export by moving files out of datastore.")
2877 elif transfer == "direct":
2878 # For an export, treat this as equivalent to None. We do not
2879 # want an import to risk using absolute URIs to datasets owned
2880 # by another datastore.
2881 log.info("Treating 'direct' transfer mode as in-place export.")
2882 transfer = None
2884 # Force the directory to be a URI object
2885 directoryUri: ResourcePath | None = None
2886 if directory is not None:
2887 directoryUri = ResourcePath(directory, forceDirectory=True)
2889 if transfer is not None and directoryUri is not None and not directoryUri.exists():
2890 # mypy needs the second test
2891 raise FileNotFoundError(f"Export location {directory} does not exist")
2893 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2894 for ref in progress.wrap(refs, "Exporting dataset files"):
2895 fileLocations = self._get_dataset_locations_info(ref)
2896 if not fileLocations:
2897 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2898 # For now we can not export disassembled datasets
2899 if len(fileLocations) > 1:
2900 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2901 location, storedFileInfo = fileLocations[0]
2903 pathInStore = location.pathInStore.path
2904 if transfer is None:
2905 # TODO: do we also need to return the readStorageClass somehow?
2906 # We will use the path in store directly. If this is an
2907 # absolute URI, preserve it.
2908 if location.pathInStore.isabs():
2909 pathInStore = str(location.uri)
2910 elif transfer == "direct":
2911 # Use full URIs to the remote store in the export
2912 pathInStore = str(location.uri)
2913 else:
2914 # mypy needs help
2915 assert directoryUri is not None, "directoryUri must be defined to get here"
2916 storeUri = ResourcePath(location.uri)
2918 # if the datastore has an absolute URI to a resource, we
2919 # have two options:
2920 # 1. Keep the absolute URI in the exported YAML
2921 # 2. Allocate a new name in the local datastore and transfer
2922 # it.
2923 # For now go with option 2
2924 if location.pathInStore.isabs():
2925 template = self.templates.getTemplate(ref)
2926 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2927 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2929 exportUri = directoryUri.join(pathInStore)
2930 exportUri.transfer_from(storeUri, transfer=transfer)
2932 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2934 @staticmethod
2935 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2936 """Compute the checksum of the supplied file.
2938 Parameters
2939 ----------
2940 uri : `lsst.resources.ResourcePath`
2941 Name of resource to calculate checksum from.
2942 algorithm : `str`, optional
2943 Name of algorithm to use. Must be one of the algorithms supported
2944 by :py:class`hashlib`.
2945 block_size : `int`
2946 Number of bytes to read from file at one time.
2948 Returns
2949 -------
2950 hexdigest : `str`
2951 Hex digest of the file.
2953 Notes
2954 -----
2955 Currently returns None if the URI is for a remote resource.
2956 """
2957 if algorithm not in hashlib.algorithms_guaranteed:
2958 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2960 if not uri.isLocal:
2961 return None
2963 hasher = hashlib.new(algorithm)
2965 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f:
2966 for chunk in iter(lambda: f.read(block_size), b""):
2967 hasher.update(chunk)
2969 return hasher.hexdigest()
2971 def needs_expanded_data_ids(
2972 self,
2973 transfer: str | None,
2974 entity: DatasetRef | DatasetType | StorageClass | None = None,
2975 ) -> bool:
2976 # Docstring inherited.
2977 # This _could_ also use entity to inspect whether the filename template
2978 # involves placeholders other than the required dimensions for its
2979 # dataset type, but that's not necessary for correctness; it just
2980 # enables more optimizations (perhaps only in theory).
2981 return transfer not in ("direct", None)
2983 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2984 # Docstring inherited from the base class.
2985 record_data = data.get(self.name)
2986 if not record_data:
2987 return
2989 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records)
2991 # TODO: Verify that there are no unexpected table names in the dict?
2992 unpacked_records = []
2993 for dataset_data in record_data.records.values():
2994 records = dataset_data.get(self._table.name)
2995 if records:
2996 for info in records:
2997 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2998 unpacked_records.append(info.to_record())
2999 if unpacked_records:
3000 self._table.insert(*unpacked_records, transaction=self._transaction)
3002 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
3003 # Docstring inherited from the base class.
3004 exported_refs = list(self._bridge.check(refs))
3005 ids = {ref.id for ref in exported_refs}
3006 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
3007 for row in self._table.fetch(dataset_id=ids):
3008 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
3009 dataset_records = records.setdefault(info.dataset_id, {})
3010 dataset_records.setdefault(self._table.name, []).append(info)
3012 record_data = DatastoreRecordData(records=records)
3013 return {self.name: record_data}
3015 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
3016 # Docstring inherited from the base class.
3017 self._retrieve_dataset_method = method
3019 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
3020 """Update dataset reference to use the storage class from registry."""
3021 if self._retrieve_dataset_method is None:
3022 # We could raise an exception here but unit tests do not define
3023 # this method.
3024 return ref
3025 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
3026 if dataset_type is not None:
3027 ref = ref.overrideStorageClass(dataset_type.storageClass)
3028 return ref