Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10% (996 statements)
coverage.py v7.3.2, created at 2023-10-12 09:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Generic file-based datastore code."""
30from __future__ import annotations
32__all__ = ("FileDatastore",)
34import contextlib
35import hashlib
36import logging
37from collections import defaultdict
38from collections.abc import Callable, Iterable, Mapping, Sequence
39from dataclasses import dataclass
40from typing import TYPE_CHECKING, Any, ClassVar
42from lsst.daf.butler import (
43 Config,
44 DatasetId,
45 DatasetRef,
46 DatasetType,
47 DatasetTypeNotSupportedError,
48 Datastore,
49 FileDataset,
50 FileDescriptor,
51 Formatter,
52 FormatterFactory,
53 Location,
54 LocationFactory,
55 Progress,
56 StorageClass,
57 ddl,
58)
59from lsst.daf.butler.datastore import DatasetRefURIs, DatastoreConfig, DatastoreValidationError
60from lsst.daf.butler.datastore.cache_manager import (
61 AbstractDatastoreCacheManager,
62 DatastoreCacheManager,
63 DatastoreDisabledCacheManager,
64)
65from lsst.daf.butler.datastore.composites import CompositesMap
66from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError
67from lsst.daf.butler.datastore.record_data import DatastoreRecordData
68from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo
69from lsst.daf.butler.registry.interfaces import (
70 DatabaseInsertMode,
71 DatastoreRegistryBridge,
72 FakeDatasetRef,
73 ReadOnlyDatabaseError,
74)
75from lsst.daf.butler.repo_relocation import replaceRoot
76from lsst.daf.butler.utils import transactional
77from lsst.resources import ResourcePath, ResourcePathExpression
78from lsst.utils.introspection import get_class_of, get_instance_of
79from lsst.utils.iteration import chunk_iterable
81# For VERBOSE logging usage.
82from lsst.utils.logging import VERBOSE, getLogger
83from lsst.utils.timer import time_this
84from sqlalchemy import BigInteger, String
86from ..datastore.generic_base import GenericBaseDatastore
88if TYPE_CHECKING:
89 from lsst.daf.butler import LookupKey
90 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
92log = getLogger(__name__)
95class _IngestPrepData(Datastore.IngestPrepData):
96 """Helper class for FileDatastore ingest implementation.
98 Parameters
99 ----------
100 datasets : `~collections.abc.Iterable` of `FileDataset`
101 Files to be ingested by this datastore.
102 """
104 def __init__(self, datasets: Iterable[FileDataset]):
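# Flatten the refs from every FileDataset for the base-class bookkeeping and
# keep the datasets themselves for use during the ingest step.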
105 super().__init__(ref for dataset in datasets for ref in dataset.refs)
106 self.datasets = datasets
109@dataclass(frozen=True)
110class DatastoreFileGetInformation:
111 """Collection of useful parameters needed to retrieve a file from
112 a Datastore.
113 """
115 location: Location
116 """The location from which to read the dataset."""
118 formatter: Formatter
119 """The `Formatter` to use to deserialize the dataset."""
121 info: StoredFileInfo
122 """Stored information about this file and its formatter."""
124 assemblerParams: Mapping[str, Any]
125 """Parameters to use for post-processing the retrieved dataset."""
127 formatterParams: Mapping[str, Any]
128 """Parameters that were understood by the associated formatter."""
130 component: str | None
131 """The component to be retrieved (can be `None`)."""
133 readStorageClass: StorageClass
134 """The `StorageClass` of the dataset being read."""
137class FileDatastore(GenericBaseDatastore):
138 """Generic Datastore for file-based implementations.
140 Should always be sub-classed since key abstract methods are missing.
142 Parameters
143 ----------
144 config : `DatastoreConfig` or `str`
145 Configuration as either a `Config` object or URI to file.
146 bridgeManager : `DatastoreRegistryBridgeManager`
147 Object that manages the interface between `Registry` and datastores.
148 butlerRoot : `str`, optional
149 New datastore root to use to override the configuration value.
151 Raises
152 ------
153 ValueError
154 If root location does not exist and ``create`` is `False` in the
155 configuration.
156 """
158 defaultConfigFile: ClassVar[str | None] = None
159 """Path to configuration defaults. Accessed within the ``config`` resource
160 or relative to a search path. Can be None if no defaults specified.
161 """
163 root: ResourcePath
164 """Root directory URI of this `Datastore`."""
166 locationFactory: LocationFactory
167 """Factory for creating locations relative to the datastore root."""
169 formatterFactory: FormatterFactory
170 """Factory for creating instances of formatters."""
172 templates: FileTemplates
173 """File templates that can be used by this `Datastore`."""
175 composites: CompositesMap
176 """Determines whether a dataset should be disassembled on put."""
178 defaultConfigFile = "datastores/fileDatastore.yaml"
179 """Path to configuration defaults. Accessed within the ``config`` resource
180 or relative to a search path. Can be None if no defaults specified.
181 """
183 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
184 """Callable that is used in trusted mode to retrieve registry definition
185 of a named dataset type.
186 """
188 @classmethod
189 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
190 """Set any filesystem-dependent config options for this Datastore to
191 be appropriate for a new empty repository with the given root.
193 Parameters
194 ----------
195 root : `str`
196 URI to the root of the data repository.
197 config : `Config`
198 A `Config` to update. Only the subset understood by
199 this component will be updated. Will not expand
200 defaults.
201 full : `Config`
202 A complete config with all defaults expanded that can be
203 converted to a `DatastoreConfig`. Read-only and will not be
204 modified by this method.
205 Repository-specific options that should not be obtained
206 from defaults when Butler instances are constructed
207 should be copied from ``full`` to ``config``.
208 overwrite : `bool`, optional
209 If `False`, do not modify a value in ``config`` if the value
210 already exists. Default is always to overwrite with the provided
211 ``root``.
213 Notes
214 -----
215 If a keyword is explicitly defined in the supplied ``config`` it
216 will not be overridden by this method if ``overwrite`` is `False`.
217 This allows explicit values set in external configs to be retained.
218 """
219 Config.updateParameters(
220 DatastoreConfig,
221 config,
222 full,
223 toUpdate={"root": root},
224 toCopy=("cls", ("records", "table")),
225 overwrite=overwrite,
226 )
228 @classmethod
229 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
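# Specification of the opaque records table: one row per dataset_id and
# component, storing the file path, formatter, storage class and optional
# validation info (checksum, file size).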
230 return ddl.TableSpec(
231 fields=[
232 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
233 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
234 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
235 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
236 # Use empty string to indicate no component
237 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
238 # TODO: should checksum be Base64Bytes instead?
239 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
240 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
241 ],
242 unique=frozenset(),
243 indexes=[ddl.IndexSpec("path")],
244 )
246 def __init__(
247 self,
248 config: DatastoreConfig | ResourcePathExpression,
249 bridgeManager: DatastoreRegistryBridgeManager,
250 butlerRoot: str | None = None,
251 ):
252 super().__init__(config, bridgeManager)
253 if "root" not in self.config:
254 raise ValueError("No root directory specified in configuration")
256 self._bridgeManager = bridgeManager
258 # Name ourselves either using an explicit name or a name
259 # derived from the (unexpanded) root
260 if "name" in self.config:
261 self.name = self.config["name"]
262 else:
263 # We use the unexpanded root in the name to indicate that this
264 # datastore can be moved without having to update registry.
265 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
267 # Support repository relocation in config
268 # Existence of self.root is checked in subclass
269 self.root = ResourcePath(
270 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
271 )
273 self.locationFactory = LocationFactory(self.root)
274 self.formatterFactory = FormatterFactory()
276 # Now associate formatters with storage classes
277 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
279 # Read the file naming templates
280 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
282 # See if composites should be disassembled
283 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
285 tableName = self.config["records", "table"]
286 try:
287 # Storage of paths and formatters, keyed by dataset_id
288 self._table = bridgeManager.opaque.register(
289 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
290 )
291 # Interface to Registry.
292 self._bridge = bridgeManager.register(self.name)
293 except ReadOnlyDatabaseError:
294 # If the database is read only and we just tried and failed to
295 # create a table, it means someone is trying to create a read-only
296 # butler client for an empty repo. That should be okay, as long
297 # as they then try to get any datasets before some other client
298 # creates the table. Chances are they're just validating
299 # configuration.
300 pass
302 # Determine whether checksums should be used - default to False
303 self.useChecksum = self.config.get("checksum", False)
305 # Determine whether we can fall back to configuration if a
306 # requested dataset is not known to registry
307 self.trustGetRequest = self.config.get("trust_get_request", False)
309 # Create a cache manager
310 self.cacheManager: AbstractDatastoreCacheManager
311 if "cached" in self.config:
312 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
313 else:
314 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
316 # Check existence and create directory structure if necessary
317 if not self.root.exists():
318 if "create" not in self.config or not self.config["create"]:
319 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
320 try:
321 self.root.mkdir()
322 except Exception as e:
323 raise ValueError(
324 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
325 ) from e
327 def __str__(self) -> str:
328 return str(self.root)
330 @property
331 def bridge(self) -> DatastoreRegistryBridge:
332 return self._bridge
334 @property
335 def roots(self) -> dict[str, ResourcePath | None]:
336 # Docstring inherited.
337 return {self.name: self.root}
339 def _artifact_exists(self, location: Location) -> bool:
340 """Check that an artifact exists in this datastore at the specified
341 location.
343 Parameters
344 ----------
345 location : `Location`
346 Expected location of the artifact associated with this datastore.
348 Returns
349 -------
350 exists : `bool`
351 `True` if the location can be found, `False` otherwise.
352 """
353 log.debug("Checking if resource exists: %s", location.uri)
354 return location.uri.exists()
356 def _delete_artifact(self, location: Location) -> None:
357 """Delete the artifact from the datastore.
359 Parameters
360 ----------
361 location : `Location`
362 Location of the artifact associated with this datastore.
363 """
364 if location.pathInStore.isabs():
365 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
367 try:
368 location.uri.remove()
369 except FileNotFoundError:
370 log.debug("File %s did not exist and so could not be deleted.", location.uri)
371 raise
372 except Exception as e:
373 log.critical("Failed to delete file: %s (%s)", location.uri, e)
374 raise
375 log.debug("Successfully deleted file: %s", location.uri)
377 def addStoredItemInfo(
378 self,
379 refs: Iterable[DatasetRef],
380 infos: Iterable[StoredFileInfo],
381 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
382 ) -> None:
383 # Docstring inherited from GenericBaseDatastore
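# Convert each (ref, info) pair into an opaque-table record before applying
# the requested insert mode.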
384 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos, strict=True)]
385 match insert_mode:
386 case DatabaseInsertMode.INSERT:
387 self._table.insert(*records, transaction=self._transaction)
388 case DatabaseInsertMode.ENSURE:
389 self._table.ensure(*records, transaction=self._transaction)
390 case DatabaseInsertMode.REPLACE:
391 self._table.replace(*records, transaction=self._transaction)
392 case _:
393 raise ValueError(f"Unknown insert mode of '{insert_mode}'")
395 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]:
396 # Docstring inherited from GenericBaseDatastore
398 # Look for the dataset_id -- there might be multiple matches
399 # if we have disassembled the dataset.
400 records = self._table.fetch(dataset_id=ref.id)
401 return [StoredFileInfo.from_record(record) for record in records]
403 def _get_stored_records_associated_with_refs(
404 self, refs: Iterable[DatasetIdRef]
405 ) -> dict[DatasetId, list[StoredFileInfo]]:
406 """Retrieve all records associated with the provided refs.
408 Parameters
409 ----------
410 refs : iterable of `DatasetIdRef`
411 The refs for which records are to be retrieved.
413 Returns
414 -------
415 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
416 The matching records indexed by the ref ID. The number of entries
417 in the dict can be smaller than the number of requested refs.
418 """
419 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
421 # Uniqueness is dataset_id + component so can have multiple records
422 # per ref.
423 records_by_ref = defaultdict(list)
424 for record in records:
425 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
426 return records_by_ref
428 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
429 """Return paths and associated dataset refs.
431 Parameters
432 ----------
433 paths : `list` of `str` or `lsst.resources.ResourcePath`
434 All the paths to include in search.
436 Returns
437 -------
438 mapping : `dict` of [`str`, `set` [`DatasetId`]]
439 Mapping of each path to a set of associated database IDs.
440 """
441 records = self._table.fetch(path=[str(path) for path in paths])
442 result = defaultdict(set)
443 for row in records:
444 result[row["path"]].add(row["dataset_id"])
445 return result
447 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
448 """Return all dataset refs associated with the supplied path.
450 Parameters
451 ----------
452 pathInStore : `lsst.resources.ResourcePath`
453 Path of interest in the data store.
455 Returns
456 -------
457 ids : `set` [`DatasetId`]
458 All `DatasetRef` IDs associated with this path.
459 """
460 records = list(self._table.fetch(path=str(pathInStore)))
461 ids = {r["dataset_id"] for r in records}
462 return ids
464 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
465 # Docstring inherited from GenericBaseDatastore
466 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
468 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]:
469 r"""Find all the `Location`\ s of the requested dataset in the
470 `Datastore` and the associated stored file information.
472 Parameters
473 ----------
474 ref : `DatasetRef`
475 Reference to the required `Dataset`.
477 Returns
478 -------
479 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
480 Location of the dataset within the datastore and
481 stored information about each file and its formatter.
482 """
483 # Get the file information (this will fail if no file)
484 records = self.getStoredItemsInfo(ref)
486 # Use the path to determine the location -- we need to take
487 # into account absolute URIs in the datastore record
488 return [(r.file_location(self.locationFactory), r) for r in records]
490 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
491 """Check that there is only one dataset associated with the
492 specified artifact.
494 Parameters
495 ----------
496 ref : `DatasetRef` or `FakeDatasetRef`
497 Dataset to be removed.
498 location : `Location`
499 The location of the artifact to be removed.
501 Returns
502 -------
503 can_remove : `bool`
504 `True` if the artifact can be safely removed.
505 """
506 # Can't ever delete absolute URIs.
507 if location.pathInStore.isabs():
508 return False
510 # Get all entries associated with this path
511 allRefs = self._registered_refs_per_artifact(location.pathInStore)
512 if not allRefs:
513 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
515 # Remove these refs from all the refs and if there is nothing left
516 # then we can delete
517 remainingRefs = allRefs - {ref.id}
519 if remainingRefs:
520 return False
521 return True
523 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
524 """Predict the location and related file information of the requested
525 dataset in this datastore.
527 Parameters
528 ----------
529 ref : `DatasetRef`
530 Reference to the required `Dataset`.
532 Returns
533 -------
534 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
535 Expected Location of the dataset within the datastore and
536 placeholder information about each file and its formatter.
538 Notes
539 -----
540 Uses the current configuration to determine how we would expect the
541 datastore files to have been written if we couldn't ask registry.
542 This is safe so long as there has been no change to datastore
543 configuration between writing the dataset and wanting to read it.
544 Will not work for files that have been ingested without using the
545 standard file template or default formatter.
546 """
547 # If we have a component ref we always need to ask the questions
548 # of the composite. If the composite is disassembled this routine
549 # should return all components. If the composite was not
550 # disassembled the composite is what is stored regardless of
551 # component request. Note that if the caller has disassembled
552 # a composite there is no way for this guess to know that
553 # without trying both the composite and component ref and seeing
554 # if there is something at the component Location even without
555 # disassembly being enabled.
556 if ref.datasetType.isComponent():
557 ref = ref.makeCompositeRef()
559 # See if the ref is a composite that should be disassembled
560 doDisassembly = self.composites.shouldBeDisassembled(ref)
562 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
564 if doDisassembly:
565 for component, componentStorage in ref.datasetType.storageClass.components.items():
566 compRef = ref.makeComponentRef(component)
567 location, formatter = self._determine_put_formatter_location(compRef)
568 all_info.append((location, formatter, componentStorage, component))
570 else:
571 # Always use the composite ref if no disassembly
572 location, formatter = self._determine_put_formatter_location(ref)
573 all_info.append((location, formatter, ref.datasetType.storageClass, None))
575 # Convert the list of tuples to have StoredFileInfo as second element
576 return [
577 (
578 location,
579 StoredFileInfo(
580 formatter=formatter,
581 path=location.pathInStore.path,
582 storageClass=storageClass,
583 component=component,
584 checksum=None,
585 file_size=-1,
586 dataset_id=ref.id,
587 ),
588 )
589 for location, formatter, storageClass, component in all_info
590 ]
592 def _prepare_for_get(
593 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
594 ) -> list[DatastoreFileGetInformation]:
595 """Check parameters for ``get`` and obtain formatter and
596 location.
598 Parameters
599 ----------
600 ref : `DatasetRef`
601 Reference to the required Dataset.
602 parameters : `dict`
603 `StorageClass`-specific parameters that specify, for example,
604 a slice of the dataset to be loaded.
606 Returns
607 -------
608 getInfo : `list` [`DatastoreFileGetInformation`]
609 Parameters needed to retrieve each file.
610 """
611 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
613 # The storage class we want to use eventually
614 refStorageClass = ref.datasetType.storageClass
616 # For trusted mode need to reset storage class.
617 ref = self._cast_storage_class(ref)
619 # Get file metadata and internal metadata
620 fileLocations = self._get_dataset_locations_info(ref)
621 if not fileLocations:
622 if not self.trustGetRequest:
623 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
624 # Assume the dataset is where we think it should be
625 fileLocations = self._get_expected_dataset_locations_info(ref)
627 if len(fileLocations) > 1:
628 disassembled = True
630 # If trust is involved it is possible that there will be
631 # components listed here that do not exist in the datastore.
632 # Explicitly check for file artifact existence and filter out any
633 # that are missing.
634 if self.trustGetRequest:
635 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
637 # For now complain only if we have no components at all. One
638 # component is probably a problem but we can punt that to the
639 # assembler.
640 if not fileLocations:
641 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
643 else:
644 disassembled = False
646 # Is this a component request?
647 refComponent = ref.datasetType.component()
649 fileGetInfo = []
650 for location, storedFileInfo in fileLocations:
651 # The storage class used to write the file
652 writeStorageClass = storedFileInfo.storageClass
654 # If this has been disassembled we need read to match the write
655 if disassembled:
656 readStorageClass = writeStorageClass
657 else:
658 readStorageClass = refStorageClass
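# Instantiate the formatter recorded when the file was written, configured
# to read back with the requested storage class.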
660 formatter = get_instance_of(
661 storedFileInfo.formatter,
662 FileDescriptor(
663 location,
664 readStorageClass=readStorageClass,
665 storageClass=writeStorageClass,
666 parameters=parameters,
667 ),
668 ref.dataId,
669 )
671 formatterParams, notFormatterParams = formatter.segregateParameters()
673 # Of the remaining parameters, extract the ones supported by
674 # this StorageClass (for components not all will be handled)
675 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
677 # The ref itself could be a component if the dataset was
678 # disassembled by butler, or we disassembled in datastore and
679 # components came from the datastore records
680 component = storedFileInfo.component if storedFileInfo.component else refComponent
682 fileGetInfo.append(
683 DatastoreFileGetInformation(
684 location,
685 formatter,
686 storedFileInfo,
687 assemblerParams,
688 formatterParams,
689 component,
690 readStorageClass,
691 )
692 )
694 return fileGetInfo
696 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
697 """Check the arguments for ``put`` and obtain formatter and
698 location.
700 Parameters
701 ----------
702 inMemoryDataset : `object`
703 The dataset to store.
704 ref : `DatasetRef`
705 Reference to the associated Dataset.
707 Returns
708 -------
709 location : `Location`
710 The location to write the dataset.
711 formatter : `Formatter`
712 The `Formatter` to use to write the dataset.
714 Raises
715 ------
716 TypeError
717 Supplied object and storage class are inconsistent.
718 DatasetTypeNotSupportedError
719 The associated `DatasetType` is not handled by this datastore.
720 """
721 self._validate_put_parameters(inMemoryDataset, ref)
722 return self._determine_put_formatter_location(ref)
724 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
725 """Calculate the formatter and output location to use for put.
727 Parameters
728 ----------
729 ref : `DatasetRef`
730 Reference to the associated Dataset.
732 Returns
733 -------
734 location : `Location`
735 The location to write the dataset.
736 formatter : `Formatter`
737 The `Formatter` to use to write the dataset.
738 """
739 # Work out output file name
740 try:
741 template = self.templates.getTemplate(ref)
742 except KeyError as e:
743 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
745 # Validate the template to protect against filenames from different
746 # dataIds returning the same and causing overwrite confusion.
747 template.validateTemplate(ref)
749 location = self.locationFactory.fromPath(template.format(ref))
751 # Get the formatter based on the storage class
752 storageClass = ref.datasetType.storageClass
753 try:
754 formatter = self.formatterFactory.getFormatter(
755 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
756 )
757 except KeyError as e:
758 raise DatasetTypeNotSupportedError(
759 f"Unable to find formatter for {ref} in datastore {self.name}"
760 ) from e
762 # Now that we know the formatter, update the location
763 location = formatter.makeUpdatedLocation(location)
765 return location, formatter
767 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
768 # Docstring inherited from base class
769 if transfer != "auto":
770 return transfer
772 # See if the paths are within the datastore or not
773 inside = [self._pathInStore(d.path) is not None for d in datasets]
775 if all(inside):
776 transfer = None
777 elif not any(inside):
778 # Allow ResourcePath to use its own knowledge
779 transfer = "auto"
780 else:
781 # This can happen when importing from a datastore that
782 # has had some datasets ingested using "direct" mode.
783 # Allow ResourcePath to sort it out but warn about it.
786 log.warning(
787 "Some datasets are inside the datastore and some are outside. Using 'split' "
788 "transfer mode. This assumes that the files outside the datastore are "
789 "still accessible to the new butler since they will not be copied into "
790 "the target datastore."
791 )
792 transfer = "split"
794 return transfer
796 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
797 """Return path relative to datastore root.
799 Parameters
800 ----------
801 path : `lsst.resources.ResourcePathExpression`
802 Path to dataset. Can be an absolute URI. If relative, assumed
803 to be relative to the datastore. The path within the datastore
804 is returned, or `None` if the path is outside.
806 Returns
807 -------
808 inStore : `str` or `None`
809 Path relative to datastore root. Returns `None` if the file is
810 outside the root.
811 """
812 # Relative path will always be relative to datastore
813 pathUri = ResourcePath(path, forceAbsolute=False)
814 return pathUri.relative_to(self.root)
816 def _standardizeIngestPath(
817 self, path: str | ResourcePath, *, transfer: str | None = None
818 ) -> str | ResourcePath:
819 """Standardize the path of a to-be-ingested file.
821 Parameters
822 ----------
823 path : `str` or `lsst.resources.ResourcePath`
824 Path of a file to be ingested. This parameter is not expected
825 to accept all the types that can be used to construct a
826 `~lsst.resources.ResourcePath`.
827 transfer : `str`, optional
828 How (and whether) the dataset should be added to the datastore.
829 See `ingest` for details of transfer modes.
830 This implementation is provided only so
831 `NotImplementedError` can be raised if the mode is not supported;
832 actual transfers are deferred to `_extractIngestInfo`.
834 Returns
835 -------
836 path : `str` or `lsst.resources.ResourcePath`
837 New path in what the datastore considers standard form. If an
838 absolute URI was given that will be returned unchanged.
840 Notes
841 -----
842 Subclasses of `FileDatastore` can implement this method instead
843 of `_prepIngest`. It should not modify the data repository or given
844 file in any way.
846 Raises
847 ------
848 NotImplementedError
849 Raised if the datastore does not support the given transfer mode
850 (including the case where ingest is not supported at all).
851 FileNotFoundError
852 Raised if one of the given files does not exist.
853 """
854 if transfer not in (None, "direct", "split") + self.root.transferModes:
855 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
857 # A relative URI indicates relative to datastore root
858 srcUri = ResourcePath(path, forceAbsolute=False)
859 if not srcUri.isabs():
860 srcUri = self.root.join(path)
862 if not srcUri.exists():
863 raise FileNotFoundError(
864 f"Resource at {srcUri} does not exist; note that paths to ingest "
865 f"are assumed to be relative to {self.root} unless they are absolute."
866 )
868 if transfer is None:
869 relpath = srcUri.relative_to(self.root)
870 if not relpath:
871 raise RuntimeError(
872 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
873 )
875 # Return the relative path within the datastore for internal
876 # transfer
877 path = relpath
879 return path
881 def _extractIngestInfo(
882 self,
883 path: ResourcePathExpression,
884 ref: DatasetRef,
885 *,
886 formatter: Formatter | type[Formatter],
887 transfer: str | None = None,
888 record_validation_info: bool = True,
889 ) -> StoredFileInfo:
890 """Relocate (if necessary) and extract `StoredFileInfo` from a
891 to-be-ingested file.
893 Parameters
894 ----------
895 path : `lsst.resources.ResourcePathExpression`
896 URI or path of a file to be ingested.
897 ref : `DatasetRef`
898 Reference for the dataset being ingested. Guaranteed to have
899 a non-`None` ``dataset_id``.
900 formatter : `type` or `Formatter`
901 `Formatter` subclass to use for this dataset or an instance.
902 transfer : `str`, optional
903 How (and whether) the dataset should be added to the datastore.
904 See `ingest` for details of transfer modes.
905 record_validation_info : `bool`, optional
906 If `True`, the default, the datastore can record validation
907 information associated with the file. If `False` the datastore
908 will not attempt to track any information such as checksums
909 or file sizes. This can be useful if such information is tracked
910 in an external system or if the file is to be compressed in place.
911 It is up to the datastore whether this parameter is relevant.
913 Returns
914 -------
915 info : `StoredFileInfo`
916 Internal datastore record for this file. This will be inserted by
917 the caller; the `_extractIngestInfo` is only responsible for
918 creating and populating the struct.
920 Raises
921 ------
922 FileNotFoundError
923 Raised if one of the given files does not exist.
924 FileExistsError
925 Raised if transfer is not `None` but the (internal) location the
926 file would be moved to is already occupied.
927 """
928 if self._transaction is None:
929 raise RuntimeError("Ingest called without transaction enabled")
931 # Create URI of the source path, do not need to force a relative
932 # path to absolute.
933 srcUri = ResourcePath(path, forceAbsolute=False)
935 # Track whether we have read the size of the source yet
936 have_sized = False
938 tgtLocation: Location | None
939 if transfer is None or transfer == "split":
940 # A relative path is assumed to be relative to the datastore
941 # in this context
942 if not srcUri.isabs():
943 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
944 else:
945 # Work out the path in the datastore from an absolute URI
946 # This is required to be within the datastore.
947 pathInStore = srcUri.relative_to(self.root)
948 if pathInStore is None and transfer is None:
949 raise RuntimeError(
950 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
951 )
952 if pathInStore:
953 tgtLocation = self.locationFactory.fromPath(pathInStore)
954 elif transfer == "split":
955 # Outside the datastore but treat that as a direct ingest
956 # instead.
957 tgtLocation = None
958 else:
959 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
960 elif transfer == "direct":
961 # Want to store the full URI to the resource directly in
962 # datastore. This is useful for referring to permanent archive
963 # storage for raw data.
964 # Trust that people know what they are doing.
965 tgtLocation = None
966 else:
967 # Work out the name we want this ingested file to have
968 # inside the datastore
969 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
970 if not tgtLocation.uri.dirname().exists():
971 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
972 tgtLocation.uri.dirname().mkdir()
974 # if we are transferring from a local file to a remote location
975 # it may be more efficient to get the size and checksum of the
976 # local file rather than the transferred one
977 if record_validation_info and srcUri.isLocal:
978 size = srcUri.size()
979 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
980 have_sized = True
982 # Transfer the resource to the destination.
983 # Allow overwrite of an existing file. This matches the behavior
984 # of datastore.put() in that it trusts that registry would not
985 # be asking to overwrite unless registry thought that the
986 # overwrite was allowed.
987 tgtLocation.uri.transfer_from(
988 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
989 )
991 if tgtLocation is None:
992 # This means we are using direct mode
993 targetUri = srcUri
994 targetPath = str(srcUri)
995 else:
996 targetUri = tgtLocation.uri
997 targetPath = tgtLocation.pathInStore.path
999 # The file should exist in the datastore now.
1000 if record_validation_info:
1001 if not have_sized:
1002 size = targetUri.size()
1003 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
1004 else:
1005 # Not recording any file information.
1006 size = -1
1007 checksum = None
1009 return StoredFileInfo(
1010 formatter=formatter,
1011 path=targetPath,
1012 storageClass=ref.datasetType.storageClass,
1013 component=ref.datasetType.component(),
1014 file_size=size,
1015 checksum=checksum,
1016 dataset_id=ref.id,
1017 )
1019 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
1020 # Docstring inherited from Datastore._prepIngest.
1021 filtered = []
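# Keep only the datasets that have at least one ref acceptable to this
# datastore's constraints, resolving a formatter class and a standardized
# path for each.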
1022 for dataset in datasets:
1023 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1024 if not acceptable:
1025 continue
1026 else:
1027 dataset.refs = acceptable
1028 if dataset.formatter is None:
1029 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1030 else:
1031 assert isinstance(dataset.formatter, type | str)
1032 formatter_class = get_class_of(dataset.formatter)
1033 if not issubclass(formatter_class, Formatter):
1034 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1035 dataset.formatter = formatter_class
1036 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1037 filtered.append(dataset)
1038 return _IngestPrepData(filtered)
1040 @transactional
1041 def _finishIngest(
1042 self,
1043 prepData: Datastore.IngestPrepData,
1044 *,
1045 transfer: str | None = None,
1046 record_validation_info: bool = True,
1047 ) -> None:
1048 # Docstring inherited from Datastore._finishIngest.
1049 refsAndInfos = []
1050 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1051 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1052 # Do ingest as if the first dataset ref is associated with the file
1053 info = self._extractIngestInfo(
1054 dataset.path,
1055 dataset.refs[0],
1056 formatter=dataset.formatter,
1057 transfer=transfer,
1058 record_validation_info=record_validation_info,
1059 )
1060 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1062 # In direct mode we can allow repeated ingests of the same thing
1063 # if we are sure that the external dataset is immutable. We use
1064 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are
1065 # separated.
1066 refs_and_infos_replace = []
1067 refs_and_infos_insert = []
1068 if transfer == "direct":
1069 for entry in refsAndInfos:
1070 if entry[0].id.version == 5:
1071 refs_and_infos_replace.append(entry)
1072 else:
1073 refs_and_infos_insert.append(entry)
1074 else:
1075 refs_and_infos_insert = refsAndInfos
1077 if refs_and_infos_insert:
1078 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT)
1079 if refs_and_infos_replace:
1080 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE)
1082 def _calculate_ingested_datastore_name(
1083 self,
1084 srcUri: ResourcePath,
1085 ref: DatasetRef,
1086 formatter: Formatter | type[Formatter] | None = None,
1087 ) -> Location:
1088 """Given a source URI and a DatasetRef, determine the name the
1089 dataset will have inside the datastore.
1091 Parameters
1092 ----------
1093 srcUri : `lsst.resources.ResourcePath`
1094 URI to the source dataset file.
1095 ref : `DatasetRef`
1096 Ref associated with the newly-ingested dataset artifact. This
1097 is used to determine the name within the datastore.
1098 formatter : `Formatter` or `Formatter` subclass, optional
1099 Formatter to use for validation. Can be a class or an instance.
1100 No validation of the file extension is performed if the
1101 ``formatter`` is `None`. This can be used if the caller knows
1102 that the source URI and target URI will use the same formatter.
1104 Returns
1105 -------
1106 location : `Location`
1107 Target location for the newly-ingested dataset.
1108 """
1109 # Ingesting a file from outside the datastore.
1110 # This involves a new name.
1111 template = self.templates.getTemplate(ref)
1112 location = self.locationFactory.fromPath(template.format(ref))
1114 # Get the extension
1115 ext = srcUri.getExtension()
1117 # Update the destination to include that extension
1118 location.updateExtension(ext)
1120 # Ask the formatter to validate this extension
1121 if formatter is not None:
1122 formatter.validateExtension(location)
1124 return location
1126 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1127 """Write out in memory dataset to datastore.
1129 Parameters
1130 ----------
1131 inMemoryDataset : `object`
1132 Dataset to write to datastore.
1133 ref : `DatasetRef`
1134 Registry information associated with this dataset.
1136 Returns
1137 -------
1138 info : `StoredFileInfo`
1139 Information describing the artifact written to the datastore.
1140 """
1141 # May need to coerce the in memory dataset to the correct
1142 # python type, but first we need to make sure the storage class
1143 # reflects the one defined in the data repository.
1144 ref = self._cast_storage_class(ref)
1145 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1147 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1148 uri = location.uri
1150 if not uri.dirname().exists():
1151 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1152 uri.dirname().mkdir()
1154 if self._transaction is None:
1155 raise RuntimeError("Attempting to write artifact without transaction enabled")
1157 def _removeFileExists(uri: ResourcePath) -> None:
1158 """Remove a file and do not complain if it is not there.
1160 This is important since a formatter might fail before the file
1161 is written and we should not confuse people by writing spurious
1162 error messages to the log.
1163 """
1164 with contextlib.suppress(FileNotFoundError):
1165 uri.remove()
1167 # Register a callback to try to delete the uploaded data if
1168 # something fails below
1169 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1171 data_written = False
1173 # For remote URIs some datasets can be serialized directly
1174 # to bytes and sent to the remote datastore without writing a
1175 # file. If the dataset is intended to be saved to the cache
1176 # a file is always written and direct write to the remote
1177 # datastore is bypassed.
1178 if not uri.isLocal and not self.cacheManager.should_be_cached(ref):
1179 # Remote URI that is not cached so can write directly.
1180 try:
1181 serializedDataset = formatter.toBytes(inMemoryDataset)
1182 except NotImplementedError:
1183 # Fallback to the file writing option.
1184 pass
1185 except Exception as e:
1186 raise RuntimeError(
1187 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1188 ) from e
1189 else:
1190 log.debug("Writing bytes directly to %s", uri)
1191 uri.write(serializedDataset, overwrite=True)
1192 log.debug("Successfully wrote bytes directly to %s", uri)
1193 data_written = True
1195 if not data_written:
1196 # Did not write the bytes directly to object store so instead
1197 # write to temporary file. Always write to a temporary even if
1198 # using a local file system -- that gives us atomic writes.
1199 # If a process is killed as the file is being written we do not
1200 # want it to remain in the correct place but in a corrupt state.
1201 # For local files write to the output directory not temporary dir.
1202 prefix = uri.dirname() if uri.isLocal else None
1203 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1204 # Need to configure the formatter to write to a different
1205 # location and that needs us to overwrite internals
1206 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1207 with formatter._updateLocation(Location(None, temporary_uri)):
1208 try:
1209 formatter.write(inMemoryDataset)
1210 except Exception as e:
1211 raise RuntimeError(
1212 f"Failed to serialize dataset {ref} of type"
1213 f" {type(inMemoryDataset)} to "
1214 f"temporary location {temporary_uri}"
1215 ) from e
1217 # Use move for a local file since that becomes an efficient
1218 # os.rename. For remote resources we use copy to allow the
1219 # file to be cached afterwards.
1220 transfer = "move" if uri.isLocal else "copy"
1222 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1224 if transfer == "copy":
1225 # Cache if required
1226 self.cacheManager.move_to_cache(temporary_uri, ref)
1228 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1230 # URI is needed to resolve which ingest case we are dealing with.
1231 return self._extractIngestInfo(uri, ref, formatter=formatter)
1233 def _read_artifact_into_memory(
1234 self,
1235 getInfo: DatastoreFileGetInformation,
1236 ref: DatasetRef,
1237 isComponent: bool = False,
1238 cache_ref: DatasetRef | None = None,
1239 ) -> Any:
1240 """Read the artifact from datastore into in memory object.
1242 Parameters
1243 ----------
1244 getInfo : `DatastoreFileGetInformation`
1245 Information about the artifact within the datastore.
1246 ref : `DatasetRef`
1247 The registry information associated with this artifact.
1248 isComponent : `bool`
1249 Flag to indicate if a component is being read from this artifact.
1250 cache_ref : `DatasetRef`, optional
1251 The DatasetRef to use when looking up the file in the cache.
1252 This ref must have the same ID as the supplied ref but can
1253 be a parent ref or component ref to indicate to the cache whether
1254 a composite file is being requested from the cache or a component
1255 file. Without this the cache will default to the supplied ref but
1256 it can get confused with read-only derived components for
1257 disassembled composites.
1259 Returns
1260 -------
1261 inMemoryDataset : `object`
1262 The artifact as a python object.
1263 """
1264 location = getInfo.location
1265 uri = location.uri
1266 log.debug("Accessing data from %s", uri)
1268 if cache_ref is None:
1269 cache_ref = ref
1270 if cache_ref.id != ref.id:
1271 raise ValueError(
1272 "The supplied cache dataset ref refers to a different dataset than expected:"
1273 f" {ref.id} != {cache_ref.id}"
1274 )
1276 # Cannot recalculate checksum but can compare size as a quick check
1277 # Do not do this if the size is negative since that indicates
1278 # we do not know.
1279 recorded_size = getInfo.info.file_size
1280 resource_size = uri.size()
1281 if recorded_size >= 0 and resource_size != recorded_size:
1282 raise RuntimeError(
1283 "Integrity failure in Datastore. "
1284 f"Size of file {uri} ({resource_size}) "
1285 f"does not match size recorded in registry of {recorded_size}"
1286 )
1288 # For the general case we have choices for how to proceed.
1289 # 1. Always use a local file (downloading the remote resource to a
1290 # temporary file if needed).
1291 # 2. Use a threshold size and read into memory and use bytes.
1292 # Use both for now with an arbitrary hand off size.
1293 # This allows small datasets to be downloaded from remote object
1294 # stores without requiring a temporary file.
1296 formatter = getInfo.formatter
1297 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1298 if resource_size <= nbytes_max and formatter.can_read_bytes():
1299 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1300 if cached_file is not None:
1301 desired_uri = cached_file
1302 msg = f" (cached version of {uri})"
1303 else:
1304 desired_uri = uri
1305 msg = ""
1306 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1307 serializedDataset = desired_uri.read()
1308 log.debug(
1309 "Deserializing %s from %d bytes from location %s with formatter %s",
1310 f"component {getInfo.component}" if isComponent else "",
1311 len(serializedDataset),
1312 uri,
1313 formatter.name(),
1314 )
1315 try:
1316 result = formatter.fromBytes(
1317 serializedDataset, component=getInfo.component if isComponent else None
1318 )
1319 except Exception as e:
1320 raise ValueError(
1321 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1322 f" ({ref.datasetType.name} from {uri}): {e}"
1323 ) from e
1324 else:
1325 # Read from file.
1327 # Have to update the Location associated with the formatter
1328 # because formatter.read does not allow an override.
1329 # This could be improved.
1330 location_updated = False
1331 msg = ""
1333 # First check in cache for local version.
1334 # The cache will only be relevant for remote resources but
1335 # no harm in always asking. Context manager ensures that cache
1336 # file is not deleted during cache expiration.
1337 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1338 if cached_file is not None:
1339 msg = f"(via cache read of remote file {uri})"
1340 uri = cached_file
1341 location_updated = True
1343 with uri.as_local() as local_uri:
1344 can_be_cached = False
1345 if uri != local_uri:
1346 # URI was remote and file was downloaded
1347 cache_msg = ""
1348 location_updated = True
1350 if self.cacheManager.should_be_cached(cache_ref):
1351 # In this scenario we want to ask if the downloaded
1352 # file should be cached but we should not cache
1353 # it until after we've used it (to ensure it can't
1354 # be expired whilst we are using it).
1355 can_be_cached = True
1357 # Say that it is "likely" to be cached because
1358 # if the formatter read fails we will not be
1359 # caching this file.
1360 cache_msg = " and likely cached"
1362 msg = f"(via download to local file{cache_msg})"
1364 # Calculate the (possibly) new location for the formatter
1365 # to use.
1366 newLocation = Location(*local_uri.split()) if location_updated else None
1368 log.debug(
1369 "Reading%s from location %s %s with formatter %s",
1370 f" component {getInfo.component}" if isComponent else "",
1371 uri,
1372 msg,
1373 formatter.name(),
1374 )
1375 try:
1376 with (
1377 formatter._updateLocation(newLocation),
1378 time_this(
1379 log,
1380 msg="Reading%s from location %s %s with formatter %s",
1381 args=(
1382 f" component {getInfo.component}" if isComponent else "",
1383 uri,
1384 msg,
1385 formatter.name(),
1386 ),
1387 ),
1388 ):
1389 result = formatter.read(component=getInfo.component if isComponent else None)
1390 except Exception as e:
1391 raise ValueError(
1392 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1393 f" ({ref.datasetType.name} from {uri}): {e}"
1394 ) from e
1396 # File was read successfully so can move to cache
1397 if can_be_cached:
1398 self.cacheManager.move_to_cache(local_uri, cache_ref)
1400 return self._post_process_get(
1401 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
1402 )
1404 def knows(self, ref: DatasetRef) -> bool:
1405 """Check if the dataset is known to the datastore.
1407 Does not check for existence of any artifact.
1409 Parameters
1410 ----------
1411 ref : `DatasetRef`
1412 Reference to the required dataset.
1414 Returns
1415 -------
1416 exists : `bool`
1417 `True` if the dataset is known to the datastore.
1418 """
1419 fileLocations = self._get_dataset_locations_info(ref)
1420 if fileLocations:
1421 return True
1422 return False
1424 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1425 # Docstring inherited from the base class.
1427 # The records themselves. Could be missing some entries.
1428 records = self._get_stored_records_associated_with_refs(refs)
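# A ref is known to this datastore if at least one stored record exists
# for its dataset ID.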
1430 return {ref: ref.id in records for ref in refs}
1432 def _process_mexists_records(
1433 self,
1434 id_to_ref: dict[DatasetId, DatasetRef],
1435 records: dict[DatasetId, list[StoredFileInfo]],
1436 all_required: bool,
1437 artifact_existence: dict[ResourcePath, bool] | None = None,
1438 ) -> dict[DatasetRef, bool]:
1439 """Check given records for existence.
1441 Helper function for `mexists()`.
1443 Parameters
1444 ----------
1445 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1446 Mapping of the dataset ID to the dataset ref itself.
1447 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1448 Records as generally returned by
1449 ``_get_stored_records_associated_with_refs``.
1450 all_required : `bool`
1451 Flag to indicate whether all artifacts associated with a
1452 dataset ID must exist for the dataset to be considered present.
1453 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1454 Optional mapping of datastore artifact to existence. Updated by
1455 this method with details of all artifacts tested. Can be `None`
1456 if the caller is not interested.
1458 Returns
1459 -------
1460 existence : `dict` of [`DatasetRef`, `bool`]
1461 Mapping from dataset to boolean indicating existence.
1462 """
1463 # The URIs to be checked and a mapping of those URIs to
1464 # the dataset ID.
1465 uris_to_check: list[ResourcePath] = []
1466 location_map: dict[ResourcePath, DatasetId] = {}
1468 location_factory = self.locationFactory
1470 uri_existence: dict[ResourcePath, bool] = {}
1471 for ref_id, infos in records.items():
1472 # Key is the dataset ID, value is a list of StoredFileInfo.
1473 uris = [info.file_location(location_factory).uri for info in infos]
1474 location_map.update({uri: ref_id for uri in uris})
1476 # Check the local cache directly for a dataset corresponding
1477 # to the remote URI.
1478 if self.cacheManager.file_count > 0:
1479 ref = id_to_ref[ref_id]
1480 for uri, storedFileInfo in zip(uris, infos, strict=True):
1481 check_ref = ref
1482 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1483 check_ref = ref.makeComponentRef(component)
1484 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1485 # Proxy for URI existence.
1486 uri_existence[uri] = True
1487 else:
1488 uris_to_check.append(uri)
1489 else:
1490 # Check all of them.
1491 uris_to_check.extend(uris)
1493 if artifact_existence is not None:
1494 # If a URI has already been checked remove it from the list
1495 # and immediately add the status to the output dict.
1496 filtered_uris_to_check = []
1497 for uri in uris_to_check:
1498 if uri in artifact_existence:
1499 uri_existence[uri] = artifact_existence[uri]
1500 else:
1501 filtered_uris_to_check.append(uri)
1502 uris_to_check = filtered_uris_to_check
1504 # Results.
1505 dataset_existence: dict[DatasetRef, bool] = {}
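# Bulk existence check for any URIs not already resolved via the cache or
# the supplied artifact_existence mapping.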
1507 uri_existence.update(ResourcePath.mexists(uris_to_check))
1508 for uri, exists in uri_existence.items():
1509 dataset_id = location_map[uri]
1510 ref = id_to_ref[dataset_id]
1512 # Disassembled composite needs to check all locations.
1513 # all_required indicates whether all need to exist or not.
1514 if ref in dataset_existence:
1515 if all_required:
1516 exists = dataset_existence[ref] and exists
1517 else:
1518 exists = dataset_existence[ref] or exists
1519 dataset_existence[ref] = exists
1521 if artifact_existence is not None:
1522 artifact_existence.update(uri_existence)
1524 return dataset_existence
1526 def mexists(
1527 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1528 ) -> dict[DatasetRef, bool]:
1529 """Check the existence of multiple datasets at once.
1531 Parameters
1532 ----------
1533 refs : iterable of `DatasetRef`
1534 The datasets to be checked.
1535 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1536 Optional mapping of datastore artifact to existence. Updated by
1537 this method with details of all artifacts tested. Can be `None`
1538 if the caller is not interested.
1540 Returns
1541 -------
1542 existence : `dict` of [`DatasetRef`, `bool`]
1543 Mapping from dataset to boolean indicating existence.
1545 Notes
1546 -----
1547 To minimize potentially costly remote existence checks, the local
1548 cache is checked as a proxy for existence. If a cached file for
1549 this `DatasetRef` exists, no check is done for the actual URI. This
1550 could result in possibly unexpected behavior if the dataset itself
1551 has been removed from the datastore by another process whilst it is
1552 still in the cache.
1553 """
1554 chunk_size = 10_000
1555 dataset_existence: dict[DatasetRef, bool] = {}
1556 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1557 n_found_total = 0
1558 n_checked = 0
1559 n_chunks = 0
1560 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1561 chunk_result = self._mexists(chunk, artifact_existence)
1563 # The log message level and content depend on how many
1564 # datasets we are processing.
1565 n_results = len(chunk_result)
1567 # Use verbose logging to ensure that messages can be seen
1568 # easily if many refs are being checked.
1569 log_threshold = VERBOSE
1570 n_checked += n_results
1572 # This sum can take some time so only do it if we know the
1573 # result is going to be used.
1574 n_found = 0
1575 if log.isEnabledFor(log_threshold):
1576 # Can treat the booleans as 0, 1 integers and sum them.
1577 n_found = sum(chunk_result.values())
1578 n_found_total += n_found
1580 # We are deliberately not trying to count the number of refs
1581 # provided in case it's in the millions. This means there is a
1582 # situation where the number of refs exactly matches the chunk
1583 # size and we will switch to the multi-chunk path even though
1584 # we only have a single chunk.
1585 if n_results < chunk_size and n_chunks == 0:
1586 # Single chunk will be processed so we can provide more detail.
1587 if n_results == 1:
1588 ref = list(chunk_result)[0]
1589 # Use debug logging to be consistent with `exists()`.
1590 log.debug(
1591 "Calling mexists() with single ref that does%s exist (%s).",
1592 "" if chunk_result[ref] else " not",
1593 ref,
1594 )
1595 else:
1596 # Single chunk but multiple files. Summarize.
1597 log.log(
1598 log_threshold,
1599 "Number of datasets found in datastore: %d out of %d datasets checked.",
1600 n_found,
1601 n_checked,
1602 )
1604 else:
1605 # Use incremental verbose logging when we have multiple chunks.
1606 log.log(
1607 log_threshold,
1608 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1609 "(running total from all chunks so far: %d found out of %d checked)",
1610 n_chunks,
1611 n_found,
1612 n_results,
1613 n_found_total,
1614 n_checked,
1615 )
1616 dataset_existence.update(chunk_result)
1617 n_chunks += 1
1619 return dataset_existence
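# A minimal usage sketch for the bulk existence API, assuming a configured
# FileDatastore instance ``datastore`` and resolved ``DatasetRef`` objects
# ``refs`` obtained elsewhere (both names are illustrative):
#
#     artifact_existence: dict[ResourcePath, bool] = {}
#     existence = datastore.mexists(refs, artifact_existence)
#     missing = [ref for ref, ok in existence.items() if not ok]
#
# Passing the same ``artifact_existence`` dict to later calls lets URIs that
# were already checked be skipped, as described in the docstring above.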
1621 def _mexists(
1622 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1623 ) -> dict[DatasetRef, bool]:
1624 """Check the existence of multiple datasets at once.
1626 Parameters
1627 ----------
1628 refs : iterable of `DatasetRef`
1629 The datasets to be checked.
1630 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1631 Optional mapping of datastore artifact to existence. Updated by
1632 this method with details of all artifacts tested. Can be `None`
1633 if the caller is not interested.
1635 Returns
1636 -------
1637 existence : `dict` of [`DatasetRef`, `bool`]
1638 Mapping from dataset to boolean indicating existence.
1639 """
1640 # Make a mapping from refs with the internal storage class to the given
1641 # refs that may have a different one. We'll use the internal refs
1642 # throughout this method and convert back at the very end.
1643 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1645 # Need a mapping of dataset_id to (internal) dataset ref since some
1646 # internal APIs work with dataset_id.
1647 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1649 # Set of all IDs we are checking for.
1650 requested_ids = set(id_to_ref.keys())
1652 # The records themselves. Could be missing some entries.
1653 records = self._get_stored_records_associated_with_refs(id_to_ref.values())
1655 dataset_existence = self._process_mexists_records(
1656 id_to_ref, records, True, artifact_existence=artifact_existence
1657 )
1659 # Set of IDs that have been handled.
1660 handled_ids = {ref.id for ref in dataset_existence}
1662 missing_ids = requested_ids - handled_ids
1663 if missing_ids:
1664 dataset_existence.update(
1665 self._mexists_check_expected(
1666 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1667 )
1668 )
1670 return {
1671 internal_ref_to_input_ref[internal_ref]: existence
1672 for internal_ref, existence in dataset_existence.items()
1673 }
1675 def _mexists_check_expected(
1676 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1677 ) -> dict[DatasetRef, bool]:
1678 """Check existence of refs that are not known to datastore.
1680 Parameters
1681 ----------
1682 refs : iterable of `DatasetRef`
1683 The datasets to be checked. These are assumed not to be known
1684 to datastore.
1685 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1686 Optional mapping of datastore artifact to existence. Updated by
1687 this method with details of all artifacts tested. Can be `None`
1688 if the caller is not interested.
1690 Returns
1691 -------
1692 existence : `dict` of [`DatasetRef`, `bool`]
1693 Mapping from dataset to boolean indicating existence.
1694 """
1695 dataset_existence: dict[DatasetRef, bool] = {}
1696 if not self.trustGetRequest:
1697 # Must assume these do not exist
1698 for ref in refs:
1699 dataset_existence[ref] = False
1700 else:
1701 log.debug(
1702 "%d datasets were not known to datastore during initial existence check.",
1703 len(refs),
1704 )
1706 # Construct data structure identical to that returned
1707 # by _get_stored_records_associated_with_refs() but using
1708 # guessed names.
1709 records = {}
1710 id_to_ref = {}
1711 for missing_ref in refs:
1712 expected = self._get_expected_dataset_locations_info(missing_ref)
1713 dataset_id = missing_ref.id
1714 records[dataset_id] = [info for _, info in expected]
1715 id_to_ref[dataset_id] = missing_ref
1717 dataset_existence.update(
1718 self._process_mexists_records(
1719 id_to_ref,
1720 records,
1721 False,
1722 artifact_existence=artifact_existence,
1723 )
1724 )
1726 return dataset_existence
1728 def exists(self, ref: DatasetRef) -> bool:
1729 """Check if the dataset exists in the datastore.
1731 Parameters
1732 ----------
1733 ref : `DatasetRef`
1734 Reference to the required dataset.
1736 Returns
1737 -------
1738 exists : `bool`
1739 `True` if the entity exists in the `Datastore`.
1741 Notes
1742 -----
1743 The local cache is checked as a proxy for existence in the remote
1744 object store. It is possible that another process on a different
1745 compute node could remove the file from the object store even
1746 though it is present in the local cache.
1747 """
1748 ref = self._cast_storage_class(ref)
1749 fileLocations = self._get_dataset_locations_info(ref)
1751 # If we are being asked to trust that the registry might not be correct,
1752 # we ask for the expected locations and check them explicitly.
1753 if not fileLocations:
1754 if not self.trustGetRequest:
1755 return False
1757 # First check the cache. If it is not found we must check
1758 # the datastore itself. Assume that any component in the cache
1759 # means that the dataset does exist somewhere.
1760 if self.cacheManager.known_to_cache(ref):
1761 return True
1763 # When we are guessing a dataset location we can not check
1764 # for the existence of every component since we can not
1765 # know if every component was written. Instead we check
1766 # for the existence of any of the expected locations.
1767 for location, _ in self._get_expected_dataset_locations_info(ref):
1768 if self._artifact_exists(location):
1769 return True
1770 return False
1772 # All listed artifacts must exist.
1773 for location, storedFileInfo in fileLocations:
1774 # Checking in cache needs the component ref.
1775 check_ref = ref
1776 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1777 check_ref = ref.makeComponentRef(component)
1778 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1779 continue
1781 if not self._artifact_exists(location):
1782 return False
1784 return True
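# A minimal usage sketch, assuming a configured FileDatastore ``datastore``
# and a resolved ``DatasetRef`` named ``ref`` (illustrative names):
#
#     if datastore.exists(ref):
#         dataset = datastore.get(ref)
#
# For many refs, mexists() above is preferred since it batches the
# potentially expensive remote existence checks.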
1786 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1787 """Return URIs associated with dataset.
1789 Parameters
1790 ----------
1791 ref : `DatasetRef`
1792 Reference to the required dataset.
1793 predict : `bool`, optional
1794 If the datastore does not know about the dataset, should it
1795 return a predicted URI or not?
1797 Returns
1798 -------
1799 uris : `DatasetRefURIs`
1800 The URI to the primary artifact associated with this dataset (if
1801 the dataset was disassembled within the datastore this may be
1802 `None`), and the URIs to any components associated with the dataset
1803 artifact (these can be empty if there are no components).
1804 """
1805 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1806 return many[ref]
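# A minimal usage sketch, assuming a configured FileDatastore ``datastore``
# and a resolved ``DatasetRef`` named ``ref`` (illustrative names):
#
#     uris = datastore.getURIs(ref)
#     if uris.primaryURI is not None:
#         print("single artifact:", uris.primaryURI)
#     for component, uri in uris.componentURIs.items():
#         print("component", component, "stored at", uri)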
1808 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1809 """URI to the Dataset.
1811 Parameters
1812 ----------
1813 ref : `DatasetRef`
1814 Reference to the required Dataset.
1815 predict : `bool`
1816 If `True`, allow URIs to be returned of datasets that have not
1817 been written.
1819 Returns
1820 -------
1821 uri : `lsst.resources.ResourcePath`
1822 URI pointing to the dataset within the datastore. If the
1823 dataset does not exist in the datastore, and if ``predict`` is
1824 `True`, the URI will be a prediction and will include a URI
1825 fragment "#predicted".
1826 If the datastore does not have entities that relate well
1827 to the concept of a URI the returned URI will be
1828 descriptive. The returned URI is not guaranteed to be obtainable.
1830 Raises
1831 ------
1832 FileNotFoundError
1833 Raised if a URI has been requested for a dataset that does not
1834 exist and guessing is not allowed.
1835 RuntimeError
1836 Raised if a request is made for a single URI but multiple URIs
1837 are associated with this dataset.
1839 Notes
1840 -----
1841 When a predicted URI is requested an attempt will be made to form
1842 a reasonable URI based on file templates and the expected formatter.
1843 """
1844 primary, components = self.getURIs(ref, predict)
1845 if primary is None or components:
1846 raise RuntimeError(
1847 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1848 )
1849 return primary
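# A minimal sketch of URI prediction, assuming ``datastore`` and ``ref`` as
# above; the dataset need not have been written yet when ``predict=True``:
#
#     uri = datastore.getURI(ref, predict=True)
#     was_predicted = uri.geturl().endswith("#predicted")
#
# Datasets that were disassembled into components raise RuntimeError here;
# getURIs() must be used instead.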
1851 def _predict_URIs(
1852 self,
1853 ref: DatasetRef,
1854 ) -> DatasetRefURIs:
1855 """Predict the URIs of a dataset ref.
1857 Parameters
1858 ----------
1859 ref : `DatasetRef`
1860 Reference to the required Dataset.
1862 Returns
1863 -------
1864 uris : `DatasetRefURIs`
1865 Primary and component URIs. URIs will contain a URI fragment
1866 "#predicted".
1867 """
1868 uris = DatasetRefURIs()
1870 if self.composites.shouldBeDisassembled(ref):
1871 for component, _ in ref.datasetType.storageClass.components.items():
1872 comp_ref = ref.makeComponentRef(component)
1873 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1875 # Add the "#predicted" URI fragment to indicate this is a
1876 # guess
1877 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1879 else:
1880 location, _ = self._determine_put_formatter_location(ref)
1882 # Add the "#predicted" URI fragment to indicate this is a guess
1883 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1885 return uris
1887 def getManyURIs(
1888 self,
1889 refs: Iterable[DatasetRef],
1890 predict: bool = False,
1891 allow_missing: bool = False,
1892 ) -> dict[DatasetRef, DatasetRefURIs]:
1893 # Docstring inherited
1895 uris: dict[DatasetRef, DatasetRefURIs] = {}
1897 records = self._get_stored_records_associated_with_refs(refs)
1898 records_keys = records.keys()
1900 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1901 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1903 # Have to handle trustGetRequest mode by checking for the existence
1904 # of the missing refs on disk.
1905 if missing_refs:
1906 dataset_existence = self._mexists_check_expected(missing_refs, None)
1907 really_missing = set()
1908 not_missing = set()
1909 for ref, exists in dataset_existence.items():
1910 if exists:
1911 not_missing.add(ref)
1912 else:
1913 really_missing.add(ref)
1915 if not_missing:
1916 # Need to recalculate the missing/existing split.
1917 existing_refs = existing_refs + tuple(not_missing)
1918 missing_refs = tuple(really_missing)
1920 for ref in missing_refs:
1921 # if this has never been written then we have to guess
1922 if not predict:
1923 if not allow_missing:
1924 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1925 else:
1926 uris[ref] = self._predict_URIs(ref)
1928 for ref in existing_refs:
1929 file_infos = records[ref.id]
1930 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1931 uris[ref] = self._locations_to_URI(ref, file_locations)
1933 return uris
1935 def _locations_to_URI(
1936 self,
1937 ref: DatasetRef,
1938 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1939 ) -> DatasetRefURIs:
1940 """Convert one or more file locations associated with a DatasetRef
1941 to a DatasetRefURIs.
1943 Parameters
1944 ----------
1945 ref : `DatasetRef`
1946 Reference to the dataset.
1947 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
1948 Each item in the sequence is the location of the dataset within the
1949 datastore and stored information about the file and its formatter.
1950 If there is only one item in the sequence then it is treated as the
1951 primary URI. If there is more than one item then they are treated
1952 as component URIs. If there are no items then an error is raised
1953 unless ``self.trustGetRequest`` is `True`.
1955 Returns
1956 -------
1957 uris : `DatasetRefURIs`
1958 Represents the primary URI or component URIs described by the
1959 inputs.
1961 Raises
1962 ------
1963 RuntimeError
1964 If no file locations are passed in and ``self.trustGetRequest`` is
1965 `False`.
1966 FileNotFoundError
1967 If a passed-in URI does not exist, and ``self.trustGetRequest``
1968 is `False`.
1969 RuntimeError
1970 If a passed-in `StoredFileInfo`'s ``component`` is `None` (this is
1971 unexpected).
1972 """
1973 guessing = False
1974 uris = DatasetRefURIs()
1976 if not file_locations:
1977 if not self.trustGetRequest:
1978 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1979 file_locations = self._get_expected_dataset_locations_info(ref)
1980 guessing = True
1982 if len(file_locations) == 1:
1983 # No disassembly so this is the primary URI
1984 uris.primaryURI = file_locations[0][0].uri
1985 if guessing and not uris.primaryURI.exists():
1986 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1987 else:
1988 for location, file_info in file_locations:
1989 if file_info.component is None:
1990 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1991 if guessing and not location.uri.exists():
1992 # If we are trusting then it is entirely possible for
1993 # some components to be missing. In that case we skip
1994 # to the next component.
1995 if self.trustGetRequest:
1996 continue
1997 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1998 uris.componentURIs[file_info.component] = location.uri
2000 return uris
2002 def retrieveArtifacts(
2003 self,
2004 refs: Iterable[DatasetRef],
2005 destination: ResourcePath,
2006 transfer: str = "auto",
2007 preserve_path: bool = True,
2008 overwrite: bool = False,
2009 ) -> list[ResourcePath]:
2010 """Retrieve the file artifacts associated with the supplied refs.
2012 Parameters
2013 ----------
2014 refs : iterable of `DatasetRef`
2015 The datasets for which file artifacts are to be retrieved.
2016 A single ref can result in multiple files. The refs must
2017 be resolved.
2018 destination : `lsst.resources.ResourcePath`
2019 Location to write the file artifacts.
2020 transfer : `str`, optional
2021 Method to use to transfer the artifacts. Must be one of the options
2022 supported by `lsst.resources.ResourcePath.transfer_from()`.
2023 "move" is not allowed.
2024 preserve_path : `bool`, optional
2025 If `True` the full path of the file artifact within the datastore
2026 is preserved. If `False` the final file component of the path
2027 is used.
2028 overwrite : `bool`, optional
2029 If `True` allow transfers to overwrite existing files at the
2030 destination.
2032 Returns
2033 -------
2034 targets : `list` of `lsst.resources.ResourcePath`
2035 URIs of file artifacts in destination location. Order is not
2036 preserved.
2037 """
2038 if not destination.isdir():
2039 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
2041 if transfer == "move":
2042 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
2044 # Source -> Destination
2045 # This also helps filter out duplicate DatasetRef in the request
2046 # that will map to the same underlying file transfer.
2047 to_transfer: dict[ResourcePath, ResourcePath] = {}
2049 for ref in refs:
2050 locations = self._get_dataset_locations_info(ref)
2051 for location, _ in locations:
2052 source_uri = location.uri
2053 target_path: ResourcePathExpression
2054 if preserve_path:
2055 target_path = location.pathInStore
2056 if target_path.isabs():
2057 # This is an absolute path to an external file.
2058 # Use the full path.
2059 target_path = target_path.relativeToPathRoot
2060 else:
2061 target_path = source_uri.basename()
2062 target_uri = destination.join(target_path)
2063 to_transfer[source_uri] = target_uri
2065 # In theory can now parallelize the transfer
2066 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
2067 for source_uri, target_uri in to_transfer.items():
2068 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
2070 return list(to_transfer.values())
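# A minimal usage sketch, assuming ``datastore`` and resolved ``refs`` as
# before; the destination directory path is illustrative:
#
#     destination = ResourcePath("/tmp/artifact_export/", forceDirectory=True)
#     targets = datastore.retrieveArtifacts(
#         refs, destination, transfer="copy", preserve_path=True
#     )
#
# "move" is rejected, and the returned target URIs are not ordered.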
2072 def get(
2073 self,
2074 ref: DatasetRef,
2075 parameters: Mapping[str, Any] | None = None,
2076 storageClass: StorageClass | str | None = None,
2077 ) -> Any:
2078 """Load an InMemoryDataset from the store.
2080 Parameters
2081 ----------
2082 ref : `DatasetRef`
2083 Reference to the required Dataset.
2084 parameters : `dict`
2085 `StorageClass`-specific parameters that specify, for example,
2086 a slice of the dataset to be loaded.
2087 storageClass : `StorageClass` or `str`, optional
2088 The storage class to be used to override the Python type
2089 returned by this method. By default the returned type matches
2090 the dataset type definition for this dataset. Specifying a
2091 read `StorageClass` can force a different type to be returned.
2092 This type must be compatible with the original type.
2094 Returns
2095 -------
2096 inMemoryDataset : `object`
2097 Requested dataset or slice thereof as an InMemoryDataset.
2099 Raises
2100 ------
2101 FileNotFoundError
2102 Requested dataset can not be retrieved.
2103 TypeError
2104 Return value from formatter has unexpected type.
2105 ValueError
2106 Formatter failed to process the dataset.
2107 """
2108 # Supplied storage class for the component being read is either
2109 # from the ref itself or an override if we want to force
2110 # type conversion.
2111 if storageClass is not None:
2112 ref = ref.overrideStorageClass(storageClass)
2113 refStorageClass = ref.datasetType.storageClass
2115 allGetInfo = self._prepare_for_get(ref, parameters)
2116 refComponent = ref.datasetType.component()
2118 # Create mapping from component name to related info
2119 allComponents = {i.component: i for i in allGetInfo}
2121 # By definition the dataset is disassembled if we have more
2122 # than one record for it.
2123 isDisassembled = len(allGetInfo) > 1
2125 # Look for the special case where we are disassembled but the
2126 # component is a derived component that was not written during
2127 # disassembly. For this scenario we need to check that the
2128 # component requested is listed as a derived component for the
2129 # composite storage class
2130 isDisassembledReadOnlyComponent = False
2131 if isDisassembled and refComponent:
2132 # The composite storage class should be accessible through
2133 # the component dataset type
2134 compositeStorageClass = ref.datasetType.parentStorageClass
2136 # In the unlikely scenario where the composite storage
2137 # class is not known, we can only assume that this is a
2138 # normal component. If that assumption is wrong then the
2139 # branch below that reads a persisted component will fail
2140 # so there is no need to complain here.
2141 if compositeStorageClass is not None:
2142 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2144 if isDisassembled and not refComponent:
2145 # This was a disassembled dataset spread over multiple files
2146 # and we need to put them all back together again.
2147 # Read into memory and then assemble
2149 # Check that the supplied parameters are suitable for the type read
2150 refStorageClass.validateParameters(parameters)
2152 # We want to keep track of all the parameters that were not used
2153 # by formatters. We assume that if any of the component formatters
2154 # use a parameter that we do not need to apply it again in the
2155 # assembler.
2156 usedParams = set()
2158 components: dict[str, Any] = {}
2159 for getInfo in allGetInfo:
2160 # assemblerParams are parameters not understood by the
2161 # associated formatter.
2162 usedParams.update(set(getInfo.formatterParams))
2164 component = getInfo.component
2166 if component is None:
2167 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2169 # We do not want the formatter to think it's reading
2170 # a component though because it is really reading a
2171 # standalone dataset -- always tell reader it is not a
2172 # component.
2173 components[component] = self._read_artifact_into_memory(
2174 getInfo, ref.makeComponentRef(component), isComponent=False
2175 )
2177 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2179 # Any unused parameters will have to be passed to the assembler
2180 if parameters:
2181 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2182 else:
2183 unusedParams = {}
2185 # Process parameters
2186 return ref.datasetType.storageClass.delegate().handleParameters(
2187 inMemoryDataset, parameters=unusedParams
2188 )
2190 elif isDisassembledReadOnlyComponent:
2191 compositeStorageClass = ref.datasetType.parentStorageClass
2192 if compositeStorageClass is None:
2193 raise RuntimeError(
2194 f"Unable to retrieve derived component '{refComponent}' since"
2195 "no composite storage class is available."
2196 )
2198 if refComponent is None:
2199 # Mainly for mypy
2200 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2202 # Assume that every derived component can be calculated by
2203 # forwarding the request to a single read/write component.
2204 # Rather than guessing which rw component is the right one by
2205 # scanning each for a derived component of the same name,
2206 # we ask the storage class delegate directly which one is best to
2207 # use.
2208 compositeDelegate = compositeStorageClass.delegate()
2209 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2210 refComponent, set(allComponents)
2211 )
2213 # Select the relevant component
2214 rwInfo = allComponents[forwardedComponent]
2216 # For now assume that read parameters are validated against
2217 # the real component and not the requested component
2218 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2219 forwardedStorageClass.validateParameters(parameters)
2221 # The reference to use for the caching must refer to the forwarded
2222 # component and not the derived component.
2223 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2225 # Unfortunately the FileDescriptor inside the formatter will have
2226 # the wrong write storage class so we need to create a new one
2227 # given the immutability constraint.
2228 writeStorageClass = rwInfo.info.storageClass
2230 # We may need to put some thought into parameters for read
2231 # components but for now forward them on as is
2232 readFormatter = type(rwInfo.formatter)(
2233 FileDescriptor(
2234 rwInfo.location,
2235 readStorageClass=refStorageClass,
2236 storageClass=writeStorageClass,
2237 parameters=parameters,
2238 ),
2239 ref.dataId,
2240 )
2242 # The assembler can not receive any parameter requests for a
2243 # derived component at this time since the assembler will
2244 # see the storage class of the derived component and those
2245 # parameters will have to be handled by the formatter on the
2246 # forwarded storage class.
2247 assemblerParams: dict[str, Any] = {}
2249 # Need to create a new info that specifies the derived
2250 # component and associated storage class
2251 readInfo = DatastoreFileGetInformation(
2252 rwInfo.location,
2253 readFormatter,
2254 rwInfo.info,
2255 assemblerParams,
2256 {},
2257 refComponent,
2258 refStorageClass,
2259 )
2261 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2263 else:
2264 # Single file request or component from that composite file
2265 for lookup in (refComponent, None):
2266 if lookup in allComponents:
2267 getInfo = allComponents[lookup]
2268 break
2269 else:
2270 raise FileNotFoundError(
2271 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2272 )
2274 # Do not need the component itself if already disassembled
2275 if isDisassembled:
2276 isComponent = False
2277 else:
2278 isComponent = getInfo.component is not None
2280 # For a component read of a composite we want the cache to
2281 # be looking at the composite ref itself.
2282 cache_ref = ref.makeCompositeRef() if isComponent else ref
2284 # For a disassembled component we can validate parameters against
2285 # the component storage class directly
2286 if isDisassembled:
2287 refStorageClass.validateParameters(parameters)
2288 else:
2289 # For an assembled composite this could be a derived
2290 # component derived from a real component. The validity
2291 # of the parameters is not clear. For now validate against
2292 # the composite storage class
2293 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2295 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
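# A minimal usage sketch, assuming ``datastore`` and ``ref`` as before. The
# parameter name and storage class below are hypothetical placeholders, not
# values defined by this module:
#
#     dataset = datastore.get(ref)
#     converted = datastore.get(
#         ref, parameters={"some_parameter": some_value}, storageClass="SomeStorageClass"
#     )
#
# The read storage class must be compatible with the dataset's original type.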
2297 @transactional
2298 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2299 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2301 Parameters
2302 ----------
2303 inMemoryDataset : `object`
2304 The dataset to store.
2305 ref : `DatasetRef`
2306 Reference to the associated Dataset.
2308 Raises
2309 ------
2310 TypeError
2311 Supplied object and storage class are inconsistent.
2312 DatasetTypeNotSupportedError
2313 The associated `DatasetType` is not handled by this datastore.
2315 Notes
2316 -----
2317 If the datastore is configured to reject certain dataset types it
2318 is possible that the put will fail and raise a
2319 `DatasetTypeNotSupportedError`. The main use case for this is to
2320 allow `ChainedDatastore` to put to multiple datastores without
2321 requiring that every datastore accepts the dataset.
2322 """
2323 doDisassembly = self.composites.shouldBeDisassembled(ref)
2324 # doDisassembly = True
2326 artifacts = []
2327 if doDisassembly:
2328 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2329 if components is None:
2330 raise RuntimeError(
2331 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2332 f"with storage class {ref.datasetType.storageClass.name} "
2333 "is configured to be disassembled, but cannot be."
2334 )
2335 for component, componentInfo in components.items():
2336 # Don't recurse because we want to take advantage of
2337 # bulk insert -- we need a new DatasetRef that refers to the
2338 # same dataset_id but has the component DatasetType.
2339 # DatasetType does not refer to the types of components,
2340 # so we construct one ourselves.
2341 compRef = ref.makeComponentRef(component)
2342 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2343 artifacts.append((compRef, storedInfo))
2344 else:
2345 # Write the entire thing out
2346 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2347 artifacts.append((ref, storedInfo))
2349 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT)
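# A minimal usage sketch, assuming ``datastore``, an in-memory object
# ``in_memory_dataset`` of the correct Python type, and a resolved ``ref``
# (illustrative names):
#
#     try:
#         datastore.put(in_memory_dataset, ref)
#     except DatasetTypeNotSupportedError:
#         # This datastore is configured to reject this dataset type.
#         pass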
2351 @transactional
2352 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2353 # At this point can safely remove these datasets from the cache
2354 # to avoid confusion later on. If they are not trashed later
2355 # the cache will simply be refilled.
2356 self.cacheManager.remove_from_cache(ref)
2358 # If we are in trust mode there will be nothing to move to
2359 # the trash table and we will have to try to delete the file
2360 # immediately.
2361 if self.trustGetRequest:
2362 # Try to keep the logic below for a single file trash.
2363 if isinstance(ref, DatasetRef):
2364 refs = {ref}
2365 else:
2366 # Will recreate ref at the end of this branch.
2367 refs = set(ref)
2369 # Determine which datasets are known to datastore directly.
2370 id_to_ref = {ref.id: ref for ref in refs}
2371 existing_ids = self._get_stored_records_associated_with_refs(refs)
2372 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2374 missing = refs - existing_refs
2375 if missing:
2376 # Do an explicit existence check on these refs.
2377 # We only care about the artifacts at this point and not
2378 # the dataset existence.
2379 artifact_existence: dict[ResourcePath, bool] = {}
2380 _ = self.mexists(missing, artifact_existence)
2381 uris = [uri for uri, exists in artifact_existence.items() if exists]
2383 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2384 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2385 for uri in uris:
2386 try:
2387 uri.remove()
2388 except Exception as e:
2389 if ignore_errors:
2390 log.debug("Artifact %s could not be removed: %s", uri, e)
2391 continue
2392 raise
2394 # There is no point asking the code below to remove refs we
2395 # know are missing so update it with the list of existing
2396 # records. Try to retain one vs many logic.
2397 if not existing_refs:
2398 # Nothing more to do since none of the datasets were
2399 # known to the datastore record table.
2400 return
2401 ref = list(existing_refs)
2402 if len(ref) == 1:
2403 ref = ref[0]
2405 # Get file metadata and internal metadata
2406 if not isinstance(ref, DatasetRef):
2407 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2408 # Assumed to be an iterable of refs so bulk mode enabled.
2409 try:
2410 self.bridge.moveToTrash(ref, transaction=self._transaction)
2411 except Exception as e:
2412 if ignore_errors:
2413 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2414 else:
2415 raise
2416 return
2418 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2420 fileLocations = self._get_dataset_locations_info(ref)
2422 if not fileLocations:
2423 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2424 if ignore_errors:
2425 log.warning(err_msg)
2426 return
2427 else:
2428 raise FileNotFoundError(err_msg)
2430 for location, _ in fileLocations:
2431 if not self._artifact_exists(location):
2432 err_msg = (
2433 f"Dataset is known to datastore {self.name} but "
2434 f"associated artifact ({location.uri}) is missing"
2435 )
2436 if ignore_errors:
2437 log.warning(err_msg)
2438 return
2439 else:
2440 raise FileNotFoundError(err_msg)
2442 # Mark dataset as trashed
2443 try:
2444 self.bridge.moveToTrash([ref], transaction=self._transaction)
2445 except Exception as e:
2446 if ignore_errors:
2447 log.warning(
2448 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2449 "but encountered an error: %s",
2450 ref,
2451 self.name,
2452 e,
2453 )
2454 pass
2455 else:
2456 raise
2458 @transactional
2459 def emptyTrash(self, ignore_errors: bool = True) -> None:
2460 """Remove all datasets from the trash.
2462 Parameters
2463 ----------
2464 ignore_errors : `bool`
2465 If `True` return without error even if something went wrong.
2466 Problems could occur if another process is simultaneously trying
2467 to delete.
2468 """
2469 log.debug("Emptying trash in datastore %s", self.name)
2471 # Context manager will empty trash iff we finish it without raising.
2472 # It will also automatically delete the relevant rows from the
2473 # trash table and the records table.
2474 with self.bridge.emptyTrash(
2475 self._table, record_class=StoredFileInfo, record_column="path"
2476 ) as trash_data:
2477 # Removing the artifacts themselves requires that the files are
2478 # not also associated with refs that are not to be trashed.
2479 # Therefore need to do a query with the file paths themselves
2480 # and return all the refs associated with them. Can only delete
2481 # a file if the refs to be trashed are the only refs associated
2482 # with the file.
2483 # This requires multiple copies of the trashed items
2484 trashed, artifacts_to_keep = trash_data
2486 if artifacts_to_keep is None:
2487 # The bridge is not helping us so have to work it out
2488 # ourselves. This is not going to be as efficient.
2489 trashed = list(trashed)
2491 # The instance check is for mypy since up to this point it
2492 # does not know the type of info.
2493 path_map = self._refs_associated_with_artifacts(
2494 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2495 )
2497 for ref, info in trashed:
2498 # Mypy needs to know this is not the base class
2499 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2501 path_map[info.path].remove(ref.id)
2502 if not path_map[info.path]:
2503 del path_map[info.path]
2505 artifacts_to_keep = set(path_map)
2507 for ref, info in trashed:
2508 # Should not happen for this implementation but need
2509 # to keep mypy happy.
2510 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2512 # Mypy needs to know this is not the base class
2513 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2515 if info.path in artifacts_to_keep:
2516 # This is a multi-dataset artifact and we are not
2517 # removing all associated refs.
2518 continue
2520 # Only trashed refs still known to datastore will be returned.
2521 location = info.file_location(self.locationFactory)
2523 # Point of no return for this artifact
2524 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2525 try:
2526 self._delete_artifact(location)
2527 except FileNotFoundError:
2528 # If the file itself has been deleted there is nothing
2529 # we can do about it. It is possible that trash has
2530 # been run in parallel in another process or someone
2531 # decided to delete the file. It is unlikely to come
2532 # back and so we should still continue with the removal
2533 # of the entry from the trash table. It is also possible
2534 # we removed it in a previous iteration if it was
2535 # a multi-dataset artifact. The delete artifact method
2536 # will log a debug message in this scenario.
2537 # Distinguishing a file that was missing before the trash
2538 # started from one already removed earlier in this trash run
2539 # is not worth the extra bookkeeping given the potential
2540 # memory cost.
2541 pass
2542 except Exception as e:
2543 if ignore_errors:
2544 # Use a debug message here even though it's not
2545 # a good situation. In some cases this can be
2546 # caused by a race between user A and user B
2547 # and neither of them has permissions for the
2548 # other's files. Butler does not know about users
2549 # and trash has no idea what collections these
2550 # files were in (without guessing from a path).
2551 log.debug(
2552 "Encountered error removing artifact %s from datastore %s: %s",
2553 location.uri,
2554 self.name,
2555 e,
2556 )
2557 else:
2558 raise
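# A minimal sketch of the two-step removal flow, assuming ``datastore`` and
# resolved ``refs_to_remove`` (illustrative name):
#
#     datastore.trash(refs_to_remove)   # mark datasets/artifacts as trashed
#     datastore.emptyTrash()            # delete the artifacts and trash records
#
# trash() accepts a single DatasetRef or an iterable of them.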
2560 @transactional
2561 def transfer_from(
2562 self,
2563 source_datastore: Datastore,
2564 refs: Iterable[DatasetRef],
2565 transfer: str = "auto",
2566 artifact_existence: dict[ResourcePath, bool] | None = None,
2567 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2568 # Docstring inherited
2569 if type(self) is not type(source_datastore):
2570 raise TypeError(
2571 f"Datastore mismatch between this datastore ({type(self)}) and the "
2572 f"source datastore ({type(source_datastore)})."
2573 )
2575 # Be explicit for mypy
2576 if not isinstance(source_datastore, FileDatastore):
2577 raise TypeError(
2578 "Can only transfer to a FileDatastore from another FileDatastore, not"
2579 f" {type(source_datastore)}"
2580 )
2582 # Stop early if "direct" transfer mode is requested. That would
2583 # require that the URI inside the source datastore should be stored
2584 # directly in the target datastore, which seems unlikely to be useful
2585 # since at any moment the source datastore could delete the file.
2586 if transfer in ("direct", "split"):
2587 raise ValueError(
2588 f"Can not transfer from a source datastore using {transfer} mode since"
2589 " those files are controlled by the other datastore."
2590 )
2592 # Empty existence lookup if none given.
2593 if artifact_existence is None:
2594 artifact_existence = {}
2596 # We will go through the list multiple times so must convert
2597 # generators to lists.
2598 refs = list(refs)
2600 # In order to handle disassembled composites the code works
2601 # at the records level since it can assume that internal APIs
2602 # can be used.
2603 # - If the record already exists in the destination this is assumed
2604 # to be okay.
2605 # - If there is no record but the source and destination URIs are
2606 # identical no transfer is done but the record is added.
2607 # - If the source record refers to an absolute URI currently assume
2608 # that that URI should remain absolute and will be visible to the
2609 # destination butler. May need to have a flag to indicate whether
2610 # the dataset should be transferred. This will only happen if
2611 # the detached Butler has had a local ingest.
2613 # What we really want is all the records in the source datastore
2614 # associated with these refs. Or derived ones if they don't exist
2615 # in the source.
2616 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2618 # The source dataset_ids are the keys in these records
2619 source_ids = set(source_records)
2620 log.debug("Number of datastore records found in source: %d", len(source_ids))
2622 requested_ids = {ref.id for ref in refs}
2623 missing_ids = requested_ids - source_ids
2625 # Missing IDs can be okay if that datastore has allowed
2626 # gets based on file existence. Should we transfer what we can
2627 # or complain about it and warn?
2628 if missing_ids and not source_datastore.trustGetRequest:
2629 raise ValueError(
2630 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2631 )
2633 # Need to map these missing IDs to a DatasetRef so we can guess
2634 # the details.
2635 if missing_ids:
2636 log.info(
2637 "Number of expected datasets missing from source datastore records: %d out of %d",
2638 len(missing_ids),
2639 len(requested_ids),
2640 )
2641 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2643 # This should be chunked in case we end up having to check
2644 # the file store since we need some log output to show
2645 # progress.
2646 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2647 records = {}
2648 for missing in missing_ids_chunk:
2649 # Ask the source datastore where the missing artifacts
2650 # should be. An execution butler might not know about the
2651 # artifacts even if they are there.
2652 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2653 records[missing] = [info for _, info in expected]
2655 # Call the mexists helper method in case we have not already
2656 # checked these artifacts such that artifact_existence is
2657 # empty. This allows us to benefit from parallelism.
2658 # datastore.mexists() itself does not give us access to the
2659 # derived datastore record.
2660 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2661 ref_exists = source_datastore._process_mexists_records(
2662 id_to_ref, records, False, artifact_existence=artifact_existence
2663 )
2665 # Now go through the records and propagate the ones that exist.
2666 location_factory = source_datastore.locationFactory
2667 for missing, record_list in records.items():
2668 # Skip completely if the ref does not exist.
2669 ref = id_to_ref[missing]
2670 if not ref_exists[ref]:
2671 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2672 continue
2673 # Check for file artifact to decide which parts of a
2674 # disassembled composite do exist. If there is only a
2675 # single record we don't even need to look because it can't
2676 # be a composite and must exist.
2677 if len(record_list) == 1:
2678 dataset_records = record_list
2679 else:
2680 dataset_records = [
2681 record
2682 for record in record_list
2683 if artifact_existence[record.file_location(location_factory).uri]
2684 ]
2685 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2687 # Rely on source_records being a defaultdict.
2688 source_records[missing].extend(dataset_records)
2690 # See if we already have these records
2691 target_records = self._get_stored_records_associated_with_refs(refs)
2693 # The artifacts to register
2694 artifacts = []
2696 # Refs that already exist
2697 already_present = []
2699 # Refs that were rejected by this datastore.
2700 rejected = set()
2702 # Refs that were transferred successfully.
2703 accepted = set()
2705 # Record each time we have done a "direct" transfer.
2706 direct_transfers = []
2708 # Now can transfer the artifacts
2709 for ref in refs:
2710 if not self.constraints.isAcceptable(ref):
2711 # This datastore should not be accepting this dataset.
2712 rejected.add(ref)
2713 continue
2715 accepted.add(ref)
2717 if ref.id in target_records:
2718 # Already have an artifact for this.
2719 already_present.append(ref)
2720 continue
2722 # mypy needs to know these are always resolved refs
2723 for info in source_records[ref.id]:
2724 source_location = info.file_location(source_datastore.locationFactory)
2725 target_location = info.file_location(self.locationFactory)
2726 if source_location == target_location and not source_location.pathInStore.isabs():
2727 # Artifact is already in the target location.
2728 # (which is how execution butler currently runs)
2729 pass
2730 else:
2731 if target_location.pathInStore.isabs():
2732 # Just because we can see the artifact when running
2733 # the transfer doesn't mean it will be generally
2734 # accessible to a user of this butler. Need to decide
2735 # what to do about an absolute path.
2736 if transfer == "auto":
2737 # For "auto" transfers we allow the absolute URI
2738 # to be recorded in the target datastore.
2739 direct_transfers.append(source_location)
2740 else:
2741 # The user is explicitly requesting a transfer
2742 # even for an absolute URI. This requires us to
2743 # calculate the target path.
2744 template_ref = ref
2745 if info.component:
2746 template_ref = ref.makeComponentRef(info.component)
2747 target_location = self._calculate_ingested_datastore_name(
2748 source_location.uri,
2749 template_ref,
2750 )
2752 info = info.update(path=target_location.pathInStore.path)
2754 # Need to transfer it to the new location.
2755 # Assume we should always overwrite. If the artifact
2756 # is there this might indicate that a previous transfer
2757 # was interrupted but was not able to be rolled back
2758 # completely (eg pre-emption) so follow Datastore default
2759 # and overwrite.
2760 target_location.uri.transfer_from(
2761 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2762 )
2764 artifacts.append((ref, info))
2766 if direct_transfers:
2767 log.info(
2768 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2769 len(direct_transfers),
2770 "" if len(direct_transfers) == 1 else "s",
2771 )
2773 # We are overwriting previous datasets that may have already
2774 # existed. We therefore should ensure that we force the
2775 # datastore records to agree. Note that this can potentially lead
2776 # to difficulties if the dataset has previously been ingested
2777 # disassembled and is somehow now assembled, or vice versa.
2778 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE)
2780 if already_present:
2781 n_skipped = len(already_present)
2782 log.info(
2783 "Skipped transfer of %d dataset%s already present in datastore",
2784 n_skipped,
2785 "" if n_skipped == 1 else "s",
2786 )
2788 return accepted, rejected
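# A minimal usage sketch, assuming two configured FileDatastore instances
# ``source_datastore`` and ``target_datastore`` that share the same resolved
# ``refs`` (illustrative names):
#
#     accepted, rejected = target_datastore.transfer_from(
#         source_datastore, refs, transfer="copy"
#     )
#
# "direct" and "split" modes are rejected, and refs failing this datastore's
# constraints end up in ``rejected``.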
2790 @transactional
2791 def forget(self, refs: Iterable[DatasetRef]) -> None:
2792 # Docstring inherited.
2793 refs = list(refs)
2794 self.bridge.forget(refs)
2795 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2797 def validateConfiguration(
2798 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2799 ) -> None:
2800 """Validate some of the configuration for this datastore.
2802 Parameters
2803 ----------
2804 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2805 Entities to test against this configuration. Can be differing
2806 types.
2807 logFailures : `bool`, optional
2808 If `True`, output a log message for every validation error
2809 detected.
2811 Raises
2812 ------
2813 DatastoreValidationError
2814 Raised if there is a validation problem with a configuration.
2815 All the problems are reported in a single exception.
2817 Notes
2818 -----
2819 This method checks that all the supplied entities have valid file
2820 templates and also have formatters defined.
2821 """
2822 templateFailed = None
2823 try:
2824 self.templates.validateTemplates(entities, logFailures=logFailures)
2825 except FileTemplateValidationError as e:
2826 templateFailed = str(e)
2828 formatterFailed = []
2829 for entity in entities:
2830 try:
2831 self.formatterFactory.getFormatterClass(entity)
2832 except KeyError as e:
2833 formatterFailed.append(str(e))
2834 if logFailures:
2835 log.critical("Formatter failure: %s", e)
2837 if templateFailed or formatterFailed:
2838 messages = []
2839 if templateFailed:
2840 messages.append(templateFailed)
2841 if formatterFailed:
2842 messages.append(",".join(formatterFailed))
2843 msg = ";\n".join(messages)
2844 raise DatastoreValidationError(msg)
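# A minimal usage sketch, assuming ``datastore`` and an iterable ``entities``
# of DatasetType or StorageClass instances (illustrative names):
#
#     try:
#         datastore.validateConfiguration(entities, logFailures=True)
#     except DatastoreValidationError as e:
#         log.error("Datastore configuration problems: %s", e)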
2846 def getLookupKeys(self) -> set[LookupKey]:
2847 # Docstring is inherited from base class
2848 return (
2849 self.templates.getLookupKeys()
2850 | self.formatterFactory.getLookupKeys()
2851 | self.constraints.getLookupKeys()
2852 )
2854 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2855 # Docstring is inherited from base class
2856 # The key can be valid in either formatters or templates so we can
2857 # only check the template if it exists
2858 if lookupKey in self.templates:
2859 try:
2860 self.templates[lookupKey].validateTemplate(entity)
2861 except FileTemplateValidationError as e:
2862 raise DatastoreValidationError(e) from e
2864 def export(
2865 self,
2866 refs: Iterable[DatasetRef],
2867 *,
2868 directory: ResourcePathExpression | None = None,
2869 transfer: str | None = "auto",
2870 ) -> Iterable[FileDataset]:
2871 # Docstring inherited from Datastore.export.
2872 if transfer == "auto" and directory is None:
2873 transfer = None
2875 if transfer is not None and directory is None:
2876 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2878 if transfer == "move":
2879 raise TypeError("Can not export by moving files out of datastore.")
2880 elif transfer == "direct":
2881 # For an export, treat this as equivalent to None. We do not
2882 # want an import to risk using absolute URIs to datasets owned
2883 # by another datastore.
2884 log.info("Treating 'direct' transfer mode as in-place export.")
2885 transfer = None
2887 # Force the directory to be a URI object
2888 directoryUri: ResourcePath | None = None
2889 if directory is not None:
2890 directoryUri = ResourcePath(directory, forceDirectory=True)
2892 if transfer is not None and directoryUri is not None and not directoryUri.exists():
2893 # mypy needs the second test
2894 raise FileNotFoundError(f"Export location {directory} does not exist")
2896 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2897 for ref in progress.wrap(refs, "Exporting dataset files"):
2898 fileLocations = self._get_dataset_locations_info(ref)
2899 if not fileLocations:
2900 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2901 # For now we can not export disassembled datasets
2902 if len(fileLocations) > 1:
2903 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2904 location, storedFileInfo = fileLocations[0]
2906 pathInStore = location.pathInStore.path
2907 if transfer is None:
2908 # TODO: do we also need to return the readStorageClass somehow?
2909 # We will use the path in store directly. If this is an
2910 # absolute URI, preserve it.
2911 if location.pathInStore.isabs():
2912 pathInStore = str(location.uri)
2913 elif transfer == "direct":
2914 # Use full URIs to the remote store in the export
2915 pathInStore = str(location.uri)
2916 else:
2917 # mypy needs help
2918 assert directoryUri is not None, "directoryUri must be defined to get here"
2919 storeUri = ResourcePath(location.uri)
2921 # if the datastore has an absolute URI to a resource, we
2922 # have two options:
2923 # 1. Keep the absolute URI in the exported YAML
2924 # 2. Allocate a new name in the local datastore and transfer
2925 # it.
2926 # For now go with option 2
2927 if location.pathInStore.isabs():
2928 template = self.templates.getTemplate(ref)
2929 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2930 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2932 exportUri = directoryUri.join(pathInStore)
2933 exportUri.transfer_from(storeUri, transfer=transfer)
2935 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
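# A minimal usage sketch, assuming ``datastore`` and resolved ``refs`` as
# before; the export directory is illustrative and must already exist:
#
#     for file_dataset in datastore.export(refs, directory="export_dir/", transfer="copy"):
#         print(file_dataset.path, [r.id for r in file_dataset.refs])
#
# Disassembled datasets are not currently exportable and raise
# NotImplementedError.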
2937 @staticmethod
2938 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2939 """Compute the checksum of the supplied file.
2941 Parameters
2942 ----------
2943 uri : `lsst.resources.ResourcePath`
2944 Name of resource to calculate checksum from.
2945 algorithm : `str`, optional
2946 Name of algorithm to use. Must be one of the algorithms supported
2947 by the :py:mod:`hashlib` module.
2948 block_size : `int`
2949 Number of bytes to read from file at one time.
2951 Returns
2952 -------
2953 hexdigest : `str`
2954 Hex digest of the file.
2956 Notes
2957 -----
2958 Currently returns `None` if the URI is for a remote resource.
2959 """
2960 if algorithm not in hashlib.algorithms_guaranteed:
2961 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2963 if not uri.isLocal:
2964 return None
2966 hasher = hashlib.new(algorithm)
2968 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f:
2969 for chunk in iter(lambda: f.read(block_size), b""):
2970 hasher.update(chunk)
2972 return hasher.hexdigest()
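# A minimal usage sketch; the file path below is illustrative:
#
#     digest = FileDatastore.computeChecksum(
#         ResourcePath("/path/to/local_file.dat"), algorithm="sha256"
#     )
#
# A remote (non-local) URI returns `None` instead of a digest.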
2974 def needs_expanded_data_ids(
2975 self,
2976 transfer: str | None,
2977 entity: DatasetRef | DatasetType | StorageClass | None = None,
2978 ) -> bool:
2979 # Docstring inherited.
2980 # This _could_ also use entity to inspect whether the filename template
2981 # involves placeholders other than the required dimensions for its
2982 # dataset type, but that's not necessary for correctness; it just
2983 # enables more optimizations (perhaps only in theory).
2984 return transfer not in ("direct", None)
2986 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2987 # Docstring inherited from the base class.
2988 record_data = data.get(self.name)
2989 if not record_data:
2990 return
2992 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records)
2994 # TODO: Verify that there are no unexpected table names in the dict?
2995 unpacked_records = []
2996 for dataset_data in record_data.records.values():
2997 records = dataset_data.get(self._table.name)
2998 if records:
2999 for info in records:
3000 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
3001 unpacked_records.append(info.to_record())
3002 if unpacked_records:
3003 self._table.insert(*unpacked_records, transaction=self._transaction)
3005 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
3006 # Docstring inherited from the base class.
3007 exported_refs = list(self._bridge.check(refs))
3008 ids = {ref.id for ref in exported_refs}
3009 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
3010 for row in self._table.fetch(dataset_id=ids):
3011 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
3012 dataset_records = records.setdefault(info.dataset_id, {})
3013 dataset_records.setdefault(self._table.name, []).append(info)
3015 record_data = DatastoreRecordData(records=records)
3016 return {self.name: record_data}
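# A minimal sketch of a datastore-record round trip, assuming two datastores
# with the same configured name (import_records() only consumes records keyed
# by the receiving datastore's own name):
#
#     record_data = source_datastore.export_records(refs)
#     target_datastore.import_records(record_data)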
3018 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
3019 # Docstring inherited from the base class.
3020 self._retrieve_dataset_method = method
3022 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
3023 """Update dataset reference to use the storage class from registry."""
3024 if self._retrieve_dataset_method is None:
3025 # We could raise an exception here but unit tests do not define
3026 # this method.
3027 return ref
3028 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
3029 if dataset_type is not None:
3030 ref = ref.overrideStorageClass(dataset_type.storageClass)
3031 return ref