Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%
929 statements
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Generic file-based datastore code."""
30from __future__ import annotations
32__all__ = ("FileDatastore",)
34import contextlib
35import hashlib
36import logging
37from collections import defaultdict
38from collections.abc import Callable, Collection, Iterable, Mapping, Sequence
39from typing import TYPE_CHECKING, Any, ClassVar, cast
41from lsst.daf.butler import (
42 Config,
43 DatasetDatastoreRecords,
44 DatasetId,
45 DatasetRef,
46 DatasetType,
47 DatasetTypeNotSupportedError,
48 FileDataset,
49 FileDescriptor,
50 Formatter,
51 FormatterFactory,
52 Location,
53 LocationFactory,
54 Progress,
55 StorageClass,
56 ddl,
57)
58from lsst.daf.butler.datastore import (
59 DatasetRefURIs,
60 Datastore,
61 DatastoreConfig,
62 DatastoreOpaqueTable,
63 DatastoreValidationError,
64)
65from lsst.daf.butler.datastore.cache_manager import (
66 AbstractDatastoreCacheManager,
67 DatastoreCacheManager,
68 DatastoreDisabledCacheManager,
69)
70from lsst.daf.butler.datastore.composites import CompositesMap
71from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError
72from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore
73from lsst.daf.butler.datastore.record_data import DatastoreRecordData
74from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo
75from lsst.daf.butler.datastores.file_datastore.get import (
76 DatasetLocationInformation,
77 DatastoreFileGetInformation,
78 generate_datastore_get_information,
79 get_dataset_as_python_object_from_get_info,
80)
81from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import (
82 determine_destination_for_retrieved_artifact,
83)
84from lsst.daf.butler.datastores.fileDatastoreClient import (
85 FileDatastoreGetPayload,
86 FileDatastoreGetPayloadFileInfo,
87)
88from lsst.daf.butler.registry.interfaces import (
89 DatabaseInsertMode,
90 DatastoreRegistryBridge,
91 FakeDatasetRef,
92 ReadOnlyDatabaseError,
93)
94from lsst.daf.butler.repo_relocation import replaceRoot
95from lsst.daf.butler.utils import transactional
96from lsst.resources import ResourcePath, ResourcePathExpression
97from lsst.utils.introspection import get_class_of
98from lsst.utils.iteration import chunk_iterable
100# For VERBOSE logging usage.
101from lsst.utils.logging import VERBOSE, getLogger
102from sqlalchemy import BigInteger, String
104if TYPE_CHECKING:
105 from lsst.daf.butler import LookupKey
106 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
108log = getLogger(__name__)
111class _IngestPrepData(Datastore.IngestPrepData):
112 """Helper class for FileDatastore ingest implementation.
114 Parameters
115 ----------
116 datasets : `~collections.abc.Iterable` of `FileDataset`
117 Files to be ingested by this datastore.
118 """
120 def __init__(self, datasets: Iterable[FileDataset]):
121 super().__init__(ref for dataset in datasets for ref in dataset.refs)
122 self.datasets = datasets
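# Editorial aside, a minimal sketch (stand-in dataclass, not the real FileDataset
# or DatasetRef types): the ``super().__init__`` call above flattens every ref of
# every ingested FileDataset into a single iterable for the base class.
from dataclasses import dataclass, field

@dataclass
class _FileDatasetSketch:
    refs: list = field(default_factory=list)

_datasets = [_FileDatasetSketch(refs=["r1", "r2"]), _FileDatasetSketch(refs=["r3"])]
_flattened = [ref for dataset in _datasets for ref in dataset.refs]
assert _flattened == ["r1", "r2", "r3"]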
125class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
126 """Generic Datastore for file-based implementations.
128 Should always be sub-classed since key abstract methods are missing.
130 Parameters
131 ----------
132 config : `DatastoreConfig` or `str`
133 Configuration as either a `Config` object or URI to file.
134 bridgeManager : `DatastoreRegistryBridgeManager`
135 Object that manages the interface between `Registry` and datastores.
136 root : `ResourcePath`
137 Root directory URI of this `Datastore`.
138 formatterFactory : `FormatterFactory`
139 Factory for creating instances of formatters.
140 templates : `FileTemplates`
141 File templates that can be used by this `Datastore`.
142 composites : `CompositesMap`
143 Determines whether a dataset should be disassembled on put.
144 trustGetRequest : `bool`
145 Determine whether we can fall back to configuration if a requested
146 dataset is not known to registry.
148 Raises
149 ------
150 ValueError
151 If root location does not exist and ``create`` is `False` in the
152 configuration.
153 """
155 defaultConfigFile: ClassVar[str | None] = None
156 """Path to configuration defaults. Accessed within the ``config`` resource
157 or relative to a search path. Can be None if no defaults specified.
158 """
160 root: ResourcePath
161 """Root directory URI of this `Datastore`."""
163 locationFactory: LocationFactory
164 """Factory for creating locations relative to the datastore root."""
166 formatterFactory: FormatterFactory
167 """Factory for creating instances of formatters."""
169 templates: FileTemplates
170 """File templates that can be used by this `Datastore`."""
172 composites: CompositesMap
173 """Determines whether a dataset should be disassembled on put."""
175 defaultConfigFile = "datastores/fileDatastore.yaml"
176 """Path to configuration defaults. Accessed within the ``config`` resource
177 or relative to a search path. Can be None if no defaults specified.
178 """
180 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
181 """Callable that is used in trusted mode to retrieve registry definition
182 of a named dataset type.
183 """
185 @classmethod
186 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
187 """Set any filesystem-dependent config options for this Datastore to
188 be appropriate for a new empty repository with the given root.
190 Parameters
191 ----------
192 root : `str`
193 URI to the root of the data repository.
194 config : `Config`
195 A `Config` to update. Only the subset understood by
196 this component will be updated. Will not expand
197 defaults.
198 full : `Config`
199 A complete config with all defaults expanded that can be
200 converted to a `DatastoreConfig`. Read-only and will not be
201 modified by this method.
202 Repository-specific options that should not be obtained
203 from defaults when Butler instances are constructed
204 should be copied from ``full`` to ``config``.
205 overwrite : `bool`, optional
206 If `False`, do not modify a value in ``config`` if the value
207 already exists. Default is always to overwrite with the provided
208 ``root``.
210 Notes
211 -----
212 If a keyword is explicitly defined in the supplied ``config`` it
213 will not be overridden by this method if ``overwrite`` is `False`.
214 This allows explicit values set in external configs to be retained.
215 """
216 Config.updateParameters(
217 DatastoreConfig,
218 config,
219 full,
220 toUpdate={"root": root},
221 toCopy=("cls", ("records", "table")),
222 overwrite=overwrite,
223 )
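# Editorial aside, a heavily simplified dict-based sketch of the intent behind
# ``Config.updateParameters`` (hypothetical helper handling only flat keys; the
# real Config API also supports nested keys such as ("records", "table")):
def _update_parameters_sketch(
    config: dict, full: dict, to_update: dict, to_copy: tuple, overwrite: bool
) -> None:
    for key, value in to_update.items():
        if overwrite or key not in config:
            config[key] = value  # e.g. always point "root" at the new repository
    for key in to_copy:
        if key in full and (overwrite or key not in config):
            config[key] = full[key]  # copy repository-specific values from the expanded config

_cfg: dict = {}
_update_parameters_sketch(
    _cfg, full={"cls": "FileDatastore"}, to_update={"root": "/repo"}, to_copy=("cls",), overwrite=True
)
assert _cfg == {"root": "/repo", "cls": "FileDatastore"}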
225 @classmethod
226 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
227 return ddl.TableSpec(
228 fields=[
229 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
230 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
231 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
232 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
233 # Use empty string to indicate no component
234 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
235 # TODO: should checksum be Base64Bytes instead?
236 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
237 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
238 ],
239 unique=frozenset(),
240 indexes=[ddl.IndexSpec("path")],
241 )
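# Editorial aside: a hedged illustration of one row of the opaque records table
# described above (all values are made up; the formatter name is hypothetical).
example_record = {
    "dataset_id": "00000000-0000-0000-0000-000000000000",  # primary key (together with "component")
    "path": "some_run/example_dataset.fits",                # relative to the datastore root
    "formatter": "lsst.example.ExampleFormatter",           # hypothetical formatter class
    "storage_class": "ExampleStorageClass",
    "component": "",                                        # empty string means "no component"
    "checksum": None,                                       # only filled in when checksums are enabled
    "file_size": 123456,
}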
243 def __init__(
244 self,
245 config: DatastoreConfig,
246 bridgeManager: DatastoreRegistryBridgeManager,
247 root: ResourcePath,
248 formatterFactory: FormatterFactory,
249 templates: FileTemplates,
250 composites: CompositesMap,
251 trustGetRequest: bool,
252 ):
253 super().__init__(config, bridgeManager)
254 self.root = ResourcePath(root)
255 self.formatterFactory = formatterFactory
256 self.templates = templates
257 self.composites = composites
258 self.trustGetRequest = trustGetRequest
260 # Name ourselves either using an explicit name or a name
261 # derived from the (unexpanded) root
262 if "name" in self.config:
263 self.name = self.config["name"]
264 else:
265 # We use the unexpanded root in the name to indicate that this
266 # datastore can be moved without having to update registry.
267 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
269 self.locationFactory = LocationFactory(self.root)
271 self._opaque_table_name = self.config["records", "table"]
272 try:
273 # Storage of paths and formatters, keyed by dataset_id
274 self._table = bridgeManager.opaque.register(
275 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType)
276 )
277 # Interface to Registry.
278 self._bridge = bridgeManager.register(self.name)
279 except ReadOnlyDatabaseError:
280 # If the database is read only and we just tried and failed to
281 # create a table, it means someone is trying to create a read-only
282 # butler client for an empty repo. That should be okay, as long
283 # as they then try to get any datasets before some other client
284 # creates the table. Chances are they're just validating
285 # configuration.
286 pass
288 # Determine whether checksums should be used - default to False
289 self.useChecksum = self.config.get("checksum", False)
291 # Create a cache manager
292 self.cacheManager: AbstractDatastoreCacheManager
293 if "cached" in self.config:
294 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
295 else:
296 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
298 @classmethod
299 def _create_from_config(
300 cls,
301 config: DatastoreConfig,
302 bridgeManager: DatastoreRegistryBridgeManager,
303 butlerRoot: ResourcePathExpression | None,
304 ) -> FileDatastore:
305 if "root" not in config:
306 raise ValueError("No root directory specified in configuration")
308 # Support repository relocation in config
309 # Existence of self.root is checked in subclass
310 root = ResourcePath(replaceRoot(config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True)
312 # Now associate formatters with storage classes
313 formatterFactory = FormatterFactory()
314 formatterFactory.registerFormatters(config["formatters"], universe=bridgeManager.universe)
316 # Read the file naming templates
317 templates = FileTemplates(config["templates"], universe=bridgeManager.universe)
319 # See if composites should be disassembled
320 composites = CompositesMap(config["composites"], universe=bridgeManager.universe)
322 # Determine whether we can fall back to configuration if a
323 # requested dataset is not known to registry
324 trustGetRequest = config.get("trust_get_request", False)
326 self = FileDatastore(
327 config, bridgeManager, root, formatterFactory, templates, composites, trustGetRequest
328 )
330 # Check existence and create directory structure if necessary
331 if not self.root.exists():
332 if "create" not in self.config or not self.config["create"]:
333 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
334 try:
335 self.root.mkdir()
336 except Exception as e:
337 raise ValueError(
338 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
339 ) from e
341 return self
343 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore:
344 return FileDatastore(
345 self.config,
346 bridgeManager,
347 self.root,
348 self.formatterFactory,
349 self.templates,
350 self.composites,
351 self.trustGetRequest,
352 )
354 def __str__(self) -> str:
355 return str(self.root)
357 @property
358 def bridge(self) -> DatastoreRegistryBridge:
359 return self._bridge
361 @property
362 def roots(self) -> dict[str, ResourcePath | None]:
363 # Docstring inherited.
364 return {self.name: self.root}
366 def _set_trust_mode(self, mode: bool) -> None:
367 self.trustGetRequest = mode
369 def _artifact_exists(self, location: Location) -> bool:
370 """Check that an artifact exists in this datastore at the specified
371 location.
373 Parameters
374 ----------
375 location : `Location`
376 Expected location of the artifact associated with this datastore.
378 Returns
379 -------
380 exists : `bool`
381 `True` if the location can be found, `False` otherwise.
382 """
383 log.debug("Checking if resource exists: %s", location.uri)
384 return location.uri.exists()
386 def _delete_artifact(self, location: Location) -> None:
387 """Delete the artifact from the datastore.
389 Parameters
390 ----------
391 location : `Location`
392 Location of the artifact associated with this datastore.
393 """
394 if location.pathInStore.isabs():
395 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
397 try:
398 location.uri.remove()
399 except FileNotFoundError:
400 log.debug("File %s did not exist and so could not be deleted.", location.uri)
401 raise
402 except Exception as e:
403 log.critical("Failed to delete file: %s (%s)", location.uri, e)
404 raise
405 log.debug("Successfully deleted file: %s", location.uri)
407 def addStoredItemInfo(
408 self,
409 refs: Iterable[DatasetRef],
410 infos: Iterable[StoredFileInfo],
411 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
412 ) -> None:
413 """Record internal storage information associated with one or more
414 datasets.
416 Parameters
417 ----------
418 refs : sequence of `DatasetRef`
419 The datasets that have been stored.
420 infos : sequence of `StoredDatastoreItemInfo`
421 Metadata associated with the stored datasets.
422 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`
423 Mode to use to insert the new records into the table. The
424 options are ``INSERT`` (error if pre-existing), ``REPLACE``
425 (replace content with new values), and ``ENSURE`` (skip if the row
426 already exists).
427 """
428 records = [
429 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True)
430 ]
431 match insert_mode:
432 case DatabaseInsertMode.INSERT:
433 self._table.insert(*records, transaction=self._transaction)
434 case DatabaseInsertMode.ENSURE:
435 self._table.ensure(*records, transaction=self._transaction)
436 case DatabaseInsertMode.REPLACE:
437 self._table.replace(*records, transaction=self._transaction)
438 case _:
439 raise ValueError(f"Unknown insert mode of '{insert_mode}'")
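# Editorial aside, a minimal sketch of the insert-mode dispatch above using a
# stand-in enum and fake record handling (not the real DatabaseInsertMode or
# opaque-table interface):
from enum import Enum, auto

class _InsertModeSketch(Enum):
    INSERT = auto()   # error if the row already exists
    ENSURE = auto()   # silently skip rows that already exist
    REPLACE = auto()  # overwrite existing rows with the new values

def _dispatch_sketch(mode: _InsertModeSketch, records: list[dict]) -> str:
    match mode:
        case _InsertModeSketch.INSERT:
            return f"insert {len(records)} record(s)"
        case _InsertModeSketch.ENSURE:
            return f"ensure {len(records)} record(s)"
        case _InsertModeSketch.REPLACE:
            return f"replace {len(records)} record(s)"
        case _:
            raise ValueError(f"Unknown insert mode of '{mode}'")

assert _dispatch_sketch(_InsertModeSketch.ENSURE, [{}]) == "ensure 1 record(s)"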
441 def getStoredItemsInfo(
442 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
443 ) -> list[StoredFileInfo]:
444 """Retrieve information associated with files stored in this
445 `Datastore` associated with this dataset ref.
447 Parameters
448 ----------
449 ref : `DatasetRef`
450 The dataset that is to be queried.
451 ignore_datastore_records : `bool`
452 If `True` then do not use datastore records stored in refs.
454 Returns
455 -------
456 items : `~collections.abc.Iterable` [`StoredDatastoreItemInfo`]
457 Stored information about the files and formatters associated
458 with this dataset. Only one file will be returned
459 if the dataset has not been disassembled. Can return an empty
460 list if no matching datasets can be found.
461 """
462 # Try to get them from the ref first.
463 if ref._datastore_records is not None and not ignore_datastore_records:
464 ref_records = ref._datastore_records.get(self._table.name, [])
465 # Need to make sure they have correct type.
466 for record in ref_records:
467 if not isinstance(record, StoredFileInfo):
468 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}")
469 return cast(list[StoredFileInfo], ref_records)
471 # Look for the dataset_id -- there might be multiple matches
472 # if we have disassembled the dataset.
473 records = self._table.fetch(dataset_id=ref.id)
474 return [StoredFileInfo.from_record(record) for record in records]
476 def _register_datasets(
477 self,
478 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]],
479 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
480 ) -> None:
481 """Update registry to indicate that one or more datasets have been
482 stored.
484 Parameters
485 ----------
486 refsAndInfos : sequence of `tuple` [`DatasetRef`,
487 `StoredDatastoreItemInfo`]
488 Datasets to register and the internal datastore metadata associated
489 with them.
490 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`, optional
491 Indicate whether the new records should be new ("insert", default),
492 allowed to exist already ("ensure"), or replaced if already present
493 ("replace").
494 """
495 expandedRefs: list[DatasetRef] = []
496 expandedItemInfos: list[StoredFileInfo] = []
498 for ref, itemInfo in refsAndInfos:
499 expandedRefs.append(ref)
500 expandedItemInfos.append(itemInfo)
502 # Dataset location only cares about registry ID so if we have
503 # disassembled in datastore we have to deduplicate. Since they
504 # will have different datasetTypes we can't use a set
505 registryRefs = {r.id: r for r in expandedRefs}
506 if insert_mode == DatabaseInsertMode.INSERT:
507 self.bridge.insert(registryRefs.values())
508 else:
509 # There are only two columns and all that matters is the
510 # dataset ID.
511 self.bridge.ensure(registryRefs.values())
512 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode)
514 def _get_stored_records_associated_with_refs(
515 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False
516 ) -> dict[DatasetId, list[StoredFileInfo]]:
517 """Retrieve all records associated with the provided refs.
519 Parameters
520 ----------
521 refs : iterable of `DatasetIdRef`
522 The refs for which records are to be retrieved.
523 ignore_datastore_records : `bool`
524 If `True` then do not use datastore records stored in refs.
526 Returns
527 -------
528 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
529 The matching records indexed by the ref ID. The number of entries
530 in the dict can be smaller than the number of requested refs.
531 """
532 # Check datastore records in refs first.
533 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list)
534 refs_with_no_records = []
535 for ref in refs:
536 if ignore_datastore_records or ref._datastore_records is None:
537 refs_with_no_records.append(ref)
538 else:
539 if (ref_records := ref._datastore_records.get(self._table.name)) is not None:
540 # Need to make sure they have correct type.
541 for ref_record in ref_records:
542 if not isinstance(ref_record, StoredFileInfo):
543 raise TypeError(
544 f"Datastore record has unexpected type {ref_record.__class__.__name__}"
545 )
546 records_by_ref[ref.id].append(ref_record)
548 # If there were any refs without datastore records, check opaque table.
549 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records])
551 # Uniqueness is dataset_id + component so can have multiple records
552 # per ref.
553 for record in records:
554 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
555 return records_by_ref
557 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
558 """Return paths and associated dataset refs.
560 Parameters
561 ----------
562 paths : `list` of `str` or `lsst.resources.ResourcePath`
563 All the paths to include in search.
565 Returns
566 -------
567 mapping : `dict` of [`str`, `set` [`DatasetId`]]
568 Mapping of each path to a set of associated database IDs.
569 """
570 records = self._table.fetch(path=[str(path) for path in paths])
571 result = defaultdict(set)
572 for row in records:
573 result[row["path"]].add(row["dataset_id"])
574 return result
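# Editorial aside, a stdlib-only sketch of the grouping performed above: rows
# fetched from the records table are folded into a mapping of path -> set of
# dataset IDs (the rows and IDs here are made up).
from collections import defaultdict

_rows = [
    {"path": "a/b.fits", "dataset_id": "id-1"},
    {"path": "a/b.fits", "dataset_id": "id-2"},
    {"path": "c/d.fits", "dataset_id": "id-3"},
]
_by_path: dict[str, set[str]] = defaultdict(set)
for _row in _rows:
    _by_path[_row["path"]].add(_row["dataset_id"])
assert _by_path["a/b.fits"] == {"id-1", "id-2"}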
576 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
577 """Return all dataset refs associated with the supplied path.
579 Parameters
580 ----------
581 pathInStore : `lsst.resources.ResourcePath`
582 Path of interest in the data store.
584 Returns
585 -------
586 ids : `set` of `int`
587 All `DatasetRef` IDs associated with this path.
588 """
589 records = list(self._table.fetch(path=str(pathInStore)))
590 ids = {r["dataset_id"] for r in records}
591 return ids
593 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
594 """Remove information about the file associated with this dataset.
596 Parameters
597 ----------
598 ref : `DatasetRef`
599 The dataset that has been removed.
600 """
601 # Note that this method is actually not used by this implementation,
602 # we depend on bridge to delete opaque records. But there are some
603 # tests that check that this method works, so we keep it for now.
604 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
606 def _get_dataset_locations_info(
607 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
608 ) -> list[DatasetLocationInformation]:
609 r"""Find all the `Location`\ s of the requested dataset in the
610 `Datastore` and the associated stored file information.
612 Parameters
613 ----------
614 ref : `DatasetRef`
615 Reference to the required `Dataset`.
616 ignore_datastore_records : `bool`
617 If `True` then do not use datastore records stored in refs.
619 Returns
620 -------
621 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
622 Location of the dataset within the datastore and
623 stored information about each file and its formatter.
624 """
625 # Get the file information (this will fail if no file)
626 records = self.getStoredItemsInfo(ref, ignore_datastore_records)
628 # Use the path to determine the location -- we need to take
629 # into account absolute URIs in the datastore record
630 return [(r.file_location(self.locationFactory), r) for r in records]
632 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
633 """Check that there is only one dataset associated with the
634 specified artifact.
636 Parameters
637 ----------
638 ref : `DatasetRef` or `FakeDatasetRef`
639 Dataset to be removed.
640 location : `Location`
641 The location of the artifact to be removed.
643 Returns
644 -------
645 can_remove : `bool`
646 `True` if the artifact can be safely removed.
647 """
648 # Can't ever delete absolute URIs.
649 if location.pathInStore.isabs():
650 return False
652 # Get all entries associated with this path
653 allRefs = self._registered_refs_per_artifact(location.pathInStore)
654 if not allRefs:
655 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
657 # Remove these refs from all the refs and if there is nothing left
658 # then we can delete
659 remainingRefs = allRefs - {ref.id}
661 if remainingRefs:
662 return False
663 return True
665 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
666 """Predict the location and related file information of the requested
667 dataset in this datastore.
669 Parameters
670 ----------
671 ref : `DatasetRef`
672 Reference to the required `Dataset`.
674 Returns
675 -------
676 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
677 Expected Location of the dataset within the datastore and
678 placeholder information about each file and its formatter.
680 Notes
681 -----
682 Uses the current configuration to determine how we would expect the
683 datastore files to have been written if we couldn't ask registry.
684 This is safe so long as there has been no change to datastore
685 configuration between writing the dataset and wanting to read it.
686 Will not work for files that have been ingested without using the
687 standard file template or default formatter.
688 """
689 # If we have a component ref we always need to ask the questions
690 # of the composite. If the composite is disassembled this routine
691 # should return all components. If the composite was not
692 # disassembled the composite is what is stored regardless of
693 # component request. Note that if the caller has disassembled
694 # a composite there is no way for this guess to know that
695 # without trying both the composite and component ref and seeing
696 # if there is something at the component Location even without
697 # disassembly being enabled.
698 if ref.datasetType.isComponent():
699 ref = ref.makeCompositeRef()
701 # See if the ref is a composite that should be disassembled
702 doDisassembly = self.composites.shouldBeDisassembled(ref)
704 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
706 if doDisassembly:
707 for component, componentStorage in ref.datasetType.storageClass.components.items():
708 compRef = ref.makeComponentRef(component)
709 location, formatter = self._determine_put_formatter_location(compRef)
710 all_info.append((location, formatter, componentStorage, component))
712 else:
713 # Always use the composite ref if no disassembly
714 location, formatter = self._determine_put_formatter_location(ref)
715 all_info.append((location, formatter, ref.datasetType.storageClass, None))
717 # Convert the list of tuples to have StoredFileInfo as second element
718 return [
719 (
720 location,
721 StoredFileInfo(
722 formatter=formatter,
723 path=location.pathInStore.path,
724 storageClass=storageClass,
725 component=component,
726 checksum=None,
727 file_size=-1,
728 ),
729 )
730 for location, formatter, storageClass, component in all_info
731 ]
733 def _prepare_for_direct_get(
734 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
735 ) -> list[DatastoreFileGetInformation]:
736 """Check parameters for ``get`` and obtain formatter and
737 location.
739 Parameters
740 ----------
741 ref : `DatasetRef`
742 Reference to the required Dataset.
743 parameters : `dict`
744 `StorageClass`-specific parameters that specify, for example,
745 a slice of the dataset to be loaded.
747 Returns
748 -------
749 getInfo : `list` [`DatastoreFileGetInformation`]
750 Parameters needed to retrieve each file.
751 """
752 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
754 # The storage class we want to use eventually
755 refStorageClass = ref.datasetType.storageClass
757 # For trusted mode need to reset storage class.
758 ref = self._cast_storage_class(ref)
760 # Get file metadata and internal metadata
761 fileLocations = self._get_dataset_locations_info(ref)
762 if not fileLocations:
763 if not self.trustGetRequest:
764 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
765 # Assume the dataset is where we think it should be
766 fileLocations = self._get_expected_dataset_locations_info(ref)
768 if len(fileLocations) > 1:
769 # If trust is involved it is possible that there will be
770 # components listed here that do not exist in the datastore.
771 # Explicitly check for file artifact existence and filter out any
772 # that are missing.
773 if self.trustGetRequest:
774 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
776 # For now complain only if we have no components at all. One
777 # component is probably a problem but we can punt that to the
778 # assembler.
779 if not fileLocations:
780 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
782 return generate_datastore_get_information(
783 fileLocations,
784 readStorageClass=refStorageClass,
785 ref=ref,
786 parameters=parameters,
787 )
789 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
790 """Check the arguments for ``put`` and obtain formatter and
791 location.
793 Parameters
794 ----------
795 inMemoryDataset : `object`
796 The dataset to store.
797 ref : `DatasetRef`
798 Reference to the associated Dataset.
800 Returns
801 -------
802 location : `Location`
803 The location to write the dataset.
804 formatter : `Formatter`
805 The `Formatter` to use to write the dataset.
807 Raises
808 ------
809 TypeError
810 Supplied object and storage class are inconsistent.
811 DatasetTypeNotSupportedError
812 The associated `DatasetType` is not handled by this datastore.
813 """
814 self._validate_put_parameters(inMemoryDataset, ref)
815 return self._determine_put_formatter_location(ref)
817 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
818 """Calculate the formatter and output location to use for put.
820 Parameters
821 ----------
822 ref : `DatasetRef`
823 Reference to the associated Dataset.
825 Returns
826 -------
827 location : `Location`
828 The location to write the dataset.
829 formatter : `Formatter`
830 The `Formatter` to use to write the dataset.
831 """
832 # Work out output file name
833 try:
834 template = self.templates.getTemplate(ref)
835 except KeyError as e:
836 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
838 # Validate the template to protect against filenames from different
839 # dataIds returning the same and causing overwrite confusion.
840 template.validateTemplate(ref)
842 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True)
844 # Get the formatter based on the storage class
845 storageClass = ref.datasetType.storageClass
846 try:
847 formatter = self.formatterFactory.getFormatter(
848 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
849 )
850 except KeyError as e:
851 raise DatasetTypeNotSupportedError(
852 f"Unable to find formatter for {ref} in datastore {self.name}"
853 ) from e
855 # Now that we know the formatter, update the location
856 location = formatter.makeUpdatedLocation(location)
858 return location, formatter
860 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
861 # Docstring inherited from base class
862 if transfer != "auto":
863 return transfer
865 # See if the paths are within the datastore or not
866 inside = [self._pathInStore(d.path) is not None for d in datasets]
868 if all(inside):
869 transfer = None
870 elif not any(inside):
871 # Allow ResourcePath to use its own knowledge
872 transfer = "auto"
873 else:
874 # This can happen when importing from a datastore that
875 # has had some datasets ingested using "direct" mode.
876 # Also allow ResourcePath to sort it out, but warn about it.
879 log.warning(
880 "Some datasets are inside the datastore and some are outside. Using 'split' "
881 "transfer mode. This assumes that the files outside the datastore are "
882 "still accessible to the new butler since they will not be copied into "
883 "the target datastore."
884 )
885 transfer = "split"
887 return transfer
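# Editorial aside, a minimal sketch of the "auto" transfer-mode decision above
# (hypothetical helper; the real method consults _pathInStore for each file):
def _choose_transfer_sketch(inside: list[bool]) -> str | None:
    if all(inside):
        return None    # everything already lives inside the datastore
    if not any(inside):
        return "auto"  # let ResourcePath pick an appropriate transfer
    return "split"     # mixture: external files are ingested by reference only

assert _choose_transfer_sketch([True, True]) is None
assert _choose_transfer_sketch([False, False]) == "auto"
assert _choose_transfer_sketch([True, False]) == "split"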
889 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
890 """Return path relative to datastore root.
892 Parameters
893 ----------
894 path : `lsst.resources.ResourcePathExpression`
895 Path to dataset. Can be an absolute URI. If relative, it is assumed
896 to be relative to the datastore root.
899 Returns
900 -------
901 inStore : `str` or `None`
902 Path relative to datastore root. Returns `None` if the file is
903 outside the root.
904 """
905 # Relative path will always be relative to datastore
906 pathUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)
907 return pathUri.relative_to(self.root)
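# Editorial aside, a pathlib analogy for the check above (not the ResourcePath
# API): return the path relative to the root, or None when it lies outside.
from pathlib import PurePosixPath

def _path_in_store_sketch(path: str, root: str) -> str | None:
    p, r = PurePosixPath(path), PurePosixPath(root)
    return str(p.relative_to(r)) if p.is_relative_to(r) else None

assert _path_in_store_sketch("/repo/a/b.fits", "/repo") == "a/b.fits"
assert _path_in_store_sketch("/elsewhere/b.fits", "/repo") is None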
909 def _standardizeIngestPath(
910 self, path: str | ResourcePath, *, transfer: str | None = None
911 ) -> str | ResourcePath:
912 """Standardize the path of a to-be-ingested file.
914 Parameters
915 ----------
916 path : `str` or `lsst.resources.ResourcePath`
917 Path of a file to be ingested. This parameter is not expected
918 to be all the types that can be used to construct a
919 `~lsst.resources.ResourcePath`.
920 transfer : `str`, optional
921 How (and whether) the dataset should be added to the datastore.
922 See `ingest` for details of transfer modes.
923 This implementation is provided only so
924 `NotImplementedError` can be raised if the mode is not supported;
925 actual transfers are deferred to `_extractIngestInfo`.
927 Returns
928 -------
929 path : `str` or `lsst.resources.ResourcePath`
930 New path in what the datastore considers standard form. If an
931 absolute URI was given that will be returned unchanged.
933 Notes
934 -----
935 Subclasses of `FileDatastore` can implement this method instead
936 of `_prepIngest`. It should not modify the data repository or given
937 file in any way.
939 Raises
940 ------
941 NotImplementedError
942 Raised if the datastore does not support the given transfer mode
943 (including the case where ingest is not supported at all).
944 FileNotFoundError
945 Raised if one of the given files does not exist.
946 """
947 if transfer not in (None, "direct", "split") + self.root.transferModes:
948 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
950 # A relative URI indicates relative to datastore root
951 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)
952 if not srcUri.isabs():
953 srcUri = self.root.join(path)
955 if not srcUri.exists():
956 raise FileNotFoundError(
957 f"Resource at {srcUri} does not exist; note that paths to ingest "
958 f"are assumed to be relative to {self.root} unless they are absolute."
959 )
961 if transfer is None:
962 relpath = srcUri.relative_to(self.root)
963 if not relpath:
964 raise RuntimeError(
965 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
966 )
968 # Return the relative path within the datastore for internal
969 # transfer
970 path = relpath
972 return path
974 def _extractIngestInfo(
975 self,
976 path: ResourcePathExpression,
977 ref: DatasetRef,
978 *,
979 formatter: Formatter | type[Formatter],
980 transfer: str | None = None,
981 record_validation_info: bool = True,
982 ) -> StoredFileInfo:
983 """Relocate (if necessary) and extract `StoredFileInfo` from a
984 to-be-ingested file.
986 Parameters
987 ----------
988 path : `lsst.resources.ResourcePathExpression`
989 URI or path of a file to be ingested.
990 ref : `DatasetRef`
991 Reference for the dataset being ingested. Guaranteed to have
992 ``dataset_id`` not `None`.
993 formatter : `type` or `Formatter`
994 `Formatter` subclass to use for this dataset or an instance.
995 transfer : `str`, optional
996 How (and whether) the dataset should be added to the datastore.
997 See `ingest` for details of transfer modes.
998 record_validation_info : `bool`, optional
999 If `True`, the default, the datastore can record validation
1000 information associated with the file. If `False` the datastore
1001 will not attempt to track any information such as checksums
1002 or file sizes. This can be useful if such information is tracked
1003 in an external system or if the file is to be compressed in place.
1004 It is up to the datastore whether this parameter is relevant.
1006 Returns
1007 -------
1008 info : `StoredFileInfo`
1009 Internal datastore record for this file. This will be inserted by
1010 the caller; the `_extractIngestInfo` is only responsible for
1011 creating and populating the struct.
1013 Raises
1014 ------
1015 FileNotFoundError
1016 Raised if one of the given files does not exist.
1017 FileExistsError
1018 Raised if transfer is not `None` but the (internal) location the
1019 file would be moved to is already occupied.
1020 """
1021 if self._transaction is None:
1022 raise RuntimeError("Ingest called without transaction enabled")
1024 # Create URI of the source path, do not need to force a relative
1025 # path to absolute.
1026 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)
1028 # Track whether we have read the size of the source yet
1029 have_sized = False
1031 tgtLocation: Location | None
1032 if transfer is None or transfer == "split":
1033 # A relative path is assumed to be relative to the datastore
1034 # in this context
1035 if not srcUri.isabs():
1036 tgtLocation = self.locationFactory.fromPath(srcUri.ospath, trusted_path=False)
1037 else:
1038 # Work out the path in the datastore from an absolute URI
1039 # This is required to be within the datastore.
1040 pathInStore = srcUri.relative_to(self.root)
1041 if pathInStore is None and transfer is None:
1042 raise RuntimeError(
1043 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
1044 )
1045 if pathInStore:
1046 tgtLocation = self.locationFactory.fromPath(pathInStore, trusted_path=True)
1047 elif transfer == "split":
1048 # Outside the datastore but treat that as a direct ingest
1049 # instead.
1050 tgtLocation = None
1051 else:
1052 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
1053 elif transfer == "direct":
1054 # Want to store the full URI to the resource directly in
1055 # datastore. This is useful for referring to permanent archive
1056 # storage for raw data.
1057 # Trust that people know what they are doing.
1058 tgtLocation = None
1059 else:
1060 # Work out the name we want this ingested file to have
1061 # inside the datastore
1062 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
1063 if not tgtLocation.uri.dirname().exists():
1064 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
1065 tgtLocation.uri.dirname().mkdir()
1067 # if we are transferring from a local file to a remote location
1068 # it may be more efficient to get the size and checksum of the
1069 # local file rather than the transferred one
1070 if record_validation_info and srcUri.isLocal:
1071 size = srcUri.size()
1072 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
1073 have_sized = True
1075 # Transfer the resource to the destination.
1076 # Allow overwrite of an existing file. This matches the behavior
1077 # of datastore.put() in that it trusts that registry would not
1078 # be asking to overwrite unless registry thought that the
1079 # overwrite was allowed.
1080 tgtLocation.uri.transfer_from(
1081 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
1082 )
1084 if tgtLocation is None:
1085 # This means we are using direct mode
1086 targetUri = srcUri
1087 targetPath = str(srcUri)
1088 else:
1089 targetUri = tgtLocation.uri
1090 targetPath = tgtLocation.pathInStore.path
1092 # the file should exist in the datastore now
1093 if record_validation_info:
1094 if not have_sized:
1095 size = targetUri.size()
1096 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
1097 else:
1098 # Not recording any file information.
1099 size = -1
1100 checksum = None
1102 return StoredFileInfo(
1103 formatter=formatter,
1104 path=targetPath,
1105 storageClass=ref.datasetType.storageClass,
1106 component=ref.datasetType.component(),
1107 file_size=size,
1108 checksum=checksum,
1109 )
1111 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
1112 # Docstring inherited from Datastore._prepIngest.
1113 filtered = []
1114 for dataset in datasets:
1115 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1116 if not acceptable:
1117 continue
1118 else:
1119 dataset.refs = acceptable
1120 if dataset.formatter is None:
1121 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1122 else:
1123 assert isinstance(dataset.formatter, type | str)
1124 formatter_class = get_class_of(dataset.formatter)
1125 if not issubclass(formatter_class, Formatter):
1126 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1127 dataset.formatter = formatter_class
1128 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1129 filtered.append(dataset)
1130 return _IngestPrepData(filtered)
1132 @transactional
1133 def _finishIngest(
1134 self,
1135 prepData: Datastore.IngestPrepData,
1136 *,
1137 transfer: str | None = None,
1138 record_validation_info: bool = True,
1139 ) -> None:
1140 # Docstring inherited from Datastore._finishIngest.
1141 refsAndInfos = []
1142 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1143 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1144 # Do ingest as if the first dataset ref is associated with the file
1145 info = self._extractIngestInfo(
1146 dataset.path,
1147 dataset.refs[0],
1148 formatter=dataset.formatter,
1149 transfer=transfer,
1150 record_validation_info=record_validation_info,
1151 )
1152 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1154 # In direct mode we can allow repeated ingests of the same thing
1155 # if we are sure that the external dataset is immutable. We use
1156 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are
1157 # separated.
1158 refs_and_infos_replace = []
1159 refs_and_infos_insert = []
1160 if transfer == "direct":
1161 for entry in refsAndInfos:
1162 if entry[0].id.version == 5:
1163 refs_and_infos_replace.append(entry)
1164 else:
1165 refs_and_infos_insert.append(entry)
1166 else:
1167 refs_and_infos_insert = refsAndInfos
1169 if refs_and_infos_insert:
1170 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT)
1171 if refs_and_infos_replace:
1172 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE)
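# Editorial aside, a stdlib sketch of the UUID version split above: name-based
# (version 5) IDs are treated as reproducible and may be re-ingested via REPLACE,
# while random (version 4) IDs always go through a plain INSERT. The namespace
# and file name below are made up for illustration.
import uuid

_NAMESPACE = uuid.uuid4()
_v5_id = uuid.uuid5(_NAMESPACE, "external/immutable/file.fits")
_v4_id = uuid.uuid4()

def _route_sketch(dataset_id: uuid.UUID, transfer: str | None) -> str:
    return "replace" if transfer == "direct" and dataset_id.version == 5 else "insert"

assert _route_sketch(_v5_id, "direct") == "replace"
assert _route_sketch(_v4_id, "direct") == "insert"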
1174 def _calculate_ingested_datastore_name(
1175 self,
1176 srcUri: ResourcePath,
1177 ref: DatasetRef,
1178 formatter: Formatter | type[Formatter] | None = None,
1179 ) -> Location:
1180 """Given a source URI and a DatasetRef, determine the name the
1181 dataset will have inside datastore.
1183 Parameters
1184 ----------
1185 srcUri : `lsst.resources.ResourcePath`
1186 URI to the source dataset file.
1187 ref : `DatasetRef`
1188 Ref associated with the newly-ingested dataset artifact. This
1189 is used to determine the name within the datastore.
1190 formatter : `Formatter` or `type` [`Formatter`], optional
1191 Formatter to use for validation. Can be a class or an instance.
1192 No validation of the file extension is performed if the
1193 ``formatter`` is `None`. This can be used if the caller knows
1194 that the source URI and target URI will use the same formatter.
1196 Returns
1197 -------
1198 location : `Location`
1199 Target location for the newly-ingested dataset.
1200 """
1201 # Ingesting a file from outside the datastore.
1202 # This involves a new name.
1203 template = self.templates.getTemplate(ref)
1204 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True)
1206 # Get the extension
1207 ext = srcUri.getExtension()
1209 # Update the destination to include that extension
1210 location.updateExtension(ext)
1212 # Ask the formatter to validate this extension
1213 if formatter is not None:
1214 formatter.validateExtension(location)
1216 return location
1218 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1219 """Write out in memory dataset to datastore.
1221 Parameters
1222 ----------
1223 inMemoryDataset : `object`
1224 Dataset to write to datastore.
1225 ref : `DatasetRef`
1226 Registry information associated with this dataset.
1228 Returns
1229 -------
1230 info : `StoredFileInfo`
1231 Information describing the artifact written to the datastore.
1232 """
1233 # May need to coerce the in memory dataset to the correct
1234 # python type, but first we need to make sure the storage class
1235 # reflects the one defined in the data repository.
1236 ref = self._cast_storage_class(ref)
1237 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1239 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1240 uri = location.uri
1242 if not uri.dirname().exists():
1243 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1244 uri.dirname().mkdir()
1246 if self._transaction is None:
1247 raise RuntimeError("Attempting to write artifact without transaction enabled")
1249 def _removeFileExists(uri: ResourcePath) -> None:
1250 """Remove a file and do not complain if it is not there.
1252 This is important since a formatter might fail before the file
1253 is written and we should not confuse people by writing spurious
1254 error messages to the log.
1255 """
1256 with contextlib.suppress(FileNotFoundError):
1257 uri.remove()
1259 # Register a callback to try to delete the uploaded data if
1260 # something fails below
1261 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1263 data_written = False
1265 # For remote URIs some datasets can be serialized directly
1266 # to bytes and sent to the remote datastore without writing a
1267 # file. If the dataset is intended to be saved to the cache
1268 # a file is always written and direct write to the remote
1269 # datastore is bypassed.
1270 if not uri.isLocal and not self.cacheManager.should_be_cached(ref):
1271 # Remote URI that is not cached so can write directly.
1272 try:
1273 serializedDataset = formatter.toBytes(inMemoryDataset)
1274 except NotImplementedError:
1275 # Fallback to the file writing option.
1276 pass
1277 except Exception as e:
1278 raise RuntimeError(
1279 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1280 ) from e
1281 else:
1282 log.debug("Writing bytes directly to %s", uri)
1283 uri.write(serializedDataset, overwrite=True)
1284 log.debug("Successfully wrote bytes directly to %s", uri)
1285 data_written = True
1287 if not data_written:
1288 # Did not write the bytes directly to object store so instead
1289 # write to temporary file. Always write to a temporary even if
1290 # using a local file system -- that gives us atomic writes.
1291 # If a process is killed as the file is being written we do not
1292 # want it to remain in the correct place but in corrupt state.
1293 # For local files write to the output directory not temporary dir.
1294 prefix = uri.dirname() if uri.isLocal else None
1295 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1296 # Need to configure the formatter to write to a different
1297 # location and that needs us to overwrite internals
1298 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1299 with formatter._updateLocation(Location(None, temporary_uri)):
1300 try:
1301 formatter.write(inMemoryDataset)
1302 except Exception as e:
1303 raise RuntimeError(
1304 f"Failed to serialize dataset {ref} of type"
1305 f" {type(inMemoryDataset)} to "
1306 f"temporary location {temporary_uri}"
1307 ) from e
1309 # Use move for a local file since that becomes an efficient
1310 # os.rename. For remote resources we use copy to allow the
1311 # file to be cached afterwards.
1312 transfer = "move" if uri.isLocal else "copy"
1314 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1316 if transfer == "copy":
1317 # Cache if required
1318 self.cacheManager.move_to_cache(temporary_uri, ref)
1320 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1322 # URI is needed to resolve what ingest case are we dealing with
1323 return self._extractIngestInfo(uri, ref, formatter=formatter)
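# Editorial aside, a stdlib analogy of the write path above (hypothetical helper,
# local filesystem only): serialize to a temporary file in the destination
# directory, then atomically rename it into place so an interrupted write never
# leaves a partial artifact at the final location.
import contextlib
import os
import tempfile

def _atomic_write_sketch(destination: str, payload: bytes) -> None:
    directory = os.path.dirname(destination) or "."
    fd, tmp_path = tempfile.mkstemp(dir=directory, suffix=os.path.splitext(destination)[1])
    try:
        with os.fdopen(fd, "wb") as tmp_file:
            tmp_file.write(payload)
        os.replace(tmp_path, destination)  # atomic rename on the same filesystem
    except BaseException:
        with contextlib.suppress(FileNotFoundError):
            os.remove(tmp_path)
        raise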
1325 def knows(self, ref: DatasetRef) -> bool:
1326 """Check if the dataset is known to the datastore.
1328 Does not check for existence of any artifact.
1330 Parameters
1331 ----------
1332 ref : `DatasetRef`
1333 Reference to the required dataset.
1335 Returns
1336 -------
1337 exists : `bool`
1338 `True` if the dataset is known to the datastore.
1339 """
1340 fileLocations = self._get_dataset_locations_info(ref)
1341 if fileLocations:
1342 return True
1343 return False
1345 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1346 # Docstring inherited from the base class.
1348 # The records themselves. Could be missing some entries.
1349 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
1351 return {ref: ref.id in records for ref in refs}
1353 def _process_mexists_records(
1354 self,
1355 id_to_ref: dict[DatasetId, DatasetRef],
1356 records: dict[DatasetId, list[StoredFileInfo]],
1357 all_required: bool,
1358 artifact_existence: dict[ResourcePath, bool] | None = None,
1359 ) -> dict[DatasetRef, bool]:
1360 """Check given records for existence.
1362 Helper function for `mexists()`.
1364 Parameters
1365 ----------
1366 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1367 Mapping of the dataset ID to the dataset ref itself.
1368 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1369 Records as generally returned by
1370 ``_get_stored_records_associated_with_refs``.
1371 all_required : `bool`
1372 Flag to indicate whether a dataset is considered to exist only if
1373 all of its associated artifacts exist, or if any one of them does.
1374 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1375 Optional mapping of datastore artifact to existence. Updated by
1376 this method with details of all artifacts tested. Can be `None`
1377 if the caller is not interested.
1379 Returns
1380 -------
1381 existence : `dict` of [`DatasetRef`, `bool`]
1382 Mapping from dataset to boolean indicating existence.
1383 """
1384 # The URIs to be checked and a mapping of those URIs to
1385 # the dataset ID.
1386 uris_to_check: list[ResourcePath] = []
1387 location_map: dict[ResourcePath, DatasetId] = {}
1389 location_factory = self.locationFactory
1391 uri_existence: dict[ResourcePath, bool] = {}
1392 for ref_id, infos in records.items():
1393 # Key is the dataset Id, value is list of StoredItemInfo
1394 uris = [info.file_location(location_factory).uri for info in infos]
1395 location_map.update({uri: ref_id for uri in uris})
1397 # Check the local cache directly for a dataset corresponding
1398 # to the remote URI.
1399 if self.cacheManager.file_count > 0:
1400 ref = id_to_ref[ref_id]
1401 for uri, storedFileInfo in zip(uris, infos, strict=True):
1402 check_ref = ref
1403 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1404 check_ref = ref.makeComponentRef(component)
1405 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1406 # Proxy for URI existence.
1407 uri_existence[uri] = True
1408 else:
1409 uris_to_check.append(uri)
1410 else:
1411 # Check all of them.
1412 uris_to_check.extend(uris)
1414 if artifact_existence is not None:
1415 # If a URI has already been checked remove it from the list
1416 # and immediately add the status to the output dict.
1417 filtered_uris_to_check = []
1418 for uri in uris_to_check:
1419 if uri in artifact_existence:
1420 uri_existence[uri] = artifact_existence[uri]
1421 else:
1422 filtered_uris_to_check.append(uri)
1423 uris_to_check = filtered_uris_to_check
1425 # Results.
1426 dataset_existence: dict[DatasetRef, bool] = {}
1428 uri_existence.update(ResourcePath.mexists(uris_to_check))
1429 for uri, exists in uri_existence.items():
1430 dataset_id = location_map[uri]
1431 ref = id_to_ref[dataset_id]
1433 # Disassembled composite needs to check all locations.
1434 # all_required indicates whether all need to exist or not.
1435 if ref in dataset_existence:
1436 if all_required:
1437 exists = dataset_existence[ref] and exists
1438 else:
1439 exists = dataset_existence[ref] or exists
1440 dataset_existence[ref] = exists
1442 if artifact_existence is not None:
1443 artifact_existence.update(uri_existence)
1445 return dataset_existence
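# Editorial aside, a minimal sketch of the folding above: a disassembled dataset
# maps to several artifacts, and ``all_required`` decides whether every artifact
# must exist (AND) or any one of them suffices (OR). All names are made up.
def _fold_existence_sketch(
    per_artifact: dict[str, bool], artifact_to_dataset: dict[str, str], all_required: bool
) -> dict[str, bool]:
    existence: dict[str, bool] = {}
    for artifact, exists in per_artifact.items():
        dataset = artifact_to_dataset[artifact]
        if dataset in existence:
            exists = (existence[dataset] and exists) if all_required else (existence[dataset] or exists)
        existence[dataset] = exists
    return existence

_checks = {"a_comp1.fits": True, "a_comp2.fits": False}
_mapping = {"a_comp1.fits": "dataset-a", "a_comp2.fits": "dataset-a"}
assert _fold_existence_sketch(_checks, _mapping, all_required=True) == {"dataset-a": False}
assert _fold_existence_sketch(_checks, _mapping, all_required=False) == {"dataset-a": True}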
1447 def mexists(
1448 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1449 ) -> dict[DatasetRef, bool]:
1450 """Check the existence of multiple datasets at once.
1452 Parameters
1453 ----------
1454 refs : iterable of `DatasetRef`
1455 The datasets to be checked.
1456 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1457 Optional mapping of datastore artifact to existence. Updated by
1458 this method with details of all artifacts tested. Can be `None`
1459 if the caller is not interested.
1461 Returns
1462 -------
1463 existence : `dict` of [`DatasetRef`, `bool`]
1464 Mapping from dataset to boolean indicating existence.
1466 Notes
1467 -----
1468 To minimize potentially costly remote existence checks, the local
1469 cache is checked as a proxy for existence. If a file for this
1470 `DatasetRef` does exist no check is done for the actual URI. This
1471 could result in possibly unexpected behavior if the dataset itself
1472 has been removed from the datastore by another process whilst it is
1473 still in the cache.
1474 """
1475 chunk_size = 10_000
1476 dataset_existence: dict[DatasetRef, bool] = {}
1477 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1478 n_found_total = 0
1479 n_checked = 0
1480 n_chunks = 0
1481 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1482 chunk_result = self._mexists(chunk, artifact_existence)
1484 # The log message level and content depend on how many
1485 # datasets we are processing.
1486 n_results = len(chunk_result)
1488 # Use verbose logging to ensure that messages can be seen
1489 # easily if many refs are being checked.
1490 log_threshold = VERBOSE
1491 n_checked += n_results
1493 # This sum can take some time so only do it if we know the
1494 # result is going to be used.
1495 n_found = 0
1496 if log.isEnabledFor(log_threshold):
1497 # Can treat the booleans as 0, 1 integers and sum them.
1498 n_found = sum(chunk_result.values())
1499 n_found_total += n_found
1501 # We are deliberately not trying to count the number of refs
1502 # provided in case it's in the millions. This means there is a
1503 # situation where the number of refs exactly matches the chunk
1504 # size and we will switch to the multi-chunk path even though
1505 # we only have a single chunk.
1506 if n_results < chunk_size and n_chunks == 0:
1507 # Single chunk will be processed so we can provide more detail.
1508 if n_results == 1:
1509 ref = list(chunk_result)[0]
1510 # Use debug logging to be consistent with `exists()`.
1511 log.debug(
1512 "Calling mexists() with single ref that does%s exist (%s).",
1513 "" if chunk_result[ref] else " not",
1514 ref,
1515 )
1516 else:
1517 # Single chunk but multiple files. Summarize.
1518 log.log(
1519 log_threshold,
1520 "Number of datasets found in datastore: %d out of %d datasets checked.",
1521 n_found,
1522 n_checked,
1523 )
1525 else:
1526 # Use incremental verbose logging when we have multiple chunks.
1527 log.log(
1528 log_threshold,
1529 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1530 "(running total from all chunks so far: %d found out of %d checked)",
1531 n_chunks,
1532 n_found,
1533 n_results,
1534 n_found_total,
1535 n_checked,
1536 )
1537 dataset_existence.update(chunk_result)
1538 n_chunks += 1
1540 return dataset_existence
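# Editorial aside, a stdlib stand-in for the ``chunk_iterable`` helper used above:
# refs are processed in fixed-size batches so existence checks and logging scale
# to very large inputs.
from collections.abc import Iterable, Iterator
from itertools import islice

def _chunked_sketch(items: Iterable, chunk_size: int) -> Iterator[list]:
    iterator = iter(items)
    while chunk := list(islice(iterator, chunk_size)):
        yield chunk

assert list(_chunked_sketch(range(5), 2)) == [[0, 1], [2, 3], [4]]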
1542 def _mexists(
1543 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1544 ) -> dict[DatasetRef, bool]:
1545 """Check the existence of multiple datasets at once.
1547 Parameters
1548 ----------
1549 refs : iterable of `DatasetRef`
1550 The datasets to be checked.
1551 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1552 Optional mapping of datastore artifact to existence. Updated by
1553 this method with details of all artifacts tested. Can be `None`
1554 if the caller is not interested.
1556 Returns
1557 -------
1558 existence : `dict` of [`DatasetRef`, `bool`]
1559 Mapping from dataset to boolean indicating existence.
1560 """
1561 # Make a mapping from refs with the internal storage class to the given
1562 # refs that may have a different one. We'll use the internal refs
1563 # throughout this method and convert back at the very end.
1564 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1566 # Need a mapping of dataset_id to (internal) dataset ref since some
1567 # internal APIs work with dataset_id.
1568 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1570 # Set of all IDs we are checking for.
1571 requested_ids = set(id_to_ref.keys())
1573 # The records themselves. Could be missing some entries.
1574 records = self._get_stored_records_associated_with_refs(
1575 id_to_ref.values(), ignore_datastore_records=True
1576 )
1578 dataset_existence = self._process_mexists_records(
1579 id_to_ref, records, True, artifact_existence=artifact_existence
1580 )
1582 # Set of IDs that have been handled.
1583 handled_ids = {ref.id for ref in dataset_existence}
1585 missing_ids = requested_ids - handled_ids
1586 if missing_ids:
1587 dataset_existence.update(
1588 self._mexists_check_expected(
1589 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1590 )
1591 )
1593 return {
1594 internal_ref_to_input_ref[internal_ref]: existence
1595 for internal_ref, existence in dataset_existence.items()
1596 }
1598 def _mexists_check_expected(
1599 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1600 ) -> dict[DatasetRef, bool]:
1601 """Check existence of refs that are not known to datastore.
1603 Parameters
1604 ----------
1605 refs : iterable of `DatasetRef`
1606 The datasets to be checked. These are assumed not to be known
1607 to datastore.
1608 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1609 Optional mapping of datastore artifact to existence. Updated by
1610 this method with details of all artifacts tested. Can be `None`
1611 if the caller is not interested.
1613 Returns
1614 -------
1615 existence : `dict` of [`DatasetRef`, `bool`]
1616 Mapping from dataset to boolean indicating existence.
1617 """
1618 dataset_existence: dict[DatasetRef, bool] = {}
1619 if not self.trustGetRequest:
1620 # Must assume these do not exist
1621 for ref in refs:
1622 dataset_existence[ref] = False
1623 else:
1624 log.debug(
1625 "%d datasets were not known to datastore during initial existence check.",
1626 len(refs),
1627 )
1629 # Construct data structure identical to that returned
1630 # by _get_stored_records_associated_with_refs() but using
1631 # guessed names.
1632 records = {}
1633 id_to_ref = {}
1634 for missing_ref in refs:
1635 expected = self._get_expected_dataset_locations_info(missing_ref)
1636 dataset_id = missing_ref.id
1637 records[dataset_id] = [info for _, info in expected]
1638 id_to_ref[dataset_id] = missing_ref
1640 dataset_existence.update(
1641 self._process_mexists_records(
1642 id_to_ref,
1643 records,
1644 False,
1645 artifact_existence=artifact_existence,
1646 )
1647 )
1649 return dataset_existence
1651 def exists(self, ref: DatasetRef) -> bool:
1652 """Check if the dataset exists in the datastore.
1654 Parameters
1655 ----------
1656 ref : `DatasetRef`
1657 Reference to the required dataset.
1659 Returns
1660 -------
1661 exists : `bool`
1662 `True` if the entity exists in the `Datastore`.
1664 Notes
1665 -----
1666 The local cache is checked as a proxy for existence in the remote
1667 object store. It is possible that another process on a different
1668 compute node could remove the file from the object store even
1669 though it is present in the local cache.
1670 """
1671 ref = self._cast_storage_class(ref)
1672 # We cannot trust datastore records from ref, as many unit tests delete
1673 # datasets and check their existence.
1674 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True)
1676 # If we are being asked to trust that the registry might not be correct,
1677 # we ask for the expected locations and check them explicitly.
1678 if not fileLocations:
1679 if not self.trustGetRequest:
1680 return False
1682 # First check the cache. If it is not found we must check
1683 # the datastore itself. Assume that any component in the cache
1684 # means that the dataset does exist somewhere.
1685 if self.cacheManager.known_to_cache(ref):
1686 return True
1688 # When we are guessing a dataset location we can not check
1689 # for the existence of every component since we can not
1690 # know if every component was written. Instead we check
1691 # for the existence of any of the expected locations.
1692 for location, _ in self._get_expected_dataset_locations_info(ref):
1693 if self._artifact_exists(location):
1694 return True
1695 return False
1697 # All listed artifacts must exist.
1698 for location, storedFileInfo in fileLocations:
1699 # Checking in cache needs the component ref.
1700 check_ref = ref
1701 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1702 check_ref = ref.makeComponentRef(component)
1703 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1704 continue
1706 if not self._artifact_exists(location):
1707 return False
1709 return True
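# Illustrative sketch for exists(): hypothetical helper, assuming ``datastore``
# and a resolved ``ref`` supplied by the caller. For many datasets prefer
# mexists() above, which batches the per-artifact checks.
def require_dataset(datastore, ref) -> None:
    """Raise if the datastore has no artifact for ``ref``."""
    if not datastore.exists(ref):
        raise FileNotFoundError(f"No artifact found in datastore for {ref}")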
1711 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1712 """Return URIs associated with dataset.
1714 Parameters
1715 ----------
1716 ref : `DatasetRef`
1717 Reference to the required dataset.
1718 predict : `bool`, optional
1719 If the datastore does not know about the dataset, controls whether
1720 it should return a predicted URI or not.
1722 Returns
1723 -------
1724 uris : `DatasetRefURIs`
1725 The URI to the primary artifact associated with this dataset (if
1726 the dataset was disassembled within the datastore this may be
1727 `None`), and the URIs to any components associated with the dataset
1728 artifact (can be empty if there are no components).
1729 """
1730 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1731 return many[ref]
1733 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1734 """URI to the Dataset.
1736 Parameters
1737 ----------
1738 ref : `DatasetRef`
1739 Reference to the required Dataset.
1740 predict : `bool`
1741 If `True`, allow URIs to be returned of datasets that have not
1742 been written.
1744 Returns
1745 -------
1746 uri : `lsst.resources.ResourcePath`
1747 URI pointing to the dataset within the datastore. If the
1748 dataset does not exist in the datastore, and if ``predict`` is
1749 `True`, the URI will be a prediction and will include a URI
1750 fragment "#predicted".
1751 If the datastore does not have entities that relate well
1752 to the concept of a URI the returned URI will be
1753 descriptive. The returned URI is not guaranteed to be obtainable.
1755 Raises
1756 ------
1757 FileNotFoundError
1758 Raised if a URI has been requested for a dataset that does not
1759 exist and guessing is not allowed.
1760 RuntimeError
1761 Raised if a request is made for a single URI but multiple URIs
1762 are associated with this dataset.
1764 Notes
1765 -----
1766 When a predicted URI is requested an attempt will be made to form
1767 a reasonable URI based on file templates and the expected formatter.
1768 """
1769 primary, components = self.getURIs(ref, predict)
1770 if primary is None or components:
1771 raise RuntimeError(
1772 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1773 )
1774 return primary
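# Illustrative sketch for getURI()/getURIs(): hypothetical helper assuming
# ``datastore`` and ``ref`` exist. getURI() raises RuntimeError for
# disassembled datasets, so fall back to the per-component URIs.
def describe_location(datastore, ref) -> str:
    try:
        return str(datastore.getURI(ref, predict=True))
    except RuntimeError:
        # Disassembled composite: report each component URI instead.
        _, components = datastore.getURIs(ref, predict=True)
        return ", ".join(f"{name}: {uri}" for name, uri in components.items())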
1776 def _predict_URIs(
1777 self,
1778 ref: DatasetRef,
1779 ) -> DatasetRefURIs:
1780 """Predict the URIs of a dataset ref.
1782 Parameters
1783 ----------
1784 ref : `DatasetRef`
1785 Reference to the required Dataset.
1787 Returns
1788 -------
1789 uris : `DatasetRefURIs`
1790 Primary and component URIs. URIs will contain a URI fragment
1791 "#predicted".
1792 """
1793 uris = DatasetRefURIs()
1795 if self.composites.shouldBeDisassembled(ref):
1796 for component, _ in ref.datasetType.storageClass.components.items():
1797 comp_ref = ref.makeComponentRef(component)
1798 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1800 # Add the "#predicted" URI fragment to indicate this is a
1801 # guess
1802 uris.componentURIs[component] = ResourcePath(
1803 comp_location.uri.geturl() + "#predicted", forceDirectory=comp_location.uri.dirLike
1804 )
1806 else:
1807 location, _ = self._determine_put_formatter_location(ref)
1809 # Add the "#predicted" URI fragment to indicate this is a guess
1810 uris.primaryURI = ResourcePath(
1811 location.uri.geturl() + "#predicted", forceDirectory=location.uri.dirLike
1812 )
1814 return uris
1816 def getManyURIs(
1817 self,
1818 refs: Iterable[DatasetRef],
1819 predict: bool = False,
1820 allow_missing: bool = False,
1821 ) -> dict[DatasetRef, DatasetRefURIs]:
1822 # Docstring inherited
1824 uris: dict[DatasetRef, DatasetRefURIs] = {}
1826 records = self._get_stored_records_associated_with_refs(refs)
1827 records_keys = records.keys()
1829 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1830 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1832 # Have to handle trustGetRequest mode by checking for the existence
1833 # of the missing refs on disk.
1834 if missing_refs:
1835 dataset_existence = self._mexists_check_expected(missing_refs, None)
1836 really_missing = set()
1837 not_missing = set()
1838 for ref, exists in dataset_existence.items():
1839 if exists:
1840 not_missing.add(ref)
1841 else:
1842 really_missing.add(ref)
1844 if not_missing:
1845 # Need to recalculate the missing/existing split.
1846 existing_refs = existing_refs + tuple(not_missing)
1847 missing_refs = tuple(really_missing)
1849 for ref in missing_refs:
1850 # if this has never been written then we have to guess
1851 if not predict:
1852 if not allow_missing:
1853 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1854 else:
1855 uris[ref] = self._predict_URIs(ref)
1857 for ref in existing_refs:
1858 file_infos = records[ref.id]
1859 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1860 uris[ref] = self._locations_to_URI(ref, file_locations)
1862 return uris
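# Illustrative sketch for getManyURIs(): hypothetical helper assuming
# ``datastore`` and ``refs``. Collects primary URIs in bulk, skipping
# unknown datasets and disassembled datasets (which have no primary URI).
def primary_uris(datastore, refs):
    uris = datastore.getManyURIs(refs, predict=False, allow_missing=True)
    return {ref: u.primaryURI for ref, u in uris.items() if u.primaryURI is not None}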
1864 def _locations_to_URI(
1865 self,
1866 ref: DatasetRef,
1867 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1868 ) -> DatasetRefURIs:
1869 """Convert one or more file locations associated with a DatasetRef
1870 to a DatasetRefURIs.
1872 Parameters
1873 ----------
1874 ref : `DatasetRef`
1875 Reference to the dataset.
1876 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
1877 Each item in the sequence is the location of the dataset within the
1878 datastore and stored information about the file and its formatter.
1879 If there is only one item in the sequence then it is treated as the
1880 primary URI. If there is more than one item then they are treated
1881 as component URIs. If there are no items then an error is raised
1882 unless ``self.trustGetRequest`` is `True`.
1884 Returns
1885 -------
1886 uris : `DatasetRefURIs`
1887 Represents the primary URI or component URIs described by the
1888 inputs.
1890 Raises
1891 ------
1892 RuntimeError
1893 If no file locations are passed in and ``self.trustGetRequest`` is
1894 `False`.
1895 FileNotFoundError
1896 If a passed-in URI does not exist, and ``self.trustGetRequest``
1897 is `False`.
1898 RuntimeError
1899 If a passed-in `StoredFileInfo`'s ``component`` is `None` (this is
1900 unexpected).
1901 """
1902 guessing = False
1903 uris = DatasetRefURIs()
1905 if not file_locations:
1906 if not self.trustGetRequest:
1907 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1908 file_locations = self._get_expected_dataset_locations_info(ref)
1909 guessing = True
1911 if len(file_locations) == 1:
1912 # No disassembly so this is the primary URI
1913 uris.primaryURI = file_locations[0][0].uri
1914 if guessing and not uris.primaryURI.exists():
1915 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1916 else:
1917 for location, file_info in file_locations:
1918 if file_info.component is None:
1919 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1920 if guessing and not location.uri.exists():
1921 # If we are trusting then it is entirely possible for
1922 # some components to be missing. In that case we skip
1923 # to the next component.
1924 if self.trustGetRequest:
1925 continue
1926 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1927 uris.componentURIs[file_info.component] = location.uri
1929 return uris
1931 def retrieveArtifacts(
1932 self,
1933 refs: Iterable[DatasetRef],
1934 destination: ResourcePath,
1935 transfer: str = "auto",
1936 preserve_path: bool = True,
1937 overwrite: bool = False,
1938 ) -> list[ResourcePath]:
1939 """Retrieve the file artifacts associated with the supplied refs.
1941 Parameters
1942 ----------
1943 refs : iterable of `DatasetRef`
1944 The datasets for which file artifacts are to be retrieved.
1945 A single ref can result in multiple files. The refs must
1946 be resolved.
1947 destination : `lsst.resources.ResourcePath`
1948 Location to write the file artifacts.
1949 transfer : `str`, optional
1950 Method to use to transfer the artifacts. Must be one of the options
1951 supported by `lsst.resources.ResourcePath.transfer_from()`.
1952 "move" is not allowed.
1953 preserve_path : `bool`, optional
1954 If `True` the full path of the file artifact within the datastore
1955 is preserved. If `False` the final file component of the path
1956 is used.
1957 overwrite : `bool`, optional
1958 If `True` allow transfers to overwrite existing files at the
1959 destination.
1961 Returns
1962 -------
1963 targets : `list` of `lsst.resources.ResourcePath`
1964 URIs of file artifacts in destination location. Order is not
1965 preserved.
1966 """
1967 if not destination.isdir():
1968 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1970 if transfer == "move":
1971 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1973 # Source -> Destination
1974 # This also helps filter out duplicate DatasetRef in the request
1975 # that will map to the same underlying file transfer.
1976 to_transfer: dict[ResourcePath, ResourcePath] = {}
1978 for ref in refs:
1979 locations = self._get_dataset_locations_info(ref)
1980 for location, _ in locations:
1981 source_uri = location.uri
1982 target_uri = determine_destination_for_retrieved_artifact(
1983 destination, location.pathInStore, preserve_path
1984 )
1985 to_transfer[source_uri] = target_uri
1987 # In theory can now parallelize the transfer
1988 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1989 for source_uri, target_uri in to_transfer.items():
1990 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1992 return list(to_transfer.values())
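# Illustrative sketch for retrieveArtifacts(): hypothetical helper assuming
# ``datastore`` and ``refs``; copies the underlying file artifacts into a
# local directory while preserving the in-datastore paths.
from lsst.resources import ResourcePath


def copy_artifacts(datastore, refs, directory: str) -> list[ResourcePath]:
    destination = ResourcePath(directory, forceDirectory=True)
    return datastore.retrieveArtifacts(
        refs, destination, transfer="copy", preserve_path=True, overwrite=False
    )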
1994 def get(
1995 self,
1996 ref: DatasetRef,
1997 parameters: Mapping[str, Any] | None = None,
1998 storageClass: StorageClass | str | None = None,
1999 ) -> Any:
2000 """Load an InMemoryDataset from the store.
2002 Parameters
2003 ----------
2004 ref : `DatasetRef`
2005 Reference to the required Dataset.
2006 parameters : `dict`
2007 `StorageClass`-specific parameters that specify, for example,
2008 a slice of the dataset to be loaded.
2009 storageClass : `StorageClass` or `str`, optional
2010 The storage class to be used to override the Python type
2011 returned by this method. By default the returned type matches
2012 the dataset type definition for this dataset. Specifying a
2013 read `StorageClass` can force a different type to be returned.
2014 This type must be compatible with the original type.
2016 Returns
2017 -------
2018 inMemoryDataset : `object`
2019 Requested dataset or slice thereof as an InMemoryDataset.
2021 Raises
2022 ------
2023 FileNotFoundError
2024 Requested dataset can not be retrieved.
2025 TypeError
2026 Return value from formatter has unexpected type.
2027 ValueError
2028 Formatter failed to process the dataset.
2029 """
2030 # Supplied storage class for the component being read is either
2031 # from the ref itself or an override if we want to force
2032 # type conversion.
2033 if storageClass is not None:
2034 ref = ref.overrideStorageClass(storageClass)
2036 allGetInfo = self._prepare_for_direct_get(ref, parameters)
2037 return get_dataset_as_python_object_from_get_info(
2038 allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager
2039 )
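# Illustrative sketch for get(): hypothetical helper assuming ``datastore``
# and ``ref``. The storage class override is optional and must name a
# storage class compatible with the dataset's original Python type.
from typing import Any


def read_dataset(datastore, ref, storage_class: str | None = None) -> Any:
    return datastore.get(ref, parameters=None, storageClass=storage_class)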
2041 def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload | None:
2042 # Docstring inherited
2044 # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the
2045 # client should have time to download a large file with retries if
2046 # needed, but short enough that it will become obvious quickly that
2047 # these URLs expire.
2048 # From a strictly technical standpoint there is no reason this
2049 # shouldn't be a day or more, but there seems to be a political issue
2050 # where people think there is a risk of end users posting presigned
2051 # URLs for people without access rights to download.
2052 url_expiration_time_seconds = 1 * 60 * 60
2054 locations = self._get_dataset_locations_info(ref)
2055 if len(locations) == 0:
2056 return None
2058 return FileDatastoreGetPayload(
2059 datastore_type="file",
2060 file_info=[_to_file_info_payload(info, url_expiration_time_seconds) for info in locations],
2061 )
2063 @transactional
2064 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2065 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2067 Parameters
2068 ----------
2069 inMemoryDataset : `object`
2070 The dataset to store.
2071 ref : `DatasetRef`
2072 Reference to the associated Dataset.
2074 Raises
2075 ------
2076 TypeError
2077 Supplied object and storage class are inconsistent.
2078 DatasetTypeNotSupportedError
2079 The associated `DatasetType` is not handled by this datastore.
2081 Notes
2082 -----
2083 If the datastore is configured to reject certain dataset types it
2084 is possible that the put will fail and raise a
2085 `DatasetTypeNotSupportedError`. The main use case for this is to
2086 allow `ChainedDatastore` to put to multiple datastores without
2087 requiring that every datastore accepts the dataset.
2088 """
2089 doDisassembly = self.composites.shouldBeDisassembled(ref)
2090 # doDisassembly = True
2092 artifacts = []
2093 if doDisassembly:
2094 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2095 if components is None:
2096 raise RuntimeError(
2097 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2098 f"with storage class {ref.datasetType.storageClass.name} "
2099 "is configured to be disassembled, but cannot be."
2100 )
2101 for component, componentInfo in components.items():
2102 # Don't recurse because we want to take advantage of
2103 # bulk insert -- need a new DatasetRef that refers to the
2104 # same dataset_id but has the component DatasetType.
2105 # DatasetType does not refer to the types of its components,
2106 # so we construct one ourselves.
2107 compRef = ref.makeComponentRef(component)
2108 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2109 artifacts.append((compRef, storedInfo))
2110 else:
2111 # Write the entire thing out
2112 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2113 artifacts.append((ref, storedInfo))
2115 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT)
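# Illustrative sketch for put(): hypothetical helper assuming ``datastore``,
# an in-memory ``obj`` and a resolved ``ref`` whose dataset type matches the
# object. Returns False if this datastore is configured to reject the type.
from lsst.daf.butler import DatasetTypeNotSupportedError


def store_dataset(datastore, obj, ref) -> bool:
    try:
        datastore.put(obj, ref)
    except DatasetTypeNotSupportedError:
        return False
    return datastore.exists(ref)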
2117 @transactional
2118 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
2119 doDisassembly = self.composites.shouldBeDisassembled(ref)
2120 # doDisassembly = True
2122 artifacts = []
2123 if doDisassembly:
2124 components = ref.datasetType.storageClass.delegate().disassemble(in_memory_dataset)
2125 if components is None:
2126 raise RuntimeError(
2127 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2128 f"with storage class {ref.datasetType.storageClass.name} "
2129 "is configured to be disassembled, but cannot be."
2130 )
2131 for component, componentInfo in components.items():
2132 # Don't recurse because we want to take advantage of
2133 # bulk insert -- need a new DatasetRef that refers to the
2134 # same dataset_id but has the component DatasetType.
2135 # DatasetType does not refer to the types of its components,
2136 # so we construct one ourselves.
2137 compRef = ref.makeComponentRef(component)
2138 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2139 artifacts.append((compRef, storedInfo))
2140 else:
2141 # Write the entire thing out
2142 storedInfo = self._write_in_memory_to_artifact(in_memory_dataset, ref)
2143 artifacts.append((ref, storedInfo))
2145 ref_records: DatasetDatastoreRecords = {self._opaque_table_name: [info for _, info in artifacts]}
2146 ref = ref.replace(datastore_records=ref_records)
2147 return {self.name: ref}
2149 @transactional
2150 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2151 # At this point we can safely remove these datasets from the cache
2152 # to avoid confusion later on. If they are not trashed later
2153 # the cache will simply be refilled.
2154 self.cacheManager.remove_from_cache(ref)
2156 # If we are in trust mode there will be nothing to move to
2157 # the trash table and we will have to try to delete the file
2158 # immediately.
2159 if self.trustGetRequest:
2160 # Try to keep the logic below for a single file trash.
2161 if isinstance(ref, DatasetRef):
2162 refs = {ref}
2163 else:
2164 # Will recreate ref at the end of this branch.
2165 refs = set(ref)
2167 # Determine which datasets are known to datastore directly.
2168 id_to_ref = {ref.id: ref for ref in refs}
2169 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2170 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2172 missing = refs - existing_refs
2173 if missing:
2174 # Do an explicit existence check on these refs.
2175 # We only care about the artifacts at this point and not
2176 # the dataset existence.
2177 artifact_existence: dict[ResourcePath, bool] = {}
2178 _ = self.mexists(missing, artifact_existence)
2179 uris = [uri for uri, exists in artifact_existence.items() if exists]
2181 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2182 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2183 for uri in uris:
2184 try:
2185 uri.remove()
2186 except Exception as e:
2187 if ignore_errors:
2188 log.debug("Artifact %s could not be removed: %s", uri, e)
2189 continue
2190 raise
2192 # There is no point asking the code below to remove refs we
2193 # know are missing so update it with the list of existing
2194 # records. Try to retain one vs many logic.
2195 if not existing_refs:
2196 # Nothing more to do since none of the datasets were
2197 # known to the datastore record table.
2198 return
2199 ref = list(existing_refs)
2200 if len(ref) == 1:
2201 ref = ref[0]
2203 # Get file metadata and internal metadata
2204 if not isinstance(ref, DatasetRef):
2205 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2206 # Assumed to be an iterable of refs so bulk mode enabled.
2207 try:
2208 self.bridge.moveToTrash(ref, transaction=self._transaction)
2209 except Exception as e:
2210 if ignore_errors:
2211 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2212 else:
2213 raise
2214 return
2216 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2218 fileLocations = self._get_dataset_locations_info(ref)
2220 if not fileLocations:
2221 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2222 if ignore_errors:
2223 log.warning(err_msg)
2224 return
2225 else:
2226 raise FileNotFoundError(err_msg)
2228 for location, _ in fileLocations:
2229 if not self._artifact_exists(location):
2230 err_msg = (
2231 f"Dataset is known to datastore {self.name} but "
2232 f"associated artifact ({location.uri}) is missing"
2233 )
2234 if ignore_errors:
2235 log.warning(err_msg)
2236 return
2237 else:
2238 raise FileNotFoundError(err_msg)
2240 # Mark dataset as trashed
2241 try:
2242 self.bridge.moveToTrash([ref], transaction=self._transaction)
2243 except Exception as e:
2244 if ignore_errors:
2245 log.warning(
2246 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2247 "but encountered an error: %s",
2248 ref,
2249 self.name,
2250 e,
2251 )
2252 pass
2253 else:
2254 raise
2256 @transactional
2257 def emptyTrash(self, ignore_errors: bool = True) -> None:
2258 """Remove all datasets from the trash.
2260 Parameters
2261 ----------
2262 ignore_errors : `bool`
2263 If `True` return without error even if something went wrong.
2264 Problems could occur if another process is simultaneously trying
2265 to delete.
2266 """
2267 log.debug("Emptying trash in datastore %s", self.name)
2269 # Context manager will empty trash iff we finish it without raising.
2270 # It will also automatically delete the relevant rows from the
2271 # trash table and the records table.
2272 with self.bridge.emptyTrash(
2273 self._table, record_class=StoredFileInfo, record_column="path"
2274 ) as trash_data:
2275 # Removing the artifacts themselves requires that the files are
2276 # not also associated with refs that are not to be trashed.
2277 # Therefore need to do a query with the file paths themselves
2278 # and return all the refs associated with them. Can only delete
2279 # a file if the refs to be trashed are the only refs associated
2280 # with the file.
2281 # This requires multiple copies of the trashed items
2282 trashed, artifacts_to_keep = trash_data
2284 if artifacts_to_keep is None:
2285 # The bridge is not helping us so have to work it out
2286 # ourselves. This is not going to be as efficient.
2287 trashed = list(trashed)
2289 # The instance check is for mypy since up to this point it
2290 # does not know the type of info.
2291 path_map = self._refs_associated_with_artifacts(
2292 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2293 )
2295 for ref, info in trashed:
2296 # Mypy needs to know this is not the base class
2297 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2299 path_map[info.path].remove(ref.id)
2300 if not path_map[info.path]:
2301 del path_map[info.path]
2303 artifacts_to_keep = set(path_map)
2305 for ref, info in trashed:
2306 # Should not happen for this implementation but need
2307 # to keep mypy happy.
2308 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2310 # Mypy needs to know this is not the base class
2311 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2313 if info.path in artifacts_to_keep:
2314 # This is a multi-dataset artifact and we are not
2315 # removing all associated refs.
2316 continue
2318 # Only trashed refs still known to datastore will be returned.
2319 location = info.file_location(self.locationFactory)
2321 # Point of no return for this artifact
2322 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2323 try:
2324 self._delete_artifact(location)
2325 except FileNotFoundError:
2326 # If the file itself has been deleted there is nothing
2327 # we can do about it. It is possible that trash has
2328 # been run in parallel in another process or someone
2329 # decided to delete the file. It is unlikely to come
2330 # back and so we should still continue with the removal
2331 # of the entry from the trash table. It is also possible
2332 # we removed it in a previous iteration if it was
2333 # a multi-dataset artifact. The delete artifact method
2334 # will log a debug message in this scenario.
2335 # Distinguishing file missing before trash started and
2336 # file already removed previously as part of this trash
2337 # is not worth the distinction with regards to potential
2338 # memory cost.
2339 pass
2340 except Exception as e:
2341 if ignore_errors:
2342 # Use a debug message here even though it's not
2343 # a good situation. In some cases this can be
2344 # caused by a race between user A and user B
2345 # and neither of them has permissions for the
2346 # other's files. Butler does not know about users
2347 # and trash has no idea what collections these
2348 # files were in (without guessing from a path).
2349 log.debug(
2350 "Encountered error removing artifact %s from datastore %s: %s",
2351 location.uri,
2352 self.name,
2353 e,
2354 )
2355 else:
2356 raise
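# Illustrative sketch for trash()/emptyTrash(): hypothetical helper assuming
# ``datastore`` and ``refs``. Deletion is two-phase: trash() marks the
# datasets and emptyTrash() removes artifacts not shared with surviving refs.
def delete_datasets(datastore, refs) -> None:
    datastore.trash(refs, ignore_errors=False)
    datastore.emptyTrash(ignore_errors=True)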
2358 @transactional
2359 def transfer_from(
2360 self,
2361 source_datastore: Datastore,
2362 refs: Collection[DatasetRef],
2363 transfer: str = "auto",
2364 artifact_existence: dict[ResourcePath, bool] | None = None,
2365 dry_run: bool = False,
2366 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2367 # Docstring inherited
2368 if type(self) is not type(source_datastore):
2369 raise TypeError(
2370 f"Datastore mismatch between this datastore ({type(self)}) and the "
2371 f"source datastore ({type(source_datastore)})."
2372 )
2374 # Be explicit for mypy
2375 if not isinstance(source_datastore, FileDatastore):
2376 raise TypeError(
2377 "Can only transfer to a FileDatastore from another FileDatastore, not"
2378 f" {type(source_datastore)}"
2379 )
2381 # Stop early if "direct" transfer mode is requested. That would
2382 # require that the URI inside the source datastore should be stored
2383 # directly in the target datastore, which seems unlikely to be useful
2384 # since at any moment the source datastore could delete the file.
2385 if transfer in ("direct", "split"):
2386 raise ValueError(
2387 f"Can not transfer from a source datastore using {transfer} mode since"
2388 " those files are controlled by the other datastore."
2389 )
2391 # Empty existence lookup if none given.
2392 if artifact_existence is None:
2393 artifact_existence = {}
2395 # In order to handle disassembled composites the code works
2396 # at the records level since it can assume that internal APIs
2397 # can be used.
2398 # - If the record already exists in the destination this is assumed
2399 # to be okay.
2400 # - If there is no record but the source and destination URIs are
2401 # identical no transfer is done but the record is added.
2402 # - If the source record refers to an absolute URI currently assume
2403 # that that URI should remain absolute and will be visible to the
2404 # destination butler. May need to have a flag to indicate whether
2405 # the dataset should be transferred. This will only happen if
2406 # the detached Butler has had a local ingest.
2408 # What we really want is all the records in the source datastore
2409 # associated with these refs. Or derived ones if they don't exist
2410 # in the source.
2411 source_records = source_datastore._get_stored_records_associated_with_refs(
2412 refs, ignore_datastore_records=True
2413 )
2415 # The source dataset_ids are the keys in these records
2416 source_ids = set(source_records)
2417 log.debug("Number of datastore records found in source: %d", len(source_ids))
2419 requested_ids = {ref.id for ref in refs}
2420 missing_ids = requested_ids - source_ids
2422 # Missing IDs can be okay if that datastore has allowed
2423 # gets based on file existence. Should we transfer what we can
2424 # or complain about it and warn?
2425 if missing_ids and not source_datastore.trustGetRequest:
2426 raise ValueError(
2427 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2428 )
2430 # Need to map these missing IDs to a DatasetRef so we can guess
2431 # the details.
2432 if missing_ids:
2433 log.info(
2434 "Number of expected datasets missing from source datastore records: %d out of %d",
2435 len(missing_ids),
2436 len(requested_ids),
2437 )
2438 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2440 # This should be chunked in case we end up having to check
2441 # the file store since we need some log output to show
2442 # progress.
2443 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2444 records = {}
2445 for missing in missing_ids_chunk:
2446 # Ask the source datastore where the missing artifacts
2447 # should be. An execution butler might not know about the
2448 # artifacts even if they are there.
2449 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2450 records[missing] = [info for _, info in expected]
2452 # Call the mexist helper method in case we have not already
2453 # checked these artifacts such that artifact_existence is
2454 # empty. This allows us to benefit from parallelism.
2455 # datastore.mexists() itself does not give us access to the
2456 # derived datastore record.
2457 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2458 ref_exists = source_datastore._process_mexists_records(
2459 id_to_ref, records, False, artifact_existence=artifact_existence
2460 )
2462 # Now go through the records and propagate the ones that exist.
2463 location_factory = source_datastore.locationFactory
2464 for missing, record_list in records.items():
2465 # Skip completely if the ref does not exist.
2466 ref = id_to_ref[missing]
2467 if not ref_exists[ref]:
2468 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2469 continue
2470 # Check for file artifact to decide which parts of a
2471 # disassembled composite do exist. If there is only a
2472 # single record we don't even need to look because it can't
2473 # be a composite and must exist.
2474 if len(record_list) == 1:
2475 dataset_records = record_list
2476 else:
2477 dataset_records = [
2478 record
2479 for record in record_list
2480 if artifact_existence[record.file_location(location_factory).uri]
2481 ]
2482 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2484 # Rely on source_records being a defaultdict.
2485 source_records[missing].extend(dataset_records)
2486 log.verbose("Completed scan for missing data files")
2488 # See if we already have these records
2489 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2491 # The artifacts to register
2492 artifacts = []
2494 # Refs that already exist
2495 already_present = []
2497 # Refs that were rejected by this datastore.
2498 rejected = set()
2500 # Refs that were transferred successfully.
2501 accepted = set()
2503 # Record each time we have done a "direct" transfer.
2504 direct_transfers = []
2506 # Now can transfer the artifacts
2507 for ref in refs:
2508 if not self.constraints.isAcceptable(ref):
2509 # This datastore should not be accepting this dataset.
2510 rejected.add(ref)
2511 continue
2513 accepted.add(ref)
2515 if ref.id in target_records:
2516 # Already have an artifact for this.
2517 already_present.append(ref)
2518 continue
2520 # mypy needs to know these are always resolved refs
2521 for info in source_records[ref.id]:
2522 source_location = info.file_location(source_datastore.locationFactory)
2523 target_location = info.file_location(self.locationFactory)
2524 if source_location == target_location and not source_location.pathInStore.isabs():
2525 # Artifact is already in the target location.
2526 # (which is how execution butler currently runs)
2527 pass
2528 else:
2529 if target_location.pathInStore.isabs():
2530 # Just because we can see the artifact when running
2531 # the transfer doesn't mean it will be generally
2532 # accessible to a user of this butler. Need to decide
2533 # what to do about an absolute path.
2534 if transfer == "auto":
2535 # For "auto" transfers we allow the absolute URI
2536 # to be recorded in the target datastore.
2537 direct_transfers.append(source_location)
2538 else:
2539 # The user is explicitly requesting a transfer
2540 # even for an absolute URI. This requires us to
2541 # calculate the target path.
2542 template_ref = ref
2543 if info.component:
2544 template_ref = ref.makeComponentRef(info.component)
2545 target_location = self._calculate_ingested_datastore_name(
2546 source_location.uri,
2547 template_ref,
2548 )
2550 info = info.update(path=target_location.pathInStore.path)
2552 # Need to transfer it to the new location.
2553 # Assume we should always overwrite. If the artifact
2554 # is there this might indicate that a previous transfer
2555 # was interrupted but was not able to be rolled back
2556 # completely (e.g. pre-emption) so follow the Datastore default
2557 # and overwrite. Do not copy if we are in dry-run mode.
2558 if not dry_run:
2559 target_location.uri.transfer_from(
2560 source_location.uri,
2561 transfer=transfer,
2562 overwrite=True,
2563 transaction=self._transaction,
2564 )
2566 artifacts.append((ref, info))
2568 if direct_transfers:
2569 log.info(
2570 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2571 len(direct_transfers),
2572 "" if len(direct_transfers) == 1 else "s",
2573 )
2575 # We are overwriting previous datasets that may have already
2576 # existed. We therefore should ensure that we force the
2577 # datastore records to agree. Note that this can potentially lead
2578 # to difficulties if the dataset has previously been ingested
2579 # disassembled and is somehow now assembled, or vice versa.
2580 if not dry_run:
2581 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE)
2583 if already_present:
2584 n_skipped = len(already_present)
2585 log.info(
2586 "Skipped transfer of %d dataset%s already present in datastore",
2587 n_skipped,
2588 "" if n_skipped == 1 else "s",
2589 )
2591 return accepted, rejected
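# Illustrative sketch for transfer_from(): hypothetical helper assuming two
# FileDatastore instances (``source`` and ``target``) and ``refs`` known to
# the source. A dry run reports what would happen without copying any files.
def transfer_with_preview(source, target, refs) -> None:
    accepted, rejected = target.transfer_from(source, refs, transfer="copy", dry_run=True)
    print(f"Would transfer {len(accepted)} dataset(s); {len(rejected)} rejected by constraints")
    if accepted:
        target.transfer_from(source, list(accepted), transfer="copy")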
2593 @transactional
2594 def forget(self, refs: Iterable[DatasetRef]) -> None:
2595 # Docstring inherited.
2596 refs = list(refs)
2597 self.bridge.forget(refs)
2598 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2600 def validateConfiguration(
2601 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2602 ) -> None:
2603 """Validate some of the configuration for this datastore.
2605 Parameters
2606 ----------
2607 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2608 Entities to test against this configuration. Can be differing
2609 types.
2610 logFailures : `bool`, optional
2611 If `True`, output a log message for every validation error
2612 detected.
2614 Raises
2615 ------
2616 DatastoreValidationError
2617 Raised if there is a validation problem with a configuration.
2618 All the problems are reported in a single exception.
2620 Notes
2621 -----
2622 This method checks that all the supplied entities have valid file
2623 templates and also have formatters defined.
2624 """
2625 templateFailed = None
2626 try:
2627 self.templates.validateTemplates(entities, logFailures=logFailures)
2628 except FileTemplateValidationError as e:
2629 templateFailed = str(e)
2631 formatterFailed = []
2632 for entity in entities:
2633 try:
2634 self.formatterFactory.getFormatterClass(entity)
2635 except KeyError as e:
2636 formatterFailed.append(str(e))
2637 if logFailures:
2638 log.critical("Formatter failure: %s", e)
2640 if templateFailed or formatterFailed:
2641 messages = []
2642 if templateFailed:
2643 messages.append(templateFailed)
2644 if formatterFailed:
2645 messages.append(",".join(formatterFailed))
2646 msg = ";\n".join(messages)
2647 raise DatastoreValidationError(msg)
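# Illustrative sketch for validateConfiguration(): hypothetical helper
# assuming ``datastore`` and a list of DatasetRef/DatasetType/StorageClass
# ``entities``. Returns the individual validation problems, if any.
from lsst.daf.butler.datastore import DatastoreValidationError


def configuration_problems(datastore, entities) -> list[str]:
    try:
        datastore.validateConfiguration(entities, logFailures=True)
    except DatastoreValidationError as e:
        return str(e).split(";\n")
    return []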
2649 def getLookupKeys(self) -> set[LookupKey]:
2650 # Docstring is inherited from base class
2651 return (
2652 self.templates.getLookupKeys()
2653 | self.formatterFactory.getLookupKeys()
2654 | self.constraints.getLookupKeys()
2655 )
2657 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2658 # Docstring is inherited from base class
2659 # The key can be valid in either formatters or templates so we can
2660 # only check the template if it exists
2661 if lookupKey in self.templates:
2662 try:
2663 self.templates[lookupKey].validateTemplate(entity)
2664 except FileTemplateValidationError as e:
2665 raise DatastoreValidationError(e) from e
2667 def export(
2668 self,
2669 refs: Iterable[DatasetRef],
2670 *,
2671 directory: ResourcePathExpression | None = None,
2672 transfer: str | None = "auto",
2673 ) -> Iterable[FileDataset]:
2674 # Docstring inherited from Datastore.export.
2675 if transfer == "auto" and directory is None:
2676 transfer = None
2678 if transfer is not None and directory is None:
2679 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2681 if transfer == "move":
2682 raise TypeError("Can not export by moving files out of datastore.")
2683 elif transfer == "direct":
2684 # For an export, treat this as equivalent to None. We do not
2685 # want an import to risk using absolute URIs to datasets owned
2686 # by another datastore.
2687 log.info("Treating 'direct' transfer mode as in-place export.")
2688 transfer = None
2690 # Force the directory to be a URI object
2691 directoryUri: ResourcePath | None = None
2692 if directory is not None:
2693 directoryUri = ResourcePath(directory, forceDirectory=True)
2695 if transfer is not None and directoryUri is not None and not directoryUri.exists():
2696 # mypy needs the second test
2697 raise FileNotFoundError(f"Export location {directory} does not exist")
2699 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2700 for ref in progress.wrap(refs, "Exporting dataset files"):
2701 fileLocations = self._get_dataset_locations_info(ref)
2702 if not fileLocations:
2703 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2704 # For now we can not export disassembled datasets
2705 if len(fileLocations) > 1:
2706 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2707 location, storedFileInfo = fileLocations[0]
2709 pathInStore = location.pathInStore.path
2710 if transfer is None:
2711 # TODO: do we also need to return the readStorageClass somehow?
2712 # We will use the path in store directly. If this is an
2713 # absolute URI, preserve it.
2714 if location.pathInStore.isabs():
2715 pathInStore = str(location.uri)
2716 elif transfer == "direct":
2717 # Use full URIs to the remote store in the export
2718 pathInStore = str(location.uri)
2719 else:
2720 # mypy needs help
2721 assert directoryUri is not None, "directoryUri must be defined to get here"
2722 storeUri = ResourcePath(location.uri, forceDirectory=False)
2724 # if the datastore has an absolute URI to a resource, we
2725 # have two options:
2726 # 1. Keep the absolute URI in the exported YAML
2727 # 2. Allocate a new name in the local datastore and transfer
2728 # it.
2729 # For now go with option 2
2730 if location.pathInStore.isabs():
2731 template = self.templates.getTemplate(ref)
2732 newURI = ResourcePath(template.format(ref), forceAbsolute=False, forceDirectory=False)
2733 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2735 exportUri = directoryUri.join(pathInStore)
2736 exportUri.transfer_from(storeUri, transfer=transfer)
2738 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
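# Illustrative sketch for export(): hypothetical helper assuming ``datastore``
# and ``refs``; the destination directory is assumed to exist already. The
# returned FileDataset records can be used for a later import elsewhere.
from lsst.daf.butler import FileDataset
from lsst.resources import ResourcePath


def export_datasets(datastore, refs, directory: str) -> list[FileDataset]:
    destination = ResourcePath(directory, forceDirectory=True)
    return list(datastore.export(refs, directory=destination, transfer="copy"))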
2740 @staticmethod
2741 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2742 """Compute the checksum of the supplied file.
2744 Parameters
2745 ----------
2746 uri : `lsst.resources.ResourcePath`
2747 Name of resource to calculate checksum from.
2748 algorithm : `str`, optional
2749 Name of algorithm to use. Must be one of the algorithms supported
2750 by :py:mod:`hashlib`.
2751 block_size : `int`
2752 Number of bytes to read from file at one time.
2754 Returns
2755 -------
2756 hexdigest : `str`
2757 Hex digest of the file.
2759 Notes
2760 -----
2761 Currently returns None if the URI is for a remote resource.
2762 """
2763 if algorithm not in hashlib.algorithms_guaranteed:
2764 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2766 if not uri.isLocal:
2767 return None
2769 hasher = hashlib.new(algorithm)
2771 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f:
2772 for chunk in iter(lambda: f.read(block_size), b""):
2773 hasher.update(chunk)
2775 return hasher.hexdigest()
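# Illustrative sketch for computeChecksum(): a static helper, so it can be
# called without a datastore instance. Remote URIs currently return None.
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.resources import ResourcePath


def checksum_local_file(path: str) -> str | None:
    return FileDatastore.computeChecksum(ResourcePath(path), algorithm="blake2b")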
2777 def needs_expanded_data_ids(
2778 self,
2779 transfer: str | None,
2780 entity: DatasetRef | DatasetType | StorageClass | None = None,
2781 ) -> bool:
2782 # Docstring inherited.
2783 # This _could_ also use entity to inspect whether the filename template
2784 # involves placeholders other than the required dimensions for its
2785 # dataset type, but that's not necessary for correctness; it just
2786 # enables more optimizations (perhaps only in theory).
2787 return transfer not in ("direct", None)
2789 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2790 # Docstring inherited from the base class.
2791 record_data = data.get(self.name)
2792 if not record_data:
2793 return
2795 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records)
2797 # TODO: Verify that there are no unexpected table names in the dict?
2798 unpacked_records = []
2799 for dataset_id, dataset_data in record_data.records.items():
2800 records = dataset_data.get(self._table.name)
2801 if records:
2802 for info in records:
2803 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2804 unpacked_records.append(info.to_record(dataset_id=dataset_id))
2805 if unpacked_records:
2806 self._table.insert(*unpacked_records, transaction=self._transaction)
2808 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2809 # Docstring inherited from the base class.
2810 exported_refs = list(self._bridge.check(refs))
2811 ids = {ref.id for ref in exported_refs}
2812 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
2813 for row in self._table.fetch(dataset_id=ids):
2814 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2815 dataset_records = records.setdefault(row["dataset_id"], {})
2816 dataset_records.setdefault(self._table.name, []).append(info)
2818 record_data = DatastoreRecordData(records=records)
2819 return {self.name: record_data}
2821 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
2822 # Docstring inherited from the base class.
2823 self._retrieve_dataset_method = method
2825 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
2826 """Update dataset reference to use the storage class from registry."""
2827 if self._retrieve_dataset_method is None:
2828 # We could raise an exception here but unit tests do not define
2829 # this method.
2830 return ref
2831 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
2832 if dataset_type is not None:
2833 ref = ref.overrideStorageClass(dataset_type.storageClass)
2834 return ref
2836 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
2837 # Docstring inherited from the base class.
2838 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}
2841def _to_file_info_payload(
2842 info: DatasetLocationInformation, url_expiration_time_seconds: int
2843) -> FileDatastoreGetPayloadFileInfo:
2844 location, file_info = info
2846 # Make sure that we send only relative paths, to avoid leaking
2847 # details of our configuration to the client.
2848 path = location.pathInStore
2849 if path.isabs():
2850 relative_path = path.relativeToPathRoot
2851 else:
2852 relative_path = str(path)
2854 datastoreRecords = file_info.to_simple()
2855 datastoreRecords.path = relative_path
2857 return FileDatastoreGetPayloadFileInfo(
2858 url=location.uri.generate_presigned_get_url(expiration_time_seconds=url_expiration_time_seconds),
2859 datastoreRecords=datastoreRecords,
2860 )