Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%
1040 statements
coverage.py v7.3.2, created at 2023-12-05 11:07 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Generic file-based datastore code."""
30from __future__ import annotations
32__all__ = ("FileDatastore",)
34import contextlib
35import hashlib
36import logging
37from collections import defaultdict
38from collections.abc import Callable, Iterable, Mapping, Sequence
39from dataclasses import dataclass
40from typing import TYPE_CHECKING, Any, ClassVar, cast
42from lsst.daf.butler import (
43 Config,
44 DatasetId,
45 DatasetRef,
46 DatasetType,
47 DatasetTypeNotSupportedError,
48 FileDataset,
49 FileDescriptor,
50 Formatter,
51 FormatterFactory,
52 Location,
53 LocationFactory,
54 Progress,
55 StorageClass,
56 ddl,
57)
58from lsst.daf.butler.datastore import (
59 DatasetRefURIs,
60 Datastore,
61 DatastoreConfig,
62 DatastoreOpaqueTable,
63 DatastoreValidationError,
64)
65from lsst.daf.butler.datastore.cache_manager import (
66 AbstractDatastoreCacheManager,
67 DatastoreCacheManager,
68 DatastoreDisabledCacheManager,
69)
70from lsst.daf.butler.datastore.composites import CompositesMap
71from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError
72from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore
73from lsst.daf.butler.datastore.record_data import DatastoreRecordData
74from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo
75from lsst.daf.butler.registry.interfaces import (
76 DatabaseInsertMode,
77 DatastoreRegistryBridge,
78 FakeDatasetRef,
79 ReadOnlyDatabaseError,
80)
81from lsst.daf.butler.repo_relocation import replaceRoot
82from lsst.daf.butler.utils import transactional
83from lsst.resources import ResourcePath, ResourcePathExpression
84from lsst.utils.introspection import get_class_of, get_instance_of
85from lsst.utils.iteration import chunk_iterable
87# For VERBOSE logging usage.
88from lsst.utils.logging import VERBOSE, getLogger
89from lsst.utils.timer import time_this
90from sqlalchemy import BigInteger, String
92if TYPE_CHECKING:
93 from lsst.daf.butler import LookupKey
94 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
96log = getLogger(__name__)
99class _IngestPrepData(Datastore.IngestPrepData):
100 """Helper class for FileDatastore ingest implementation.
102 Parameters
103 ----------
104 datasets : `~collections.abc.Iterable` of `FileDataset`
105 Files to be ingested by this datastore.
106 """
108 def __init__(self, datasets: Iterable[FileDataset]):
109 super().__init__(ref for dataset in datasets for ref in dataset.refs)
110 self.datasets = datasets
113@dataclass(frozen=True)
114class DatastoreFileGetInformation:
115 """Collection of useful parameters needed to retrieve a file from
116 a Datastore.
117 """
119 location: Location
120 """The location from which to read the dataset."""
122 formatter: Formatter
123 """The `Formatter` to use to deserialize the dataset."""
125 info: StoredFileInfo
126 """Stored information about this file and its formatter."""
128 assemblerParams: Mapping[str, Any]
129 """Parameters to use for post-processing the retrieved dataset."""
131 formatterParams: Mapping[str, Any]
132 """Parameters that were understood by the associated formatter."""
134 component: str | None
135 """The component to be retrieved (can be `None`)."""
137 readStorageClass: StorageClass
138 """The `StorageClass` of the dataset being read."""
141class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
142 """Generic Datastore for file-based implementations.
144 Should always be sub-classed since key abstract methods are missing.
146 Parameters
147 ----------
148 config : `DatastoreConfig` or `str`
149 Configuration as either a `Config` object or URI to file.
150 bridgeManager : `DatastoreRegistryBridgeManager`
151 Object that manages the interface between `Registry` and datastores.
152 butlerRoot : `str`, optional
153 New datastore root to use to override the configuration value.
155 Raises
156 ------
157 ValueError
158 If root location does not exist and ``create`` is `False` in the
159 configuration.
160 """
162 defaultConfigFile: ClassVar[str | None] = None
163 """Path to configuration defaults. Accessed within the ``config`` resource
164 or relative to a search path. Can be None if no defaults specified.
165 """
167 root: ResourcePath
168 """Root directory URI of this `Datastore`."""
170 locationFactory: LocationFactory
171 """Factory for creating locations relative to the datastore root."""
173 formatterFactory: FormatterFactory
174 """Factory for creating instances of formatters."""
176 templates: FileTemplates
177 """File templates that can be used by this `Datastore`."""
179 composites: CompositesMap
180 """Determines whether a dataset should be disassembled on put."""
182 defaultConfigFile = "datastores/fileDatastore.yaml"
183 """Path to configuration defaults. Accessed within the ``config`` resource
184 or relative to a search path. Can be None if no defaults specified.
185 """
187 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
188 """Callable that is used in trusted mode to retrieve registry definition
189 of a named dataset type.
190 """
192 @classmethod
193 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
194 """Set any filesystem-dependent config options for this Datastore to
195 be appropriate for a new empty repository with the given root.
197 Parameters
198 ----------
199 root : `str`
200 URI to the root of the data repository.
201 config : `Config`
202 A `Config` to update. Only the subset understood by
203 this component will be updated. Will not expand
204 defaults.
205 full : `Config`
206 A complete config with all defaults expanded that can be
207 converted to a `DatastoreConfig`. Read-only and will not be
208 modified by this method.
209 Repository-specific options that should not be obtained
210 from defaults when Butler instances are constructed
211 should be copied from ``full`` to ``config``.
212 overwrite : `bool`, optional
213 If `False`, do not modify a value in ``config`` if the value
214 already exists. Default is always to overwrite with the provided
215 ``root``.
217 Notes
218 -----
219 If a keyword is explicitly defined in the supplied ``config`` it
220 will not be overridden by this method if ``overwrite`` is `False`.
221 This allows explicit values set in external configs to be retained.
222 """
223 Config.updateParameters(
224 DatastoreConfig,
225 config,
226 full,
227 toUpdate={"root": root},
228 toCopy=("cls", ("records", "table")),
229 overwrite=overwrite,
230 )
232 @classmethod
233 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
234 return ddl.TableSpec(
235 fields=[
236 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
237 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
238 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
239 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
240 # Use empty string to indicate no component
241 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
242 # TODO: should checksum be Base64Bytes instead?
243 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
244 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
245 ],
246 unique=frozenset(),
247 indexes=[ddl.IndexSpec("path")],
248 )
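# Illustrative only: a record row of the kind described by the table spec
# above, with hypothetical values (UUID, path, formatter) that are not taken
# from any real repository. The path is stored relative to the datastore root
# unless the dataset was ingested with an absolute URI.
example_record = {
    "dataset_id": "2f4c9a36-1a6b-5d9e-8c11-0f3a7e1b2c3d",  # hypothetical UUID
    "path": "raw/r/exposure_000001.fits",                  # hypothetical templated path
    "formatter": "lsst.obs.base.formatters.fitsExposure.FitsExposureFormatter",  # assumed
    "storage_class": "ExposureF",
    "component": "",       # empty string indicates "no component"
    "checksum": None,      # only populated when checksums are enabled
    "file_size": 16777280,
}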
250 def __init__(
251 self,
252 config: DatastoreConfig | ResourcePathExpression,
253 bridgeManager: DatastoreRegistryBridgeManager,
254 butlerRoot: str | None = None,
255 ):
256 super().__init__(config, bridgeManager)
257 if "root" not in self.config:
258 raise ValueError("No root directory specified in configuration")
260 # Name ourselves either using an explicit name or a name
261 # derived from the (unexpanded) root
262 if "name" in self.config:
263 self.name = self.config["name"]
264 else:
265 # We use the unexpanded root in the name to indicate that this
266 # datastore can be moved without having to update registry.
267 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
269 # Support repository relocation in config
270 # Existence of self.root is checked in subclass
271 self.root = ResourcePath(
272 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
273 )
275 self.locationFactory = LocationFactory(self.root)
276 self.formatterFactory = FormatterFactory()
278 # Now associate formatters with storage classes
279 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
281 # Read the file naming templates
282 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
284 # See if composites should be disassembled
285 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
287 self._opaque_table_name = self.config["records", "table"]
288 try:
289 # Storage of paths and formatters, keyed by dataset_id
290 self._table = bridgeManager.opaque.register(
291 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType)
292 )
293 # Interface to Registry.
294 self._bridge = bridgeManager.register(self.name)
295 except ReadOnlyDatabaseError:
296 # If the database is read only and we just tried and failed to
297 # create a table, it means someone is trying to create a read-only
298 # butler client for an empty repo. That should be okay, as long
299 # as they then try to get any datasets before some other client
300 # creates the table. Chances are they're just validating
301 # configuration.
302 pass
304 # Determine whether checksums should be used - default to False
305 self.useChecksum = self.config.get("checksum", False)
307 # Determine whether we can fall back to configuration if a
308 # requested dataset is not known to registry
309 self.trustGetRequest = self.config.get("trust_get_request", False)
311 # Create a cache manager
312 self.cacheManager: AbstractDatastoreCacheManager
313 if "cached" in self.config:
314 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
315 else:
316 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
318 # Check existence and create directory structure if necessary
319 if not self.root.exists():
320 if "create" not in self.config or not self.config["create"]:
321 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
322 try:
323 self.root.mkdir()
324 except Exception as e:
325 raise ValueError(
326 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
327 ) from e
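# A minimal sketch of the configuration keys that __init__ above reads; the
# key names come from the code, the values are illustrative only.
example_config = {
    "root": "/repo/main",                 # required; may contain a relocatable token
    "name": "FileDatastore@/repo/main",   # optional explicit datastore name
    "create": True,                       # allow the root directory to be created
    "records": {"table": "file_datastore_records"},  # opaque table name (illustrative)
    "formatters": {},                     # storage class / dataset type -> formatter map
    "templates": {},                      # file naming templates
    "composites": {},                     # disassembly rules
    "checksum": False,                    # compute checksums on write/ingest
    "trust_get_request": False,           # fall back to config when registry has no record
    "cached": {},                         # presence enables DatastoreCacheManager
}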
329 def __str__(self) -> str:
330 return str(self.root)
332 @property
333 def bridge(self) -> DatastoreRegistryBridge:
334 return self._bridge
336 @property
337 def roots(self) -> dict[str, ResourcePath | None]:
338 # Docstring inherited.
339 return {self.name: self.root}
341 def _artifact_exists(self, location: Location) -> bool:
342 """Check that an artifact exists in this datastore at the specified
343 location.
345 Parameters
346 ----------
347 location : `Location`
348 Expected location of the artifact associated with this datastore.
350 Returns
351 -------
352 exists : `bool`
353 `True` if the location can be found, `False` otherwise.
354 """
355 log.debug("Checking if resource exists: %s", location.uri)
356 return location.uri.exists()
358 def _delete_artifact(self, location: Location) -> None:
359 """Delete the artifact from the datastore.
361 Parameters
362 ----------
363 location : `Location`
364 Location of the artifact associated with this datastore.
365 """
366 if location.pathInStore.isabs():
367 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
369 try:
370 location.uri.remove()
371 except FileNotFoundError:
372 log.debug("File %s did not exist and so could not be deleted.", location.uri)
373 raise
374 except Exception as e:
375 log.critical("Failed to delete file: %s (%s)", location.uri, e)
376 raise
377 log.debug("Successfully deleted file: %s", location.uri)
379 def addStoredItemInfo(
380 self,
381 refs: Iterable[DatasetRef],
382 infos: Iterable[StoredFileInfo],
383 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
384 ) -> None:
385 """Record internal storage information associated with one or more
386 datasets.
388 Parameters
389 ----------
390 refs : sequence of `DatasetRef`
391 The datasets that have been stored.
392 infos : sequence of `StoredDatastoreItemInfo`
393 Metadata associated with the stored datasets.
394 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`
395 Mode to use to insert the new records into the table. The
396 options are ``INSERT`` (error if pre-existing), ``REPLACE``
397 (replace content with new values), and ``ENSURE`` (skip if the row
398 already exists).
399 """
400 records = [
401 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True)
402 ]
403 match insert_mode:
404 case DatabaseInsertMode.INSERT:
405 self._table.insert(*records, transaction=self._transaction)
406 case DatabaseInsertMode.ENSURE:
407 self._table.ensure(*records, transaction=self._transaction)
408 case DatabaseInsertMode.REPLACE:
409 self._table.replace(*records, transaction=self._transaction)
410 case _:
411 raise ValueError(f"Unknown insert mode of '{insert_mode}'")
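# A stdlib-only sketch of the three insert modes dispatched above, using a
# plain dict keyed by (dataset_id, component) in place of the opaque table.
def _apply_insert_mode(table: dict, key: tuple, value: dict, mode: str) -> None:
    if mode == "insert":
        if key in table:                      # INSERT: error if pre-existing
            raise ValueError(f"Row {key} already exists")
        table[key] = value
    elif mode == "ensure":
        table.setdefault(key, value)          # ENSURE: keep the existing row
    elif mode == "replace":
        table[key] = value                    # REPLACE: overwrite the content
    else:
        raise ValueError(f"Unknown insert mode of '{mode}'")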
413 def getStoredItemsInfo(
414 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
415 ) -> list[StoredFileInfo]:
416 """Retrieve information associated with files stored in this
417 `Datastore` associated with this dataset ref.
419 Parameters
420 ----------
421 ref : `DatasetRef`
422 The dataset that is to be queried.
423 ignore_datastore_records : `bool`
424 If `True` then do not use datastore records stored in refs.
426 Returns
427 -------
428 items : `~collections.abc.Iterable` [`StoredDatastoreItemInfo`]
429 Stored information about the files and associated formatters
430 associated with this dataset. Only one file will be returned
431 if the dataset has not been disassembled. Can return an empty
432 list if no matching datasets can be found.
433 """
434 # Try to get them from the ref first.
435 if ref._datastore_records is not None and not ignore_datastore_records:
436 if (ref_records := ref._datastore_records.get(self._table.name)) is not None:
437 # Need to make sure they have correct type.
438 for record in ref_records:
439 if not isinstance(record, StoredFileInfo):
440 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}")
441 return cast(list[StoredFileInfo], ref_records)
443 # Look for the dataset_id -- there might be multiple matches
444 # if we have disassembled the dataset.
445 records = self._table.fetch(dataset_id=ref.id)
446 return [StoredFileInfo.from_record(record) for record in records]
448 def _register_datasets(
449 self,
450 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]],
451 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
452 ) -> None:
453 """Update registry to indicate that one or more datasets have been
454 stored.
456 Parameters
457 ----------
458 refsAndInfos : sequence of `tuple` [`DatasetRef`,
459 `StoredDatastoreItemInfo`]
460 Datasets to register and the internal datastore metadata associated
461 with them.
462 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`, optional
463 Indicate whether the new records should be new ("insert", default),
464 allowed to exist already ("ensure"), or replaced if already
465 present ("replace").
466 """
467 expandedRefs: list[DatasetRef] = []
468 expandedItemInfos: list[StoredFileInfo] = []
470 for ref, itemInfo in refsAndInfos:
471 expandedRefs.append(ref)
472 expandedItemInfos.append(itemInfo)
474 # Dataset location only cares about registry ID so if we have
475 # disassembled in datastore we have to deduplicate. Since they
476 # will have different datasetTypes we can't use a set
477 registryRefs = {r.id: r for r in expandedRefs}
478 if insert_mode == DatabaseInsertMode.INSERT:
479 self.bridge.insert(registryRefs.values())
480 else:
481 # There are only two columns and all that matters is the
482 # dataset ID.
483 self.bridge.ensure(registryRefs.values())
484 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode)
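# Sketch of the deduplication above: components of a disassembled dataset share
# a dataset ID, and the registry bridge needs only one entry per ID.
refs = [("id-1", "exp.image"), ("id-1", "exp.mask"), ("id-2", "flat")]  # hypothetical
registry_refs = {dataset_id: name for dataset_id, name in refs}
assert list(registry_refs) == ["id-1", "id-2"]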
486 def _get_stored_records_associated_with_refs(
487 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False
488 ) -> dict[DatasetId, list[StoredFileInfo]]:
489 """Retrieve all records associated with the provided refs.
491 Parameters
492 ----------
493 refs : iterable of `DatasetIdRef`
494 The refs for which records are to be retrieved.
495 ignore_datastore_records : `bool`
496 If `True` then do not use datastore records stored in refs.
498 Returns
499 -------
500 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
501 The matching records indexed by the ref ID. The number of entries
502 in the dict can be smaller than the number of requested refs.
503 """
504 # Check datastore records in refs first.
505 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list)
506 refs_with_no_records = []
507 for ref in refs:
508 if ignore_datastore_records or ref._datastore_records is None:
509 refs_with_no_records.append(ref)
510 else:
511 if (ref_records := ref._datastore_records.get(self._table.name)) is not None:
512 # Need to make sure they have correct type.
513 for ref_record in ref_records:
514 if not isinstance(ref_record, StoredFileInfo):
515 raise TypeError(
516 f"Datastore record has unexpected type {ref_record.__class__.__name__}"
517 )
518 records_by_ref[ref.id].append(ref_record)
520 # If there were any refs without datastore records, check opaque table.
521 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records])
523 # Uniqueness is dataset_id + component so can have multiple records
524 # per ref.
525 for record in records:
526 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
527 return records_by_ref
529 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
530 """Return paths and associated dataset refs.
532 Parameters
533 ----------
534 paths : `list` of `str` or `lsst.resources.ResourcePath`
535 All the paths to include in search.
537 Returns
538 -------
539 mapping : `dict` of [`str`, `set` [`DatasetId`]]
540 Mapping of each path to a set of associated database IDs.
541 """
542 records = self._table.fetch(path=[str(path) for path in paths])
543 result = defaultdict(set)
544 for row in records:
545 result[row["path"]].add(row["dataset_id"])
546 return result
548 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
549 """Return all dataset refs associated with the supplied path.
551 Parameters
552 ----------
553 pathInStore : `lsst.resources.ResourcePath`
554 Path of interest in the data store.
556 Returns
557 -------
558 ids : `set` [`DatasetId`]
559 All `DatasetRef` IDs associated with this path.
560 """
561 records = list(self._table.fetch(path=str(pathInStore)))
562 ids = {r["dataset_id"] for r in records}
563 return ids
565 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
566 """Remove information about the file associated with this dataset.
568 Parameters
569 ----------
570 ref : `DatasetRef`
571 The dataset that has been removed.
572 """
573 # Note that this method is actually not used by this implementation,
574 # we depend on bridge to delete opaque records. But there are some
575 # tests that check that this method works, so we keep it for now.
576 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
578 def _get_dataset_locations_info(
579 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
580 ) -> list[tuple[Location, StoredFileInfo]]:
581 r"""Find all the `Location`\ s of the requested dataset in the
582 `Datastore` and the associated stored file information.
584 Parameters
585 ----------
586 ref : `DatasetRef`
587 Reference to the required `Dataset`.
588 ignore_datastore_records : `bool`
589 If `True` then do not use datastore records stored in refs.
591 Returns
592 -------
593 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
594 Location of the dataset within the datastore and
595 stored information about each file and its formatter.
596 """
597 # Get the file information (this will fail if no file)
598 records = self.getStoredItemsInfo(ref, ignore_datastore_records)
600 # Use the path to determine the location -- we need to take
601 # into account absolute URIs in the datastore record
602 return [(r.file_location(self.locationFactory), r) for r in records]
604 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
605 """Check that there is only one dataset associated with the
606 specified artifact.
608 Parameters
609 ----------
610 ref : `DatasetRef` or `FakeDatasetRef`
611 Dataset to be removed.
612 location : `Location`
613 The location of the artifact to be removed.
615 Returns
616 -------
617 can_remove : `bool`
618 `True` if the artifact can be safely removed.
619 """
620 # Can't ever delete absolute URIs.
621 if location.pathInStore.isabs():
622 return False
624 # Get all entries associated with this path
625 allRefs = self._registered_refs_per_artifact(location.pathInStore)
626 if not allRefs:
627 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
629 # Remove these refs from all the refs and if there is nothing left
630 # then we can delete
631 remainingRefs = allRefs - {ref.id}
633 if remainingRefs:
634 return False
635 return True
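# Sketch of the safety check above: an artifact may only be deleted when no
# other registered dataset still references the same path in the store.
all_ids = {"id-1", "id-2"}            # hypothetical IDs registered for one path
remaining = all_ids - {"id-1"}        # remove the ref being deleted
assert bool(remaining) is True        # "id-2" remains, so the file must be kept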
637 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
638 """Predict the location and related file information of the requested
639 dataset in this datastore.
641 Parameters
642 ----------
643 ref : `DatasetRef`
644 Reference to the required `Dataset`.
646 Returns
647 -------
648 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
649 Expected Location of the dataset within the datastore and
650 placeholder information about each file and its formatter.
652 Notes
653 -----
654 Uses the current configuration to determine how we would expect the
655 datastore files to have been written if we couldn't ask registry.
656 This is safe so long as there has been no change to datastore
657 configuration between writing the dataset and wanting to read it.
658 Will not work for files that have been ingested without using the
659 standard file template or default formatter.
660 """
661 # If we have a component ref we always need to ask the questions
662 # of the composite. If the composite is disassembled this routine
663 # should return all components. If the composite was not
664 # disassembled the composite is what is stored regardless of
665 # component request. Note that if the caller has disassembled
666 # a composite there is no way for this guess to know that
667 # without trying both the composite and component ref and seeing
668 # if there is something at the component Location even without
669 # disassembly being enabled.
670 if ref.datasetType.isComponent():
671 ref = ref.makeCompositeRef()
673 # See if the ref is a composite that should be disassembled
674 doDisassembly = self.composites.shouldBeDisassembled(ref)
676 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
678 if doDisassembly:
679 for component, componentStorage in ref.datasetType.storageClass.components.items():
680 compRef = ref.makeComponentRef(component)
681 location, formatter = self._determine_put_formatter_location(compRef)
682 all_info.append((location, formatter, componentStorage, component))
684 else:
685 # Always use the composite ref if no disassembly
686 location, formatter = self._determine_put_formatter_location(ref)
687 all_info.append((location, formatter, ref.datasetType.storageClass, None))
689 # Convert the list of tuples to have StoredFileInfo as second element
690 return [
691 (
692 location,
693 StoredFileInfo(
694 formatter=formatter,
695 path=location.pathInStore.path,
696 storageClass=storageClass,
697 component=component,
698 checksum=None,
699 file_size=-1,
700 ),
701 )
702 for location, formatter, storageClass, component in all_info
703 ]
705 def _prepare_for_get(
706 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
707 ) -> list[DatastoreFileGetInformation]:
708 """Check parameters for ``get`` and obtain formatter and
709 location.
711 Parameters
712 ----------
713 ref : `DatasetRef`
714 Reference to the required Dataset.
715 parameters : `dict`
716 `StorageClass`-specific parameters that specify, for example,
717 a slice of the dataset to be loaded.
719 Returns
720 -------
721 getInfo : `list` [`DatastoreFileGetInformation`]
722 Parameters needed to retrieve each file.
723 """
724 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
726 # The storage class we want to use eventually
727 refStorageClass = ref.datasetType.storageClass
729 # For trusted mode need to reset storage class.
730 ref = self._cast_storage_class(ref)
732 # Get file metadata and internal metadata
733 fileLocations = self._get_dataset_locations_info(ref)
734 if not fileLocations:
735 if not self.trustGetRequest:
736 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
737 # Assume the dataset is where we think it should be
738 fileLocations = self._get_expected_dataset_locations_info(ref)
740 if len(fileLocations) > 1:
741 disassembled = True
743 # If trust is involved it is possible that there will be
744 # components listed here that do not exist in the datastore.
745 # Explicitly check for file artifact existence and filter out any
746 # that are missing.
747 if self.trustGetRequest:
748 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
750 # For now complain only if we have no components at all. One
751 # component is probably a problem but we can punt that to the
752 # assembler.
753 if not fileLocations:
754 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
756 else:
757 disassembled = False
759 # Is this a component request?
760 refComponent = ref.datasetType.component()
762 fileGetInfo = []
763 for location, storedFileInfo in fileLocations:
764 # The storage class used to write the file
765 writeStorageClass = storedFileInfo.storageClass
767 # If this has been disassembled we need read to match the write
768 if disassembled:
769 readStorageClass = writeStorageClass
770 else:
771 readStorageClass = refStorageClass
773 formatter = get_instance_of(
774 storedFileInfo.formatter,
775 FileDescriptor(
776 location,
777 readStorageClass=readStorageClass,
778 storageClass=writeStorageClass,
779 parameters=parameters,
780 ),
781 ref.dataId,
782 )
784 formatterParams, notFormatterParams = formatter.segregateParameters()
786 # Of the remaining parameters, extract the ones supported by
787 # this StorageClass (for components not all will be handled)
788 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
790 # The ref itself could be a component if the dataset was
791 # disassembled by butler, or we disassembled in datastore and
792 # components came from the datastore records
793 component = storedFileInfo.component if storedFileInfo.component else refComponent
795 fileGetInfo.append(
796 DatastoreFileGetInformation(
797 location,
798 formatter,
799 storedFileInfo,
800 assemblerParams,
801 formatterParams,
802 component,
803 readStorageClass,
804 )
805 )
807 return fileGetInfo
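# Sketch of the storage-class choice made in the loop above: components of a
# disassembled composite are read back with the storage class they were written
# with, while a normal get honours the storage class of the requesting ref.
def _read_storage_class(disassembled: bool, write_sc: str, request_sc: str) -> str:
    return write_sc if disassembled else request_sc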
809 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
810 """Check the arguments for ``put`` and obtain formatter and
811 location.
813 Parameters
814 ----------
815 inMemoryDataset : `object`
816 The dataset to store.
817 ref : `DatasetRef`
818 Reference to the associated Dataset.
820 Returns
821 -------
822 location : `Location`
823 The location to write the dataset.
824 formatter : `Formatter`
825 The `Formatter` to use to write the dataset.
827 Raises
828 ------
829 TypeError
830 Supplied object and storage class are inconsistent.
831 DatasetTypeNotSupportedError
832 The associated `DatasetType` is not handled by this datastore.
833 """
834 self._validate_put_parameters(inMemoryDataset, ref)
835 return self._determine_put_formatter_location(ref)
837 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
838 """Calculate the formatter and output location to use for put.
840 Parameters
841 ----------
842 ref : `DatasetRef`
843 Reference to the associated Dataset.
845 Returns
846 -------
847 location : `Location`
848 The location to write the dataset.
849 formatter : `Formatter`
850 The `Formatter` to use to write the dataset.
851 """
852 # Work out output file name
853 try:
854 template = self.templates.getTemplate(ref)
855 except KeyError as e:
856 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
858 # Validate the template to protect against filenames from different
859 # dataIds returning the same and causing overwrite confusion.
860 template.validateTemplate(ref)
862 location = self.locationFactory.fromPath(template.format(ref))
864 # Get the formatter based on the storage class
865 storageClass = ref.datasetType.storageClass
866 try:
867 formatter = self.formatterFactory.getFormatter(
868 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
869 )
870 except KeyError as e:
871 raise DatasetTypeNotSupportedError(
872 f"Unable to find formatter for {ref} in datastore {self.name}"
873 ) from e
875 # Now that we know the formatter, update the location
876 location = formatter.makeUpdatedLocation(location)
878 return location, formatter
880 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
881 # Docstring inherited from base class
882 if transfer != "auto":
883 return transfer
885 # See if the paths are within the datastore or not
886 inside = [self._pathInStore(d.path) is not None for d in datasets]
888 if all(inside):
889 transfer = None
890 elif not any(inside):
891 # Allow ResourcePath to use its own knowledge
892 transfer = "auto"
893 else:
894 # This can happen when importing from a datastore that
895 # has had some datasets ingested using "direct" mode.
896 # Also allow ResourcePath to sort it out but warn about it.
899 log.warning(
900 "Some datasets are inside the datastore and some are outside. Using 'split' "
901 "transfer mode. This assumes that the files outside the datastore are "
902 "still accessible to the new butler since they will not be copied into "
903 "the target datastore."
904 )
905 transfer = "split"
907 return transfer
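# Stdlib-only sketch of the "auto" resolution above: all paths inside the
# datastore -> ingest in place (None); all outside -> let ResourcePath decide
# ("auto"); a mixture -> "split".
def _resolve_auto(inside_flags: list[bool]) -> str | None:
    if all(inside_flags):
        return None
    if not any(inside_flags):
        return "auto"
    return "split"

assert _resolve_auto([True, True]) is None
assert _resolve_auto([False, False]) == "auto"
assert _resolve_auto([True, False]) == "split"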
909 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
910 """Return path relative to datastore root.
912 Parameters
913 ----------
914 path : `lsst.resources.ResourcePathExpression`
915 Path to dataset. Can be an absolute URI. If relative, it is
916 assumed to be relative to the datastore root.
919 Returns
920 -------
921 inStore : `str` or `None`
922 Path relative to datastore root. Returns `None` if the file is
923 outside the root.
924 """
925 # Relative path will always be relative to datastore
926 pathUri = ResourcePath(path, forceAbsolute=False)
927 return pathUri.relative_to(self.root)
929 def _standardizeIngestPath(
930 self, path: str | ResourcePath, *, transfer: str | None = None
931 ) -> str | ResourcePath:
932 """Standardize the path of a to-be-ingested file.
934 Parameters
935 ----------
936 path : `str` or `lsst.resources.ResourcePath`
937 Path of a file to be ingested. This parameter is not expected
938 to accept all of the types that can be used to construct a
939 `~lsst.resources.ResourcePath`.
940 transfer : `str`, optional
941 How (and whether) the dataset should be added to the datastore.
942 See `ingest` for details of transfer modes.
943 This implementation is provided only so
944 `NotImplementedError` can be raised if the mode is not supported;
945 actual transfers are deferred to `_extractIngestInfo`.
947 Returns
948 -------
949 path : `str` or `lsst.resources.ResourcePath`
950 New path in what the datastore considers standard form. If an
951 absolute URI was given that will be returned unchanged.
953 Notes
954 -----
955 Subclasses of `FileDatastore` can implement this method instead
956 of `_prepIngest`. It should not modify the data repository or given
957 file in any way.
959 Raises
960 ------
961 NotImplementedError
962 Raised if the datastore does not support the given transfer mode
963 (including the case where ingest is not supported at all).
964 FileNotFoundError
965 Raised if one of the given files does not exist.
966 """
967 if transfer not in (None, "direct", "split") + self.root.transferModes:
968 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
970 # A relative URI indicates relative to datastore root
971 srcUri = ResourcePath(path, forceAbsolute=False)
972 if not srcUri.isabs():
973 srcUri = self.root.join(path)
975 if not srcUri.exists():
976 raise FileNotFoundError(
977 f"Resource at {srcUri} does not exist; note that paths to ingest "
978 f"are assumed to be relative to {self.root} unless they are absolute."
979 )
981 if transfer is None:
982 relpath = srcUri.relative_to(self.root)
983 if not relpath:
984 raise RuntimeError(
985 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
986 )
988 # Return the relative path within the datastore for internal
989 # transfer
990 path = relpath
992 return path
994 def _extractIngestInfo(
995 self,
996 path: ResourcePathExpression,
997 ref: DatasetRef,
998 *,
999 formatter: Formatter | type[Formatter],
1000 transfer: str | None = None,
1001 record_validation_info: bool = True,
1002 ) -> StoredFileInfo:
1003 """Relocate (if necessary) and extract `StoredFileInfo` from a
1004 to-be-ingested file.
1006 Parameters
1007 ----------
1008 path : `lsst.resources.ResourcePathExpression`
1009 URI or path of a file to be ingested.
1010 ref : `DatasetRef`
1011 Reference for the dataset being ingested. Guaranteed to have
1012 a ``dataset_id`` that is not `None`.
1013 formatter : `type` or `Formatter`
1014 `Formatter` subclass to use for this dataset or an instance.
1015 transfer : `str`, optional
1016 How (and whether) the dataset should be added to the datastore.
1017 See `ingest` for details of transfer modes.
1018 record_validation_info : `bool`, optional
1019 If `True`, the default, the datastore can record validation
1020 information associated with the file. If `False` the datastore
1021 will not attempt to track any information such as checksums
1022 or file sizes. This can be useful if such information is tracked
1023 in an external system or if the file is to be compressed in place.
1024 It is up to the datastore whether this parameter is relevant.
1026 Returns
1027 -------
1028 info : `StoredFileInfo`
1029 Internal datastore record for this file. This will be inserted by
1030 the caller; `_extractIngestInfo` is only responsible for
1031 creating and populating the struct.
1033 Raises
1034 ------
1035 FileNotFoundError
1036 Raised if one of the given files does not exist.
1037 FileExistsError
1038 Raised if transfer is not `None` but the (internal) location the
1039 file would be moved to is already occupied.
1040 """
1041 if self._transaction is None:
1042 raise RuntimeError("Ingest called without transaction enabled")
1044 # Create URI of the source path, do not need to force a relative
1045 # path to absolute.
1046 srcUri = ResourcePath(path, forceAbsolute=False)
1048 # Track whether we have read the size of the source yet
1049 have_sized = False
1051 tgtLocation: Location | None
1052 if transfer is None or transfer == "split":
1053 # A relative path is assumed to be relative to the datastore
1054 # in this context
1055 if not srcUri.isabs():
1056 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
1057 else:
1058 # Work out the path in the datastore from an absolute URI
1059 # This is required to be within the datastore.
1060 pathInStore = srcUri.relative_to(self.root)
1061 if pathInStore is None and transfer is None:
1062 raise RuntimeError(
1063 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
1064 )
1065 if pathInStore:
1066 tgtLocation = self.locationFactory.fromPath(pathInStore)
1067 elif transfer == "split":
1068 # Outside the datastore but treat that as a direct ingest
1069 # instead.
1070 tgtLocation = None
1071 else:
1072 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
1073 elif transfer == "direct":
1074 # Want to store the full URI to the resource directly in
1075 # datastore. This is useful for referring to permanent archive
1076 # storage for raw data.
1077 # Trust that people know what they are doing.
1078 tgtLocation = None
1079 else:
1080 # Work out the name we want this ingested file to have
1081 # inside the datastore
1082 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
1083 if not tgtLocation.uri.dirname().exists():
1084 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
1085 tgtLocation.uri.dirname().mkdir()
1087 # if we are transferring from a local file to a remote location
1088 # it may be more efficient to get the size and checksum of the
1089 # local file rather than the transferred one
1090 if record_validation_info and srcUri.isLocal:
1091 size = srcUri.size()
1092 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
1093 have_sized = True
1095 # Transfer the resource to the destination.
1096 # Allow overwrite of an existing file. This matches the behavior
1097 # of datastore.put() in that it trusts that registry would not
1098 # be asking to overwrite unless registry thought that the
1099 # overwrite was allowed.
1100 tgtLocation.uri.transfer_from(
1101 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
1102 )
1104 if tgtLocation is None:
1105 # This means we are using direct mode
1106 targetUri = srcUri
1107 targetPath = str(srcUri)
1108 else:
1109 targetUri = tgtLocation.uri
1110 targetPath = tgtLocation.pathInStore.path
1112 # the file should exist in the datastore now
1113 if record_validation_info:
1114 if not have_sized:
1115 size = targetUri.size()
1116 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
1117 else:
1118 # Not recording any file information.
1119 size = -1
1120 checksum = None
1122 return StoredFileInfo(
1123 formatter=formatter,
1124 path=targetPath,
1125 storageClass=ref.datasetType.storageClass,
1126 component=ref.datasetType.component(),
1127 file_size=size,
1128 checksum=checksum,
1129 )
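# Simplified sketch of the target-location decision above: "direct" keeps the
# absolute source URI, None/"split" reuse a path already inside the root
# ("split" also tolerates files outside it), and any other transfer mode
# computes a new templated name inside the datastore.
def _target_kind(transfer: str | None, inside_root: bool) -> str:
    if transfer == "direct":
        return "absolute source URI"
    if transfer in (None, "split"):
        if inside_root:
            return "existing path inside the datastore root"
        return "absolute source URI" if transfer == "split" else "error: outside root"
    return "new templated path inside the datastore root"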
1131 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
1132 # Docstring inherited from Datastore._prepIngest.
1133 filtered = []
1134 for dataset in datasets:
1135 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1136 if not acceptable:
1137 continue
1138 else:
1139 dataset.refs = acceptable
1140 if dataset.formatter is None:
1141 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1142 else:
1143 assert isinstance(dataset.formatter, type | str)
1144 formatter_class = get_class_of(dataset.formatter)
1145 if not issubclass(formatter_class, Formatter):
1146 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1147 dataset.formatter = formatter_class
1148 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1149 filtered.append(dataset)
1150 return _IngestPrepData(filtered)
1152 @transactional
1153 def _finishIngest(
1154 self,
1155 prepData: Datastore.IngestPrepData,
1156 *,
1157 transfer: str | None = None,
1158 record_validation_info: bool = True,
1159 ) -> None:
1160 # Docstring inherited from Datastore._finishIngest.
1161 refsAndInfos = []
1162 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1163 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1164 # Do ingest as if the first dataset ref is associated with the file
1165 info = self._extractIngestInfo(
1166 dataset.path,
1167 dataset.refs[0],
1168 formatter=dataset.formatter,
1169 transfer=transfer,
1170 record_validation_info=record_validation_info,
1171 )
1172 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1174 # In direct mode we can allow repeated ingests of the same thing
1175 # if we are sure that the external dataset is immutable. We use
1176 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are
1177 # separated.
1178 refs_and_infos_replace = []
1179 refs_and_infos_insert = []
1180 if transfer == "direct":
1181 for entry in refsAndInfos:
1182 if entry[0].id.version == 5:
1183 refs_and_infos_replace.append(entry)
1184 else:
1185 refs_and_infos_insert.append(entry)
1186 else:
1187 refs_and_infos_insert = refsAndInfos
1189 if refs_and_infos_insert:
1190 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT)
1191 if refs_and_infos_replace:
1192 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE)
1194 def _calculate_ingested_datastore_name(
1195 self,
1196 srcUri: ResourcePath,
1197 ref: DatasetRef,
1198 formatter: Formatter | type[Formatter] | None = None,
1199 ) -> Location:
1200 """Given a source URI and a DatasetRef, determine the name the
1201 dataset will have inside datastore.
1203 Parameters
1204 ----------
1205 srcUri : `lsst.resources.ResourcePath`
1206 URI to the source dataset file.
1207 ref : `DatasetRef`
1208 Ref associated with the newly-ingested dataset artifact. This
1209 is used to determine the name within the datastore.
1210 formatter : `Formatter` or `type` [`Formatter`], optional
1211 Formatter to use for validation. Can be a class or an instance.
1212 No validation of the file extension is performed if the
1213 ``formatter`` is `None`. This can be used if the caller knows
1214 that the source URI and target URI will use the same formatter.
1216 Returns
1217 -------
1218 location : `Location`
1219 Target location for the newly-ingested dataset.
1220 """
1221 # Ingesting a file from outside the datastore.
1222 # This involves a new name.
1223 template = self.templates.getTemplate(ref)
1224 location = self.locationFactory.fromPath(template.format(ref))
1226 # Get the extension
1227 ext = srcUri.getExtension()
1229 # Update the destination to include that extension
1230 location.updateExtension(ext)
1232 # Ask the formatter to validate this extension
1233 if formatter is not None:
1234 formatter.validateExtension(location)
1236 return location
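# Sketch of the naming step above with a hypothetical templated path: the
# destination keeps the template result but takes the extension of the source.
import posixpath

def _ingested_name(templated: str, src: str) -> str:
    ext = posixpath.splitext(src)[1]
    return posixpath.splitext(templated)[0] + ext

assert _ingested_name("raw/r/exp_000001", "/incoming/exp1.fits") == "raw/r/exp_000001.fits"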
1238 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1239 """Write out in memory dataset to datastore.
1241 Parameters
1242 ----------
1243 inMemoryDataset : `object`
1244 Dataset to write to datastore.
1245 ref : `DatasetRef`
1246 Registry information associated with this dataset.
1248 Returns
1249 -------
1250 info : `StoredFileInfo`
1251 Information describing the artifact written to the datastore.
1252 """
1253 # May need to coerce the in memory dataset to the correct
1254 # python type, but first we need to make sure the storage class
1255 # reflects the one defined in the data repository.
1256 ref = self._cast_storage_class(ref)
1257 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1259 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1260 uri = location.uri
1262 if not uri.dirname().exists():
1263 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1264 uri.dirname().mkdir()
1266 if self._transaction is None:
1267 raise RuntimeError("Attempting to write artifact without transaction enabled")
1269 def _removeFileExists(uri: ResourcePath) -> None:
1270 """Remove a file and do not complain if it is not there.
1272 This is important since a formatter might fail before the file
1273 is written and we should not confuse people by writing spurious
1274 error messages to the log.
1275 """
1276 with contextlib.suppress(FileNotFoundError):
1277 uri.remove()
1279 # Register a callback to try to delete the uploaded data if
1280 # something fails below
1281 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1283 data_written = False
1285 # For remote URIs some datasets can be serialized directly
1286 # to bytes and sent to the remote datastore without writing a
1287 # file. If the dataset is intended to be saved to the cache
1288 # a file is always written and direct write to the remote
1289 # datastore is bypassed.
1290 if not uri.isLocal and not self.cacheManager.should_be_cached(ref):
1291 # Remote URI that is not cached so can write directly.
1292 try:
1293 serializedDataset = formatter.toBytes(inMemoryDataset)
1294 except NotImplementedError:
1295 # Fallback to the file writing option.
1296 pass
1297 except Exception as e:
1298 raise RuntimeError(
1299 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1300 ) from e
1301 else:
1302 log.debug("Writing bytes directly to %s", uri)
1303 uri.write(serializedDataset, overwrite=True)
1304 log.debug("Successfully wrote bytes directly to %s", uri)
1305 data_written = True
1307 if not data_written:
1308 # Did not write the bytes directly to object store so instead
1309 # write to temporary file. Always write to a temporary even if
1310 # using a local file system -- that gives us atomic writes.
1311 # If a process is killed as the file is being written we do not
1312 # want it to remain in the correct place but in corrupt state.
1313 # For local files write to the output directory not temporary dir.
1314 prefix = uri.dirname() if uri.isLocal else None
1315 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1316 # Need to configure the formatter to write to a different
1317 # location and that needs us to overwrite internals
1318 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1319 with formatter._updateLocation(Location(None, temporary_uri)):
1320 try:
1321 formatter.write(inMemoryDataset)
1322 except Exception as e:
1323 raise RuntimeError(
1324 f"Failed to serialize dataset {ref} of type"
1325 f" {type(inMemoryDataset)} to "
1326 f"temporary location {temporary_uri}"
1327 ) from e
1329 # Use move for a local file since that becomes an efficient
1330 # os.rename. For remote resources we use copy to allow the
1331 # file to be cached afterwards.
1332 transfer = "move" if uri.isLocal else "copy"
1334 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1336 if transfer == "copy":
1337 # Cache if required
1338 self.cacheManager.move_to_cache(temporary_uri, ref)
1340 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1342 # URI is needed to resolve which ingest case we are dealing with.
1343 return self._extractIngestInfo(uri, ref, formatter=formatter)
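# Sketch of the write strategy above: serialize straight to bytes for remote,
# non-cached targets when the formatter supports it; otherwise go through a
# temporary file, which is moved for local targets or copied (and then cached)
# for remote ones.
def _write_plan(is_local: bool, should_cache: bool, can_to_bytes: bool) -> str:
    if not is_local and not should_cache and can_to_bytes:
        return "direct bytes upload"
    return "temporary file then " + ("move" if is_local else "copy (and cache)")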
1345 def _read_artifact_into_memory(
1346 self,
1347 getInfo: DatastoreFileGetInformation,
1348 ref: DatasetRef,
1349 isComponent: bool = False,
1350 cache_ref: DatasetRef | None = None,
1351 ) -> Any:
1352 """Read the artifact from datastore into in memory object.
1354 Parameters
1355 ----------
1356 getInfo : `DatastoreFileGetInformation`
1357 Information about the artifact within the datastore.
1358 ref : `DatasetRef`
1359 The registry information associated with this artifact.
1360 isComponent : `bool`
1361 Flag to indicate if a component is being read from this artifact.
1362 cache_ref : `DatasetRef`, optional
1363 The DatasetRef to use when looking up the file in the cache.
1364 This ref must have the same ID as the supplied ref but can
1365 be a parent ref or component ref to indicate to the cache whether
1366 a composite file is being requested from the cache or a component
1367 file. Without this the cache will default to the supplied ref but
1368 it can get confused with read-only derived components for
1369 disassembled composites.
1371 Returns
1372 -------
1373 inMemoryDataset : `object`
1374 The artifact as a python object.
1375 """
1376 location = getInfo.location
1377 uri = location.uri
1378 log.debug("Accessing data from %s", uri)
1380 if cache_ref is None:
1381 cache_ref = ref
1382 if cache_ref.id != ref.id:
1383 raise ValueError(
1384 "The supplied cache dataset ref refers to a different dataset than expected:"
1385 f" {ref.id} != {cache_ref.id}"
1386 )
1388 # Cannot recalculate checksum but can compare size as a quick check
1389 # Do not do this if the size is negative since that indicates
1390 # we do not know.
1391 recorded_size = getInfo.info.file_size
1392 resource_size = uri.size()
1393 if recorded_size >= 0 and resource_size != recorded_size:
1394 raise RuntimeError(
1395 "Integrity failure in Datastore. "
1396 f"Size of file {uri} ({resource_size}) "
1397 f"does not match size recorded in registry of {recorded_size}"
1398 )
1400 # For the general case we have choices for how to proceed.
1401 # 1. Always use a local file (downloading the remote resource to a
1402 # temporary file if needed).
1403 # 2. Use a threshold size and read into memory and use bytes.
1404 # Use both for now with an arbitrary hand off size.
1405 # This allows small datasets to be downloaded from remote object
1406 # stores without requiring a temporary file.
1408 formatter = getInfo.formatter
1409 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1410 if resource_size <= nbytes_max and formatter.can_read_bytes():
1411 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1412 if cached_file is not None:
1413 desired_uri = cached_file
1414 msg = f" (cached version of {uri})"
1415 else:
1416 desired_uri = uri
1417 msg = ""
1418 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1419 serializedDataset = desired_uri.read()
1420 log.debug(
1421 "Deserializing %s from %d bytes from location %s with formatter %s",
1422 f"component {getInfo.component}" if isComponent else "",
1423 len(serializedDataset),
1424 uri,
1425 formatter.name(),
1426 )
1427 try:
1428 result = formatter.fromBytes(
1429 serializedDataset, component=getInfo.component if isComponent else None
1430 )
1431 except Exception as e:
1432 raise ValueError(
1433 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1434 f" ({ref.datasetType.name} from {uri}): {e}"
1435 ) from e
1436 else:
1437 # Read from file.
1439 # Have to update the Location associated with the formatter
1440 # because formatter.read does not allow an override.
1441 # This could be improved.
1442 location_updated = False
1443 msg = ""
1445 # First check in cache for local version.
1446 # The cache will only be relevant for remote resources but
1447 # no harm in always asking. Context manager ensures that cache
1448 # file is not deleted during cache expiration.
1449 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1450 if cached_file is not None:
1451 msg = f"(via cache read of remote file {uri})"
1452 uri = cached_file
1453 location_updated = True
1455 with uri.as_local() as local_uri:
1456 can_be_cached = False
1457 if uri != local_uri:
1458 # URI was remote and file was downloaded
1459 cache_msg = ""
1460 location_updated = True
1462 if self.cacheManager.should_be_cached(cache_ref):
1463 # In this scenario we want to ask if the downloaded
1464 # file should be cached but we should not cache
1465 # it until after we've used it (to ensure it can't
1466 # be expired whilst we are using it).
1467 can_be_cached = True
1469 # Say that it is "likely" to be cached because
1470 # if the formatter read fails we will not be
1471 # caching this file.
1472 cache_msg = " and likely cached"
1474 msg = f"(via download to local file{cache_msg})"
1476 # Calculate the (possibly) new location for the formatter
1477 # to use.
1478 newLocation = Location(*local_uri.split()) if location_updated else None
1480 log.debug(
1481 "Reading%s from location %s %s with formatter %s",
1482 f" component {getInfo.component}" if isComponent else "",
1483 uri,
1484 msg,
1485 formatter.name(),
1486 )
1487 try:
1488 with (
1489 formatter._updateLocation(newLocation),
1490 time_this(
1491 log,
1492 msg="Reading%s from location %s %s with formatter %s",
1493 args=(
1494 f" component {getInfo.component}" if isComponent else "",
1495 uri,
1496 msg,
1497 formatter.name(),
1498 ),
1499 ),
1500 ):
1501 result = formatter.read(component=getInfo.component if isComponent else None)
1502 except Exception as e:
1503 raise ValueError(
1504 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1505 f" ({ref.datasetType.name} from {uri}): {e}"
1506 ) from e
1508 # File was read successfully so can move to cache
1509 if can_be_cached:
1510 self.cacheManager.move_to_cache(local_uri, cache_ref)
1512 return self._post_process_get(
1513 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
1514 )
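# Sketch of the read path chosen above: small artifacts whose formatter can
# deserialize from bytes are read directly (cache preferred), everything else
# goes through a local file, downloading and optionally caching remote ones.
NBYTES_MAX = 10_000_000  # same arbitrary hand-off size used by the method above

def _read_plan(size: int, can_read_bytes: bool) -> str:
    if size <= NBYTES_MAX and can_read_bytes:
        return "read bytes (cache preferred)"
    return "read from local file (cache, else download)"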
1516 def knows(self, ref: DatasetRef) -> bool:
1517 """Check if the dataset is known to the datastore.
1519 Does not check for existence of any artifact.
1521 Parameters
1522 ----------
1523 ref : `DatasetRef`
1524 Reference to the required dataset.
1526 Returns
1527 -------
1528 exists : `bool`
1529 `True` if the dataset is known to the datastore.
1530 """
1531 # We cannot trust datastore records from ref, as many unit tests delete
1532 # datasets and check their existence.
1533 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True)
1534 if fileLocations:
1535 return True
1536 return False
1538 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1539 # Docstring inherited from the base class.
1541 # The records themselves. Could be missing some entries.
1542 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
1544 return {ref: ref.id in records for ref in refs}
1546 def _process_mexists_records(
1547 self,
1548 id_to_ref: dict[DatasetId, DatasetRef],
1549 records: dict[DatasetId, list[StoredFileInfo]],
1550 all_required: bool,
1551 artifact_existence: dict[ResourcePath, bool] | None = None,
1552 ) -> dict[DatasetRef, bool]:
1553 """Check given records for existence.
1555 Helper function for `mexists()`.
1557 Parameters
1558 ----------
1559 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1560 Mapping of the dataset ID to the dataset ref itself.
1561 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1562 Records as generally returned by
1563 ``_get_stored_records_associated_with_refs``.
1564 all_required : `bool`
1565 Flag to indicate whether existence requires all artifacts
1566 associated with a dataset ID to exist or not for existence.
1567 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1568 Optional mapping of datastore artifact to existence. Updated by
1569 this method with details of all artifacts tested. Can be `None`
1570 if the caller is not interested.
1572 Returns
1573 -------
1574 existence : `dict` of [`DatasetRef`, `bool`]
1575 Mapping from dataset to boolean indicating existence.
1576 """
1577 # The URIs to be checked and a mapping of those URIs to
1578 # the dataset ID.
1579 uris_to_check: list[ResourcePath] = []
1580 location_map: dict[ResourcePath, DatasetId] = {}
1582 location_factory = self.locationFactory
1584 uri_existence: dict[ResourcePath, bool] = {}
1585 for ref_id, infos in records.items():
1586 # Key is the dataset ID, value is a list of StoredFileInfo.
1587 uris = [info.file_location(location_factory).uri for info in infos]
1588 location_map.update({uri: ref_id for uri in uris})
1590 # Check the local cache directly for a dataset corresponding
1591 # to the remote URI.
1592 if self.cacheManager.file_count > 0:
1593 ref = id_to_ref[ref_id]
1594 for uri, storedFileInfo in zip(uris, infos, strict=True):
1595 check_ref = ref
1596 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1597 check_ref = ref.makeComponentRef(component)
1598 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1599 # Proxy for URI existence.
1600 uri_existence[uri] = True
1601 else:
1602 uris_to_check.append(uri)
1603 else:
1604 # Check all of them.
1605 uris_to_check.extend(uris)
1607 if artifact_existence is not None:
1608 # If a URI has already been checked remove it from the list
1609 # and immediately add the status to the output dict.
1610 filtered_uris_to_check = []
1611 for uri in uris_to_check:
1612 if uri in artifact_existence:
1613 uri_existence[uri] = artifact_existence[uri]
1614 else:
1615 filtered_uris_to_check.append(uri)
1616 uris_to_check = filtered_uris_to_check
1618 # Results.
1619 dataset_existence: dict[DatasetRef, bool] = {}
1621 uri_existence.update(ResourcePath.mexists(uris_to_check))
1622 for uri, exists in uri_existence.items():
1623 dataset_id = location_map[uri]
1624 ref = id_to_ref[dataset_id]
1626 # Disassembled composite needs to check all locations.
1627 # all_required indicates whether all need to exist or not.
1628 if ref in dataset_existence:
1629 if all_required:
1630 exists = dataset_existence[ref] and exists
1631 else:
1632 exists = dataset_existence[ref] or exists
1633 dataset_existence[ref] = exists
1635 if artifact_existence is not None:
1636 artifact_existence.update(uri_existence)
1638 return dataset_existence
1640 def mexists(
1641 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1642 ) -> dict[DatasetRef, bool]:
1643 """Check the existence of multiple datasets at once.
1645 Parameters
1646 ----------
1647 refs : iterable of `DatasetRef`
1648 The datasets to be checked.
1649 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1650 Optional mapping of datastore artifact to existence. Updated by
1651 this method with details of all artifacts tested. Can be `None`
1652 if the caller is not interested.
1654 Returns
1655 -------
1656 existence : `dict` of [`DatasetRef`, `bool`]
1657 Mapping from dataset to boolean indicating existence.
1659 Notes
1660 -----
1661 To minimize potentially costly remote existence checks, the local
1662 cache is checked as a proxy for existence. If a file for this
1663 `DatasetRef` is present in the cache, no check is done on the actual URI.
1664 This can result in unexpected behavior if the dataset itself
1665 has been removed from the datastore by another process whilst it is
1666 still in the cache.
1667 """
1668 chunk_size = 10_000
1669 dataset_existence: dict[DatasetRef, bool] = {}
1670 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1671 n_found_total = 0
1672 n_checked = 0
1673 n_chunks = 0
1674 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1675 chunk_result = self._mexists(chunk, artifact_existence)
1677 # The log message level and content depend on how many
1678 # datasets we are processing.
1679 n_results = len(chunk_result)
1681 # Use verbose logging to ensure that messages can be seen
1682 # easily if many refs are being checked.
1683 log_threshold = VERBOSE
1684 n_checked += n_results
1686 # This sum can take some time so only do it if we know the
1687 # result is going to be used.
1688 n_found = 0
1689 if log.isEnabledFor(log_threshold):
1690 # Can treat the booleans as 0, 1 integers and sum them.
1691 n_found = sum(chunk_result.values())
1692 n_found_total += n_found
1694 # We are deliberately not trying to count the number of refs
1695 # provided in case it's in the millions. This means there is a
1696 # situation where the number of refs exactly matches the chunk
1697 # size and we will switch to the multi-chunk path even though
1698 # we only have a single chunk.
1699 if n_results < chunk_size and n_chunks == 0:
1700 # Single chunk will be processed so we can provide more detail.
1701 if n_results == 1:
1702 ref = list(chunk_result)[0]
1703 # Use debug logging to be consistent with `exists()`.
1704 log.debug(
1705 "Calling mexists() with single ref that does%s exist (%s).",
1706 "" if chunk_result[ref] else " not",
1707 ref,
1708 )
1709 else:
1710 # Single chunk but multiple files. Summarize.
1711 log.log(
1712 log_threshold,
1713 "Number of datasets found in datastore: %d out of %d datasets checked.",
1714 n_found,
1715 n_checked,
1716 )
1718 else:
1719 # Use incremental verbose logging when we have multiple chunks.
1720 log.log(
1721 log_threshold,
1722 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1723 "(running total from all chunks so far: %d found out of %d checked)",
1724 n_chunks,
1725 n_found,
1726 n_results,
1727 n_found_total,
1728 n_checked,
1729 )
1730 dataset_existence.update(chunk_result)
1731 n_chunks += 1
1733 return dataset_existence
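# Usage sketch (illustrative, not part of the original source). Assuming
# ``datastore`` is a configured FileDatastore and ``refs`` holds resolved
# DatasetRef objects; a shared ``artifact_existence`` dict lets later calls
# reuse URI checks already performed here:
#
#     artifact_existence: dict[ResourcePath, bool] = {}
#     existence = datastore.mexists(refs, artifact_existence)
#     missing = [ref for ref, found in existence.items() if not found]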
1735 def _mexists(
1736 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1737 ) -> dict[DatasetRef, bool]:
1738 """Check the existence of multiple datasets at once.
1740 Parameters
1741 ----------
1742 refs : iterable of `DatasetRef`
1743 The datasets to be checked.
1744 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1745 Optional mapping of datastore artifact to existence. Updated by
1746 this method with details of all artifacts tested. Can be `None`
1747 if the caller is not interested.
1749 Returns
1750 -------
1751 existence : `dict` of [`DatasetRef`, `bool`]
1752 Mapping from dataset to boolean indicating existence.
1753 """
1754 # Make a mapping from refs with the internal storage class to the given
1755 # refs that may have a different one. We'll use the internal refs
1756 # throughout this method and convert back at the very end.
1757 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1759 # Need a mapping of dataset_id to (internal) dataset ref since some
1760 # internal APIs work with dataset_id.
1761 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1763 # Set of all IDs we are checking for.
1764 requested_ids = set(id_to_ref.keys())
1766 # The records themselves. Could be missing some entries.
1767 records = self._get_stored_records_associated_with_refs(
1768 id_to_ref.values(), ignore_datastore_records=True
1769 )
1771 dataset_existence = self._process_mexists_records(
1772 id_to_ref, records, True, artifact_existence=artifact_existence
1773 )
1775 # Set of IDs that have been handled.
1776 handled_ids = {ref.id for ref in dataset_existence}
1778 missing_ids = requested_ids - handled_ids
1779 if missing_ids:
1780 dataset_existence.update(
1781 self._mexists_check_expected(
1782 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1783 )
1784 )
1786 return {
1787 internal_ref_to_input_ref[internal_ref]: existence
1788 for internal_ref, existence in dataset_existence.items()
1789 }
1791 def _mexists_check_expected(
1792 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1793 ) -> dict[DatasetRef, bool]:
1794 """Check existence of refs that are not known to datastore.
1796 Parameters
1797 ----------
1798 refs : iterable of `DatasetRef`
1799 The datasets to be checked. These are assumed not to be known
1800 to datastore.
1801 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1802 Optional mapping of datastore artifact to existence. Updated by
1803 this method with details of all artifacts tested. Can be `None`
1804 if the caller is not interested.
1806 Returns
1807 -------
1808 existence : `dict` of [`DatasetRef`, `bool`]
1809 Mapping from dataset to boolean indicating existence.
1810 """
1811 dataset_existence: dict[DatasetRef, bool] = {}
1812 if not self.trustGetRequest:
1813 # Must assume these do not exist
1814 for ref in refs:
1815 dataset_existence[ref] = False
1816 else:
1817 log.debug(
1818 "%d datasets were not known to datastore during initial existence check.",
1819 len(refs),
1820 )
1822 # Construct data structure identical to that returned
1823 # by _get_stored_records_associated_with_refs() but using
1824 # guessed names.
1825 records = {}
1826 id_to_ref = {}
1827 for missing_ref in refs:
1828 expected = self._get_expected_dataset_locations_info(missing_ref)
1829 dataset_id = missing_ref.id
1830 records[dataset_id] = [info for _, info in expected]
1831 id_to_ref[dataset_id] = missing_ref
1833 dataset_existence.update(
1834 self._process_mexists_records(
1835 id_to_ref,
1836 records,
1837 False,
1838 artifact_existence=artifact_existence,
1839 )
1840 )
1842 return dataset_existence
1844 def exists(self, ref: DatasetRef) -> bool:
1845 """Check if the dataset exists in the datastore.
1847 Parameters
1848 ----------
1849 ref : `DatasetRef`
1850 Reference to the required dataset.
1852 Returns
1853 -------
1854 exists : `bool`
1855 `True` if the entity exists in the `Datastore`.
1857 Notes
1858 -----
1859 The local cache is checked as a proxy for existence in the remote
1860 object store. It is possible that another process on a different
1861 compute node could remove the file from the object store even
1862 though it is present in the local cache.
1863 """
1864 ref = self._cast_storage_class(ref)
1865 # We cannot trust datastore records from ref, as many unit tests delete
1866 # datasets and check their existence.
1867 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True)
1869 # If we are being asked to trust that the registry might not be correct,
1870 # we ask for the expected locations and check them explicitly.
1871 if not fileLocations:
1872 if not self.trustGetRequest:
1873 return False
1875 # First check the cache. If it is not found we must check
1876 # the datastore itself. Assume that any component in the cache
1877 # means that the dataset does exist somewhere.
1878 if self.cacheManager.known_to_cache(ref):
1879 return True
1881 # When we are guessing a dataset location we can not check
1882 # for the existence of every component since we can not
1883 # know if every component was written. Instead we check
1884 # for the existence of any of the expected locations.
1885 for location, _ in self._get_expected_dataset_locations_info(ref):
1886 if self._artifact_exists(location):
1887 return True
1888 return False
1890 # All listed artifacts must exist.
1891 for location, storedFileInfo in fileLocations:
1892 # Checking in cache needs the component ref.
1893 check_ref = ref
1894 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1895 check_ref = ref.makeComponentRef(component)
1896 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1897 continue
1899 if not self._artifact_exists(location):
1900 return False
1902 return True
1904 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1905 """Return URIs associated with dataset.
1907 Parameters
1908 ----------
1909 ref : `DatasetRef`
1910 Reference to the required dataset.
1911 predict : `bool`, optional
1912 If the datastore does not know about the dataset, should it
1913 return a predicted URI or not?
1915 Returns
1916 -------
1917 uris : `DatasetRefURIs`
1918 The URI to the primary artifact associated with this dataset (if
1919 the dataset was disassembled within the datastore this may be
1920 `None`), and the URIs to any components associated with the dataset
1921 artifact. (can be empty if there are no components).
1922 """
1923 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1924 return many[ref]
1926 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1927 """URI to the Dataset.
1929 Parameters
1930 ----------
1931 ref : `DatasetRef`
1932 Reference to the required Dataset.
1933 predict : `bool`
1934 If `True`, allow URIs to be returned of datasets that have not
1935 been written.
1937 Returns
1938 -------
1939 uri : `lsst.resources.ResourcePath`
1940 URI pointing to the dataset within the datastore. If the
1941 dataset does not exist in the datastore, and if ``predict`` is
1942 `True`, the URI will be a prediction and will include a URI
1943 fragment "#predicted".
1944 If the datastore does not have entities that relate well
1945 to the concept of a URI the returned URI will be
1946 descriptive. The returned URI is not guaranteed to be obtainable.
1948 Raises
1949 ------
1950 FileNotFoundError
1951 Raised if a URI has been requested for a dataset that does not
1952 exist and guessing is not allowed.
1953 RuntimeError
1954 Raised if a request is made for a single URI but multiple URIs
1955 are associated with this dataset.
1957 Notes
1958 -----
1959 When a predicted URI is requested an attempt will be made to form
1960 a reasonable URI based on file templates and the expected formatter.
1961 """
1962 primary, components = self.getURIs(ref, predict)
1963 if primary is None or components:
1964 raise RuntimeError(
1965 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1966 )
1967 return primary
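# Usage sketch (illustrative, not part of the original source), assuming
# ``datastore`` is a configured FileDatastore and ``ref`` a resolved
# DatasetRef:
#
#     uris = datastore.getURIs(ref, predict=True)
#     if uris.primaryURI is not None:
#         print(uris.primaryURI)        # single-file dataset
#     for comp, comp_uri in uris.componentURIs.items():
#         print(comp, comp_uri)         # disassembled components
#
#     # getURI() is a convenience wrapper that raises RuntimeError if the
#     # dataset was disassembled into multiple component URIs.
#     uri = datastore.getURI(ref)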
1969 def _predict_URIs(
1970 self,
1971 ref: DatasetRef,
1972 ) -> DatasetRefURIs:
1973 """Predict the URIs of a dataset ref.
1975 Parameters
1976 ----------
1977 ref : `DatasetRef`
1978 Reference to the required Dataset.
1980 Returns
1981 -------
1982 uris : `DatasetRefURIs`
1983 Primary and component URIs. URIs will contain a URI fragment
1984 "#predicted".
1985 """
1986 uris = DatasetRefURIs()
1988 if self.composites.shouldBeDisassembled(ref):
1989 for component, _ in ref.datasetType.storageClass.components.items():
1990 comp_ref = ref.makeComponentRef(component)
1991 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1993 # Add the "#predicted" URI fragment to indicate this is a
1994 # guess
1995 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1997 else:
1998 location, _ = self._determine_put_formatter_location(ref)
2000 # Add the "#predicted" URI fragment to indicate this is a guess
2001 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
2003 return uris
2005 def getManyURIs(
2006 self,
2007 refs: Iterable[DatasetRef],
2008 predict: bool = False,
2009 allow_missing: bool = False,
2010 ) -> dict[DatasetRef, DatasetRefURIs]:
2011 # Docstring inherited
2013 uris: dict[DatasetRef, DatasetRefURIs] = {}
2015 records = self._get_stored_records_associated_with_refs(refs)
2016 records_keys = records.keys()
2018 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
2019 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
2021 # Have to handle trustGetRequest mode by checking for the existence
2022 # of the missing refs on disk.
2023 if missing_refs:
2024 dataset_existence = self._mexists_check_expected(missing_refs, None)
2025 really_missing = set()
2026 not_missing = set()
2027 for ref, exists in dataset_existence.items():
2028 if exists:
2029 not_missing.add(ref)
2030 else:
2031 really_missing.add(ref)
2033 if not_missing:
2034 # Need to recalculate the missing/existing split.
2035 existing_refs = existing_refs + tuple(not_missing)
2036 missing_refs = tuple(really_missing)
2038 for ref in missing_refs:
2039 # if this has never been written then we have to guess
2040 if not predict:
2041 if not allow_missing:
2042 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
2043 else:
2044 uris[ref] = self._predict_URIs(ref)
2046 for ref in existing_refs:
2047 file_infos = records[ref.id]
2048 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
2049 uris[ref] = self._locations_to_URI(ref, file_locations)
2051 return uris
2053 def _locations_to_URI(
2054 self,
2055 ref: DatasetRef,
2056 file_locations: Sequence[tuple[Location, StoredFileInfo]],
2057 ) -> DatasetRefURIs:
2058 """Convert one or more file locations associated with a DatasetRef
2059 to a DatasetRefURIs.
2061 Parameters
2062 ----------
2063 ref : `DatasetRef`
2064 Reference to the dataset.
2065 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
2066 Each item in the sequence is the location of the dataset within the
2067 datastore and stored information about the file and its formatter.
2068 If there is only one item in the sequence then it is treated as the
2069 primary URI. If there is more than one item then they are treated
2070 as component URIs. If there are no items then an error is raised
2071 unless ``self.trustGetRequest`` is `True`.
2073 Returns
2074 -------
2075 uris : `DatasetRefURIs`
2076 Represents the primary URI or component URIs described by the
2077 inputs.
2079 Raises
2080 ------
2081 RuntimeError
2082 If no file locations are passed in and ``self.trustGetRequest`` is
2083 `False`.
2084 FileNotFoundError
2085 If a passed-in URI does not exist, and ``self.trustGetRequest``
2086 is `False`.
2087 RuntimeError
2088 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is
2089 unexpected).
2090 """
2091 guessing = False
2092 uris = DatasetRefURIs()
2094 if not file_locations:
2095 if not self.trustGetRequest:
2096 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
2097 file_locations = self._get_expected_dataset_locations_info(ref)
2098 guessing = True
2100 if len(file_locations) == 1:
2101 # No disassembly so this is the primary URI
2102 uris.primaryURI = file_locations[0][0].uri
2103 if guessing and not uris.primaryURI.exists():
2104 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
2105 else:
2106 for location, file_info in file_locations:
2107 if file_info.component is None:
2108 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
2109 if guessing and not location.uri.exists():
2110 # If we are trusting then it is entirely possible for
2111 # some components to be missing. In that case we skip
2112 # to the next component.
2113 if self.trustGetRequest:
2114 continue
2115 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
2116 uris.componentURIs[file_info.component] = location.uri
2118 return uris
2120 def retrieveArtifacts(
2121 self,
2122 refs: Iterable[DatasetRef],
2123 destination: ResourcePath,
2124 transfer: str = "auto",
2125 preserve_path: bool = True,
2126 overwrite: bool = False,
2127 ) -> list[ResourcePath]:
2128 """Retrieve the file artifacts associated with the supplied refs.
2130 Parameters
2131 ----------
2132 refs : iterable of `DatasetRef`
2133 The datasets for which file artifacts are to be retrieved.
2134 A single ref can result in multiple files. The refs must
2135 be resolved.
2136 destination : `lsst.resources.ResourcePath`
2137 Location to write the file artifacts.
2138 transfer : `str`, optional
2139 Method to use to transfer the artifacts. Must be one of the options
2140 supported by `lsst.resources.ResourcePath.transfer_from()`.
2141 "move" is not allowed.
2142 preserve_path : `bool`, optional
2143 If `True` the full path of the file artifact within the datastore
2144 is preserved. If `False` the final file component of the path
2145 is used.
2146 overwrite : `bool`, optional
2147 If `True` allow transfers to overwrite existing files at the
2148 destination.
2150 Returns
2151 -------
2152 targets : `list` of `lsst.resources.ResourcePath`
2153 URIs of file artifacts in destination location. Order is not
2154 preserved.
2155 """
2156 if not destination.isdir():
2157 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
2159 if transfer == "move":
2160 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
2162 # Source -> Destination
2163 # This also helps filter out duplicate DatasetRef in the request
2164 # that will map to the same underlying file transfer.
2165 to_transfer: dict[ResourcePath, ResourcePath] = {}
2167 for ref in refs:
2168 locations = self._get_dataset_locations_info(ref)
2169 for location, _ in locations:
2170 source_uri = location.uri
2171 target_path: ResourcePathExpression
2172 if preserve_path:
2173 target_path = location.pathInStore
2174 if target_path.isabs():
2175 # This is an absolute path to an external file.
2176 # Use the full path.
2177 target_path = target_path.relativeToPathRoot
2178 else:
2179 target_path = source_uri.basename()
2180 target_uri = destination.join(target_path)
2181 to_transfer[source_uri] = target_uri
2183 # In theory can now parallelize the transfer
2184 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
2185 for source_uri, target_uri in to_transfer.items():
2186 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
2188 return list(to_transfer.values())
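# Usage sketch (illustrative, not part of the original source), assuming
# ``datastore`` is a configured FileDatastore and ``refs`` are resolved
# DatasetRef objects; the destination path is hypothetical:
#
#     from lsst.resources import ResourcePath
#
#     destination = ResourcePath("/tmp/artifact_export/", forceDirectory=True)
#     targets = datastore.retrieveArtifacts(
#         refs, destination, transfer="copy", preserve_path=True
#     )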
2190 def get(
2191 self,
2192 ref: DatasetRef,
2193 parameters: Mapping[str, Any] | None = None,
2194 storageClass: StorageClass | str | None = None,
2195 ) -> Any:
2196 """Load an InMemoryDataset from the store.
2198 Parameters
2199 ----------
2200 ref : `DatasetRef`
2201 Reference to the required Dataset.
2202 parameters : `dict`
2203 `StorageClass`-specific parameters that specify, for example,
2204 a slice of the dataset to be loaded.
2205 storageClass : `StorageClass` or `str`, optional
2206 The storage class to be used to override the Python type
2207 returned by this method. By default the returned type matches
2208 the dataset type definition for this dataset. Specifying a
2209 read `StorageClass` can force a different type to be returned.
2210 This type must be compatible with the original type.
2212 Returns
2213 -------
2214 inMemoryDataset : `object`
2215 Requested dataset or slice thereof as an InMemoryDataset.
2217 Raises
2218 ------
2219 FileNotFoundError
2220 Requested dataset can not be retrieved.
2221 TypeError
2222 Return value from formatter has unexpected type.
2223 ValueError
2224 Formatter failed to process the dataset.
2225 """
2226 # Supplied storage class for the component being read is either
2227 # from the ref itself or an override if we want to force
2228 # type conversion.
2229 if storageClass is not None:
2230 ref = ref.overrideStorageClass(storageClass)
2231 refStorageClass = ref.datasetType.storageClass
2233 allGetInfo = self._prepare_for_get(ref, parameters)
2234 refComponent = ref.datasetType.component()
2236 # Create mapping from component name to related info
2237 allComponents = {i.component: i for i in allGetInfo}
2239 # By definition the dataset is disassembled if we have more
2240 # than one record for it.
2241 isDisassembled = len(allGetInfo) > 1
2243 # Look for the special case where we are disassembled but the
2244 # component is a derived component that was not written during
2245 # disassembly. For this scenario we need to check that the
2246 # component requested is listed as a derived component for the
2247 # composite storage class
2248 isDisassembledReadOnlyComponent = False
2249 if isDisassembled and refComponent:
2250 # The composite storage class should be accessible through
2251 # the component dataset type
2252 compositeStorageClass = ref.datasetType.parentStorageClass
2254 # In the unlikely scenario where the composite storage
2255 # class is not known, we can only assume that this is a
2256 # normal component. If that assumption is wrong then the
2257 # branch below that reads a persisted component will fail
2258 # so there is no need to complain here.
2259 if compositeStorageClass is not None:
2260 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2262 if isDisassembled and not refComponent:
2263 # This was a disassembled dataset spread over multiple files
2264 # and we need to put them all back together again.
2265 # Read into memory and then assemble
2267 # Check that the supplied parameters are suitable for the type read
2268 refStorageClass.validateParameters(parameters)
2270 # We want to keep track of all the parameters that were not used
2271 # by formatters. We assume that if any of the component formatters
2272 # use a parameter that we do not need to apply it again in the
2273 # assembler.
2274 usedParams = set()
2276 components: dict[str, Any] = {}
2277 for getInfo in allGetInfo:
2278 # assemblerParams are parameters not understood by the
2279 # associated formatter.
2280 usedParams.update(set(getInfo.formatterParams))
2282 component = getInfo.component
2284 if component is None:
2285 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2287 # We do not want the formatter to think it's reading
2288 # a component though because it is really reading a
2289 # standalone dataset -- always tell reader it is not a
2290 # component.
2291 components[component] = self._read_artifact_into_memory(
2292 getInfo, ref.makeComponentRef(component), isComponent=False
2293 )
2295 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2297 # Any unused parameters will have to be passed to the assembler
2298 if parameters:
2299 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2300 else:
2301 unusedParams = {}
2303 # Process parameters
2304 return ref.datasetType.storageClass.delegate().handleParameters(
2305 inMemoryDataset, parameters=unusedParams
2306 )
2308 elif isDisassembledReadOnlyComponent:
2309 compositeStorageClass = ref.datasetType.parentStorageClass
2310 if compositeStorageClass is None:
2311 raise RuntimeError(
2312 f"Unable to retrieve derived component '{refComponent}' since"
2313 "no composite storage class is available."
2314 )
2316 if refComponent is None:
2317 # Mainly for mypy
2318 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2320 # Assume that every derived component can be calculated by
2321 # forwarding the request to a single read/write component.
2322 # Rather than guessing which rw component is the right one by
2323 # scanning each for a derived component of the same name,
2324 # we ask the storage class delegate directly which one is best to
2325 # use.
2326 compositeDelegate = compositeStorageClass.delegate()
2327 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2328 refComponent, set(allComponents)
2329 )
2331 # Select the relevant component
2332 rwInfo = allComponents[forwardedComponent]
2334 # For now assume that read parameters are validated against
2335 # the real component and not the requested component
2336 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2337 forwardedStorageClass.validateParameters(parameters)
2339 # The reference to use for the caching must refer to the forwarded
2340 # component and not the derived component.
2341 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2343 # Unfortunately the FileDescriptor inside the formatter will have
2344 # the wrong write storage class so we need to create a new one
2345 # given the immutability constraint.
2346 writeStorageClass = rwInfo.info.storageClass
2348 # We may need to put some thought into parameters for read
2349 # components but for now forward them on as is
2350 readFormatter = type(rwInfo.formatter)(
2351 FileDescriptor(
2352 rwInfo.location,
2353 readStorageClass=refStorageClass,
2354 storageClass=writeStorageClass,
2355 parameters=parameters,
2356 ),
2357 ref.dataId,
2358 )
2360 # The assembler can not receive any parameter requests for a
2361 # derived component at this time since the assembler will
2362 # see the storage class of the derived component and those
2363 # parameters will have to be handled by the formatter on the
2364 # forwarded storage class.
2365 assemblerParams: dict[str, Any] = {}
2367 # Need to create a new info that specifies the derived
2368 # component and associated storage class
2369 readInfo = DatastoreFileGetInformation(
2370 rwInfo.location,
2371 readFormatter,
2372 rwInfo.info,
2373 assemblerParams,
2374 {},
2375 refComponent,
2376 refStorageClass,
2377 )
2379 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2381 else:
2382 # Single file request or component from that composite file
2383 for lookup in (refComponent, None):
2384 if lookup in allComponents:
2385 getInfo = allComponents[lookup]
2386 break
2387 else:
2388 raise FileNotFoundError(
2389 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2390 )
2392 # Do not need the component itself if already disassembled
2393 if isDisassembled:
2394 isComponent = False
2395 else:
2396 isComponent = getInfo.component is not None
2398 # For a component read of a composite we want the cache to
2399 # be looking at the composite ref itself.
2400 cache_ref = ref.makeCompositeRef() if isComponent else ref
2402 # For a disassembled component we can validate parameters against
2403 # the component storage class directly
2404 if isDisassembled:
2405 refStorageClass.validateParameters(parameters)
2406 else:
2407 # For an assembled composite this could be a derived
2408 # component derived from a real component. The validity
2409 # of the parameters is not clear. For now validate against
2410 # the composite storage class
2411 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2413 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
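# Usage sketch (illustrative, not part of the original source), assuming
# ``datastore`` is a configured FileDatastore and ``ref`` a resolved
# DatasetRef; the parameter name and storage class name below are purely
# examples and depend on the dataset's storage class definition:
#
#     data = datastore.get(ref)
#     subset = datastore.get(ref, parameters={"bbox": some_bbox})
#     converted = datastore.get(ref, storageClass="SomeCompatibleStorageClass")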
2415 @transactional
2416 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2417 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2419 Parameters
2420 ----------
2421 inMemoryDataset : `object`
2422 The dataset to store.
2423 ref : `DatasetRef`
2424 Reference to the associated Dataset.
2426 Raises
2427 ------
2428 TypeError
2429 Supplied object and storage class are inconsistent.
2430 DatasetTypeNotSupportedError
2431 The associated `DatasetType` is not handled by this datastore.
2433 Notes
2434 -----
2435 If the datastore is configured to reject certain dataset types it
2436 is possible that the put will fail and raise a
2437 `DatasetTypeNotSupportedError`. The main use case for this is to
2438 allow `ChainedDatastore` to put to multiple datastores without
2439 requiring that every datastore accepts the dataset.
2440 """
2441 doDisassembly = self.composites.shouldBeDisassembled(ref)
2442 # doDisassembly = True
2444 artifacts = []
2445 if doDisassembly:
2446 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2447 if components is None:
2448 raise RuntimeError(
2449 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2450 f"with storage class {ref.datasetType.storageClass.name} "
2451 "is configured to be disassembled, but cannot be."
2452 )
2453 for component, componentInfo in components.items():
2454 # Don't recurse because we want to take advantage of
2455 # bulk insert -- need a new DatasetRef that refers to the
2456 # same dataset_id but has the component DatasetType.
2457 # DatasetType does not refer to the types of its components,
2458 # so we construct one ourselves.
2459 compRef = ref.makeComponentRef(component)
2460 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2461 artifacts.append((compRef, storedInfo))
2462 else:
2463 # Write the entire thing out
2464 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2465 artifacts.append((ref, storedInfo))
2467 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT)
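# Usage sketch (illustrative, not part of the original source), assuming
# ``datastore`` is a configured FileDatastore, ``obj`` is an in-memory
# dataset matching the ref's storage class, and ``ref`` is a resolved
# DatasetRef:
#
#     try:
#         datastore.put(obj, ref)
#     except DatasetTypeNotSupportedError:
#         ...  # this datastore is configured to reject this dataset type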
2469 @transactional
2470 def put_new(self, inMemoryDataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
2471 doDisassembly = self.composites.shouldBeDisassembled(ref)
2472 # doDisassembly = True
2474 artifacts = []
2475 if doDisassembly:
2476 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2477 if components is None:
2478 raise RuntimeError(
2479 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2480 f"with storage class {ref.datasetType.storageClass.name} "
2481 "is configured to be disassembled, but cannot be."
2482 )
2483 for component, componentInfo in components.items():
2484 # Don't recurse because we want to take advantage of
2485 # bulk insert -- need a new DatasetRef that refers to the
2486 # same dataset_id but has the component DatasetType.
2487 # DatasetType does not refer to the types of its components,
2488 # so we construct one ourselves.
2489 compRef = ref.makeComponentRef(component)
2490 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2491 artifacts.append((compRef, storedInfo))
2492 else:
2493 # Write the entire thing out
2494 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2495 artifacts.append((ref, storedInfo))
2497 ref_records = {self._opaque_table_name: [info for _, info in artifacts]}
2498 ref = ref.replace(datastore_records=ref_records)
2499 return {self.name: ref}
2501 @transactional
2502 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2503 # At this point can safely remove these datasets from the cache
2504 # to avoid confusion later on. If they are not trashed later
2505 # the cache will simply be refilled.
2506 self.cacheManager.remove_from_cache(ref)
2508 # If we are in trust mode there will be nothing to move to
2509 # the trash table and we will have to try to delete the file
2510 # immediately.
2511 if self.trustGetRequest:
2512 # Try to keep the logic below for a single file trash.
2513 if isinstance(ref, DatasetRef):
2514 refs = {ref}
2515 else:
2516 # Will recreate ref at the end of this branch.
2517 refs = set(ref)
2519 # Determine which datasets are known to datastore directly.
2520 id_to_ref = {ref.id: ref for ref in refs}
2521 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2522 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2524 missing = refs - existing_refs
2525 if missing:
2526 # Do an explicit existence check on these refs.
2527 # We only care about the artifacts at this point and not
2528 # the dataset existence.
2529 artifact_existence: dict[ResourcePath, bool] = {}
2530 _ = self.mexists(missing, artifact_existence)
2531 uris = [uri for uri, exists in artifact_existence.items() if exists]
2533 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2534 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2535 for uri in uris:
2536 try:
2537 uri.remove()
2538 except Exception as e:
2539 if ignore_errors:
2540 log.debug("Artifact %s could not be removed: %s", uri, e)
2541 continue
2542 raise
2544 # There is no point asking the code below to remove refs we
2545 # know are missing so update it with the list of existing
2546 # records. Try to retain one vs many logic.
2547 if not existing_refs:
2548 # Nothing more to do since none of the datasets were
2549 # known to the datastore record table.
2550 return
2551 ref = list(existing_refs)
2552 if len(ref) == 1:
2553 ref = ref[0]
2555 # Get file metadata and internal metadata
2556 if not isinstance(ref, DatasetRef):
2557 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2558 # Assumed to be an iterable of refs so bulk mode enabled.
2559 try:
2560 self.bridge.moveToTrash(ref, transaction=self._transaction)
2561 except Exception as e:
2562 if ignore_errors:
2563 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2564 else:
2565 raise
2566 return
2568 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2570 fileLocations = self._get_dataset_locations_info(ref)
2572 if not fileLocations:
2573 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2574 if ignore_errors:
2575 log.warning(err_msg)
2576 return
2577 else:
2578 raise FileNotFoundError(err_msg)
2580 for location, _ in fileLocations:
2581 if not self._artifact_exists(location):
2582 err_msg = (
2583 f"Dataset is known to datastore {self.name} but "
2584 f"associated artifact ({location.uri}) is missing"
2585 )
2586 if ignore_errors:
2587 log.warning(err_msg)
2588 return
2589 else:
2590 raise FileNotFoundError(err_msg)
2592 # Mark dataset as trashed
2593 try:
2594 self.bridge.moveToTrash([ref], transaction=self._transaction)
2595 except Exception as e:
2596 if ignore_errors:
2597 log.warning(
2598 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2599 "but encountered an error: %s",
2600 ref,
2601 self.name,
2602 e,
2603 )
2604 pass
2605 else:
2606 raise
2608 @transactional
2609 def emptyTrash(self, ignore_errors: bool = True) -> None:
2610 """Remove all datasets from the trash.
2612 Parameters
2613 ----------
2614 ignore_errors : `bool`
2615 If `True` return without error even if something went wrong.
2616 Problems could occur if another process is simultaneously trying
2617 to delete.
2618 """
2619 log.debug("Emptying trash in datastore %s", self.name)
2621 # Context manager will empty trash iff we finish it without raising.
2622 # It will also automatically delete the relevant rows from the
2623 # trash table and the records table.
2624 with self.bridge.emptyTrash(
2625 self._table, record_class=StoredFileInfo, record_column="path"
2626 ) as trash_data:
2627 # Removing the artifacts themselves requires that the files are
2628 # not also associated with refs that are not to be trashed.
2629 # Therefore need to do a query with the file paths themselves
2630 # and return all the refs associated with them. Can only delete
2631 # a file if the refs to be trashed are the only refs associated
2632 # with the file.
2633 # This requires multiple copies of the trashed items
2634 trashed, artifacts_to_keep = trash_data
2636 if artifacts_to_keep is None:
2637 # The bridge is not helping us so have to work it out
2638 # ourselves. This is not going to be as efficient.
2639 trashed = list(trashed)
2641 # The instance check is for mypy since up to this point it
2642 # does not know the type of info.
2643 path_map = self._refs_associated_with_artifacts(
2644 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2645 )
2647 for ref, info in trashed:
2648 # Mypy needs to know this is not the base class
2649 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2651 path_map[info.path].remove(ref.id)
2652 if not path_map[info.path]:
2653 del path_map[info.path]
2655 artifacts_to_keep = set(path_map)
2657 for ref, info in trashed:
2658 # Should not happen for this implementation but need
2659 # to keep mypy happy.
2660 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2662 # Mypy needs to know this is not the base class
2663 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2665 if info.path in artifacts_to_keep:
2666 # This is a multi-dataset artifact and we are not
2667 # removing all associated refs.
2668 continue
2670 # Only trashed refs still known to datastore will be returned.
2671 location = info.file_location(self.locationFactory)
2673 # Point of no return for this artifact
2674 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2675 try:
2676 self._delete_artifact(location)
2677 except FileNotFoundError:
2678 # If the file itself has been deleted there is nothing
2679 # we can do about it. It is possible that trash has
2680 # been run in parallel in another process or someone
2681 # decided to delete the file. It is unlikely to come
2682 # back and so we should still continue with the removal
2683 # of the entry from the trash table. It is also possible
2684 # we removed it in a previous iteration if it was
2685 # a multi-dataset artifact. The delete artifact method
2686 # will log a debug message in this scenario.
2687 # Distinguishing a file that was missing before the trash
2688 # started from a file already removed earlier in this trash
2689 # run is not worth tracking, given the potential
2690 # memory cost.
2691 pass
2692 except Exception as e:
2693 if ignore_errors:
2694 # Use a debug message here even though it's not
2695 # a good situation. In some cases this can be
2696 # caused by a race between user A and user B
2697 # and neither of them has permissions for the
2698 # other's files. Butler does not know about users
2699 # and trash has no idea what collections these
2700 # files were in (without guessing from a path).
2701 log.debug(
2702 "Encountered error removing artifact %s from datastore %s: %s",
2703 location.uri,
2704 self.name,
2705 e,
2706 )
2707 else:
2708 raise
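# Usage sketch (illustrative, not part of the original source). Deletion is
# a two-step operation; assuming ``datastore`` is a configured FileDatastore
# and ``refs`` are resolved DatasetRef objects:
#
#     datastore.trash(refs)      # mark datasets and drop any cache entries
#     datastore.emptyTrash()     # remove the artifacts and their records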
2710 @transactional
2711 def transfer_from(
2712 self,
2713 source_datastore: Datastore,
2714 refs: Iterable[DatasetRef],
2715 transfer: str = "auto",
2716 artifact_existence: dict[ResourcePath, bool] | None = None,
2717 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2718 # Docstring inherited
2719 if type(self) is not type(source_datastore):
2720 raise TypeError(
2721 f"Datastore mismatch between this datastore ({type(self)}) and the "
2722 f"source datastore ({type(source_datastore)})."
2723 )
2725 # Be explicit for mypy
2726 if not isinstance(source_datastore, FileDatastore):
2727 raise TypeError(
2728 "Can only transfer to a FileDatastore from another FileDatastore, not"
2729 f" {type(source_datastore)}"
2730 )
2732 # Stop early if "direct" transfer mode is requested. That would
2733 # require that the URI inside the source datastore should be stored
2734 # directly in the target datastore, which seems unlikely to be useful
2735 # since at any moment the source datastore could delete the file.
2736 if transfer in ("direct", "split"):
2737 raise ValueError(
2738 f"Can not transfer from a source datastore using {transfer} mode since"
2739 " those files are controlled by the other datastore."
2740 )
2742 # Empty existence lookup if none given.
2743 if artifact_existence is None:
2744 artifact_existence = {}
2746 # We will go through the list multiple times so must convert
2747 # generators to lists.
2748 refs = list(refs)
2750 # In order to handle disassembled composites the code works
2751 # at the records level since it can assume that internal APIs
2752 # can be used.
2753 # - If the record already exists in the destination this is assumed
2754 # to be okay.
2755 # - If there is no record but the source and destination URIs are
2756 # identical no transfer is done but the record is added.
2757 # - If the source record refers to an absolute URI currently assume
2758 # that that URI should remain absolute and will be visible to the
2759 # destination butler. May need to have a flag to indicate whether
2760 # the dataset should be transferred. This will only happen if
2761 # the detached Butler has had a local ingest.
2763 # What we really want is all the records in the source datastore
2764 # associated with these refs. Or derived ones if they don't exist
2765 # in the source.
2766 source_records = source_datastore._get_stored_records_associated_with_refs(
2767 refs, ignore_datastore_records=True
2768 )
2770 # The source dataset_ids are the keys in these records
2771 source_ids = set(source_records)
2772 log.debug("Number of datastore records found in source: %d", len(source_ids))
2774 requested_ids = {ref.id for ref in refs}
2775 missing_ids = requested_ids - source_ids
2777 # Missing IDs can be okay if that datastore has allowed
2778 # gets based on file existence. Should we transfer what we can
2779 # or complain about it and warn?
2780 if missing_ids and not source_datastore.trustGetRequest:
2781 raise ValueError(
2782 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2783 )
2785 # Need to map these missing IDs to a DatasetRef so we can guess
2786 # the details.
2787 if missing_ids:
2788 log.info(
2789 "Number of expected datasets missing from source datastore records: %d out of %d",
2790 len(missing_ids),
2791 len(requested_ids),
2792 )
2793 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2795 # This should be chunked in case we end up having to check
2796 # the file store since we need some log output to show
2797 # progress.
2798 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2799 records = {}
2800 for missing in missing_ids_chunk:
2801 # Ask the source datastore where the missing artifacts
2802 # should be. An execution butler might not know about the
2803 # artifacts even if they are there.
2804 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2805 records[missing] = [info for _, info in expected]
2807 # Call the mexist helper method in case we have not already
2808 # checked these artifacts such that artifact_existence is
2809 # empty. This allows us to benefit from parallelism.
2810 # datastore.mexists() itself does not give us access to the
2811 # derived datastore record.
2812 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2813 ref_exists = source_datastore._process_mexists_records(
2814 id_to_ref, records, False, artifact_existence=artifact_existence
2815 )
2817 # Now go through the records and propagate the ones that exist.
2818 location_factory = source_datastore.locationFactory
2819 for missing, record_list in records.items():
2820 # Skip completely if the ref does not exist.
2821 ref = id_to_ref[missing]
2822 if not ref_exists[ref]:
2823 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2824 continue
2825 # Check for file artifact to decide which parts of a
2826 # disassembled composite do exist. If there is only a
2827 # single record we don't even need to look because it can't
2828 # be a composite and must exist.
2829 if len(record_list) == 1:
2830 dataset_records = record_list
2831 else:
2832 dataset_records = [
2833 record
2834 for record in record_list
2835 if artifact_existence[record.file_location(location_factory).uri]
2836 ]
2837 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2839 # Rely on source_records being a defaultdict.
2840 source_records[missing].extend(dataset_records)
2842 # See if we already have these records
2843 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2845 # The artifacts to register
2846 artifacts = []
2848 # Refs that already exist
2849 already_present = []
2851 # Refs that were rejected by this datastore.
2852 rejected = set()
2854 # Refs that were transferred successfully.
2855 accepted = set()
2857 # Record each time we have done a "direct" transfer.
2858 direct_transfers = []
2860 # Now can transfer the artifacts
2861 for ref in refs:
2862 if not self.constraints.isAcceptable(ref):
2863 # This datastore should not be accepting this dataset.
2864 rejected.add(ref)
2865 continue
2867 accepted.add(ref)
2869 if ref.id in target_records:
2870 # Already have an artifact for this.
2871 already_present.append(ref)
2872 continue
2874 # mypy needs to know these are always resolved refs
2875 for info in source_records[ref.id]:
2876 source_location = info.file_location(source_datastore.locationFactory)
2877 target_location = info.file_location(self.locationFactory)
2878 if source_location == target_location and not source_location.pathInStore.isabs():
2879 # Artifact is already in the target location.
2880 # (which is how execution butler currently runs)
2881 pass
2882 else:
2883 if target_location.pathInStore.isabs():
2884 # Just because we can see the artifact when running
2885 # the transfer doesn't mean it will be generally
2886 # accessible to a user of this butler. Need to decide
2887 # what to do about an absolute path.
2888 if transfer == "auto":
2889 # For "auto" transfers we allow the absolute URI
2890 # to be recorded in the target datastore.
2891 direct_transfers.append(source_location)
2892 else:
2893 # The user is explicitly requesting a transfer
2894 # even for an absolute URI. This requires us to
2895 # calculate the target path.
2896 template_ref = ref
2897 if info.component:
2898 template_ref = ref.makeComponentRef(info.component)
2899 target_location = self._calculate_ingested_datastore_name(
2900 source_location.uri,
2901 template_ref,
2902 )
2904 info = info.update(path=target_location.pathInStore.path)
2906 # Need to transfer it to the new location.
2907 # Assume we should always overwrite. If the artifact
2908 # is there this might indicate that a previous transfer
2909 # was interrupted but was not able to be rolled back
2910 # completely (eg pre-emption) so follow Datastore default
2911 # and overwrite.
2912 target_location.uri.transfer_from(
2913 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2914 )
2916 artifacts.append((ref, info))
2918 if direct_transfers:
2919 log.info(
2920 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2921 len(direct_transfers),
2922 "" if len(direct_transfers) == 1 else "s",
2923 )
2925 # We are overwriting previous datasets that may have already
2926 # existed. We therefore should ensure that we force the
2927 # datastore records to agree. Note that this can potentially lead
2928 # to difficulties if the dataset has previously been ingested
2929 # disassembled and is somehow now assembled, or vice versa.
2930 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE)
2932 if already_present:
2933 n_skipped = len(already_present)
2934 log.info(
2935 "Skipped transfer of %d dataset%s already present in datastore",
2936 n_skipped,
2937 "" if n_skipped == 1 else "s",
2938 )
2940 return accepted, rejected
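# Usage sketch (illustrative, not part of the original source), assuming
# ``source`` and ``target`` are both FileDatastore instances and ``refs``
# are resolved DatasetRef objects known to the source:
#
#     artifact_existence: dict[ResourcePath, bool] = {}
#     accepted, rejected = target.transfer_from(
#         source, refs, transfer="copy", artifact_existence=artifact_existence
#     )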
2942 @transactional
2943 def forget(self, refs: Iterable[DatasetRef]) -> None:
2944 # Docstring inherited.
2945 refs = list(refs)
2946 self.bridge.forget(refs)
2947 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2949 def validateConfiguration(
2950 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2951 ) -> None:
2952 """Validate some of the configuration for this datastore.
2954 Parameters
2955 ----------
2956 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2957 Entities to test against this configuration. Can be differing
2958 types.
2959 logFailures : `bool`, optional
2960 If `True`, output a log message for every validation error
2961 detected.
2963 Raises
2964 ------
2965 DatastoreValidationError
2966 Raised if there is a validation problem with a configuration.
2967 All the problems are reported in a single exception.
2969 Notes
2970 -----
2971 This method checks that all the supplied entities have valid file
2972 templates and also have formatters defined.
2973 """
2974 templateFailed = None
2975 try:
2976 self.templates.validateTemplates(entities, logFailures=logFailures)
2977 except FileTemplateValidationError as e:
2978 templateFailed = str(e)
2980 formatterFailed = []
2981 for entity in entities:
2982 try:
2983 self.formatterFactory.getFormatterClass(entity)
2984 except KeyError as e:
2985 formatterFailed.append(str(e))
2986 if logFailures:
2987 log.critical("Formatter failure: %s", e)
2989 if templateFailed or formatterFailed:
2990 messages = []
2991 if templateFailed:
2992 messages.append(templateFailed)
2993 if formatterFailed:
2994 messages.append(",".join(formatterFailed))
2995 msg = ";\n".join(messages)
2996 raise DatastoreValidationError(msg)
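# Usage sketch (illustrative, not part of the original source), assuming
# ``datastore`` is a configured FileDatastore and ``entities`` is a mixed
# collection of DatasetType and StorageClass instances to check:
#
#     try:
#         datastore.validateConfiguration(entities, logFailures=True)
#     except DatastoreValidationError:
#         ...  # every template/formatter problem is reported in one exception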
2998 def getLookupKeys(self) -> set[LookupKey]:
2999 # Docstring is inherited from base class
3000 return (
3001 self.templates.getLookupKeys()
3002 | self.formatterFactory.getLookupKeys()
3003 | self.constraints.getLookupKeys()
3004 )
3006 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
3007 # Docstring is inherited from base class
3008 # The key can be valid in either formatters or templates so we can
3009 # only check the template if it exists
3010 if lookupKey in self.templates:
3011 try:
3012 self.templates[lookupKey].validateTemplate(entity)
3013 except FileTemplateValidationError as e:
3014 raise DatastoreValidationError(e) from e
3016 def export(
3017 self,
3018 refs: Iterable[DatasetRef],
3019 *,
3020 directory: ResourcePathExpression | None = None,
3021 transfer: str | None = "auto",
3022 ) -> Iterable[FileDataset]:
3023 # Docstring inherited from Datastore.export.
3024 if transfer == "auto" and directory is None:
3025 transfer = None
3027 if transfer is not None and directory is None:
3028 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
3030 if transfer == "move":
3031 raise TypeError("Can not export by moving files out of datastore.")
3032 elif transfer == "direct":
3033 # For an export, treat this as equivalent to None. We do not
3034 # want an import to risk using absolute URIs to datasets owned
3035 # by another datastore.
3036 log.info("Treating 'direct' transfer mode as in-place export.")
3037 transfer = None
3039 # Force the directory to be a URI object
3040 directoryUri: ResourcePath | None = None
3041 if directory is not None:
3042 directoryUri = ResourcePath(directory, forceDirectory=True)
3044 if transfer is not None and directoryUri is not None and not directoryUri.exists():
3045 # mypy needs the second test
3046 raise FileNotFoundError(f"Export location {directory} does not exist")
3048 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
3049 for ref in progress.wrap(refs, "Exporting dataset files"):
3050 fileLocations = self._get_dataset_locations_info(ref)
3051 if not fileLocations:
3052 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
3053 # For now we can not export disassembled datasets
3054 if len(fileLocations) > 1:
3055 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
3056 location, storedFileInfo = fileLocations[0]
3058 pathInStore = location.pathInStore.path
3059 if transfer is None:
3060 # TODO: do we also need to return the readStorageClass somehow?
3061 # We will use the path in store directly. If this is an
3062 # absolute URI, preserve it.
3063 if location.pathInStore.isabs():
3064 pathInStore = str(location.uri)
3065 elif transfer == "direct":
3066 # Use full URIs to the remote store in the export
3067 pathInStore = str(location.uri)
3068 else:
3069 # mypy needs help
3070 assert directoryUri is not None, "directoryUri must be defined to get here"
3071 storeUri = ResourcePath(location.uri)
3073 # if the datastore has an absolute URI to a resource, we
3074 # have two options:
3075 # 1. Keep the absolute URI in the exported YAML
3076 # 2. Allocate a new name in the local datastore and transfer
3077 # it.
3078 # For now go with option 2
3079 if location.pathInStore.isabs():
3080 template = self.templates.getTemplate(ref)
3081 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
3082 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
3084 exportUri = directoryUri.join(pathInStore)
3085 exportUri.transfer_from(storeUri, transfer=transfer)
3087 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
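# Usage sketch (illustrative, not part of the original source), assuming
# ``datastore`` is a configured FileDatastore and ``refs`` are resolved,
# non-disassembled datasets; the export directory is hypothetical:
#
#     file_datasets = list(
#         datastore.export(refs, directory="/tmp/butler_export", transfer="copy")
#     )
#     # Each FileDataset pairs the exported path with its refs and formatter.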
3089 @staticmethod
3090 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
3091 """Compute the checksum of the supplied file.
3093 Parameters
3094 ----------
3095 uri : `lsst.resources.ResourcePath`
3096 Name of resource to calculate checksum from.
3097 algorithm : `str`, optional
3098 Name of algorithm to use. Must be one of the algorithms supported
3099 by :py:mod:`hashlib`.
3100 block_size : `int`
3101 Number of bytes to read from file at one time.
3103 Returns
3104 -------
3105 hexdigest : `str`
3106 Hex digest of the file.
3108 Notes
3109 -----
3110 Currently returns None if the URI is for a remote resource.
3111 """
3112 if algorithm not in hashlib.algorithms_guaranteed:
3113 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
3115 if not uri.isLocal:
3116 return None
3118 hasher = hashlib.new(algorithm)
3120 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f:
3121 for chunk in iter(lambda: f.read(block_size), b""):
3122 hasher.update(chunk)
3124 return hasher.hexdigest()
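# Usage sketch (illustrative, not part of the original source); the file
# path is hypothetical and the method returns None for non-local URIs:
#
#     from lsst.resources import ResourcePath
#
#     digest = FileDatastore.computeChecksum(ResourcePath("/tmp/example.fits"))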
3126 def needs_expanded_data_ids(
3127 self,
3128 transfer: str | None,
3129 entity: DatasetRef | DatasetType | StorageClass | None = None,
3130 ) -> bool:
3131 # Docstring inherited.
3132 # This _could_ also use entity to inspect whether the filename template
3133 # involves placeholders other than the required dimensions for its
3134 # dataset type, but that's not necessary for correctness; it just
3135 # enables more optimizations (perhaps only in theory).
3136 return transfer not in ("direct", None)
3138 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
3139 # Docstring inherited from the base class.
3140 record_data = data.get(self.name)
3141 if not record_data:
3142 return
3144 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records)
3146 # TODO: Verify that there are no unexpected table names in the dict?
3147 unpacked_records = []
3148 for dataset_id, dataset_data in record_data.records.items():
3149 records = dataset_data.get(self._table.name)
3150 if records:
3151 for info in records:
3152 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
3153 unpacked_records.append(info.to_record(dataset_id=dataset_id))
3154 if unpacked_records:
3155 self._table.insert(*unpacked_records, transaction=self._transaction)
3157 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
3158 # Docstring inherited from the base class.
3159 exported_refs = list(self._bridge.check(refs))
3160 ids = {ref.id for ref in exported_refs}
3161 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
3162 for row in self._table.fetch(dataset_id=ids):
3163 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
3164 dataset_records = records.setdefault(row["dataset_id"], {})
3165 dataset_records.setdefault(self._table.name, []).append(info)
3167 record_data = DatastoreRecordData(records=records)
3168 return {self.name: record_data}
3170 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
3171 # Docstring inherited from the base class.
3172 self._retrieve_dataset_method = method
3174 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
3175 """Update dataset reference to use the storage class from registry."""
3176 if self._retrieve_dataset_method is None:
3177 # We could raise an exception here but unit tests do not define
3178 # this method.
3179 return ref
3180 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
3181 if dataset_type is not None:
3182 ref = ref.overrideStorageClass(dataset_type.storageClass)
3183 return ref
3185 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
3186 # Docstring inherited from the base class.
3187 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}