Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%
909 statements
coverage.py v7.4.0, created at 2024-01-16 10:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Generic file-based datastore code."""
30from __future__ import annotations
32__all__ = ("FileDatastore",)
34import contextlib
35import hashlib
36import logging
37from collections import defaultdict
38from collections.abc import Callable, Iterable, Mapping, Sequence
39from typing import TYPE_CHECKING, Any, ClassVar, cast
41from lsst.daf.butler import (
42 Config,
43 DatasetId,
44 DatasetRef,
45 DatasetType,
46 DatasetTypeNotSupportedError,
47 FileDataset,
48 FileDescriptor,
49 Formatter,
50 FormatterFactory,
51 Location,
52 LocationFactory,
53 Progress,
54 StorageClass,
55 ddl,
56)
57from lsst.daf.butler.datastore import (
58 DatasetRefURIs,
59 Datastore,
60 DatastoreConfig,
61 DatastoreOpaqueTable,
62 DatastoreValidationError,
63)
64from lsst.daf.butler.datastore.cache_manager import (
65 AbstractDatastoreCacheManager,
66 DatastoreCacheManager,
67 DatastoreDisabledCacheManager,
68)
69from lsst.daf.butler.datastore.composites import CompositesMap
70from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError
71from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore
72from lsst.daf.butler.datastore.record_data import DatastoreRecordData
73from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo
74from lsst.daf.butler.datastores.file_datastore.get import (
75 DatasetLocationInformation,
76 DatastoreFileGetInformation,
77 generate_datastore_get_information,
78 get_dataset_as_python_object_from_get_info,
79)
80from lsst.daf.butler.datastores.fileDatastoreClient import (
81 FileDatastoreGetPayload,
82 FileDatastoreGetPayloadFileInfo,
83)
84from lsst.daf.butler.registry.interfaces import (
85 DatabaseInsertMode,
86 DatastoreRegistryBridge,
87 FakeDatasetRef,
88 ReadOnlyDatabaseError,
89)
90from lsst.daf.butler.repo_relocation import replaceRoot
91from lsst.daf.butler.utils import transactional
92from lsst.resources import ResourcePath, ResourcePathExpression
93from lsst.utils.introspection import get_class_of
94from lsst.utils.iteration import chunk_iterable
96# For VERBOSE logging usage.
97from lsst.utils.logging import VERBOSE, getLogger
98from sqlalchemy import BigInteger, String
100if TYPE_CHECKING:
101 from lsst.daf.butler import LookupKey
102 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
104log = getLogger(__name__)
107class _IngestPrepData(Datastore.IngestPrepData):
108 """Helper class for FileDatastore ingest implementation.
110 Parameters
111 ----------
112 datasets : `~collections.abc.Iterable` of `FileDataset`
113 Files to be ingested by this datastore.
114 """
116 def __init__(self, datasets: Iterable[FileDataset]):
117 super().__init__(ref for dataset in datasets for ref in dataset.refs)
118 self.datasets = datasets
121class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
122 """Generic Datastore for file-based implementations.
124 Should always be sub-classed since key abstract methods are missing.
126 Parameters
127 ----------
128 config : `DatastoreConfig` or `str`
129 Configuration as either a `Config` object or URI to file.
130 bridgeManager : `DatastoreRegistryBridgeManager`
131 Object that manages the interface between `Registry` and datastores.
132 butlerRoot : `str`, optional
133 New datastore root to use to override the configuration value.
135 Raises
136 ------
137 ValueError
138 If root location does not exist and ``create`` is `False` in the
139 configuration.
140 """
142 defaultConfigFile: ClassVar[str | None] = None
143 """Path to configuration defaults. Accessed within the ``config`` resource
144 or relative to a search path. Can be None if no defaults specified.
145 """
147 root: ResourcePath
148 """Root directory URI of this `Datastore`."""
150 locationFactory: LocationFactory
151 """Factory for creating locations relative to the datastore root."""
153 formatterFactory: FormatterFactory
154 """Factory for creating instances of formatters."""
156 templates: FileTemplates
157 """File templates that can be used by this `Datastore`."""
159 composites: CompositesMap
160 """Determines whether a dataset should be disassembled on put."""
162 defaultConfigFile = "datastores/fileDatastore.yaml"
163 """Path to configuration defaults. Accessed within the ``config`` resource
164 or relative to a search path. Can be None if no defaults specified.
165 """
167 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
168 """Callable that is used in trusted mode to retrieve registry definition
169 of a named dataset type.
170 """
172 @classmethod
173 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
174 """Set any filesystem-dependent config options for this Datastore to
175 be appropriate for a new empty repository with the given root.
177 Parameters
178 ----------
179 root : `str`
180 URI to the root of the data repository.
181 config : `Config`
182 A `Config` to update. Only the subset understood by
183 this component will be updated. Will not expand
184 defaults.
185 full : `Config`
186 A complete config with all defaults expanded that can be
187 converted to a `DatastoreConfig`. Read-only and will not be
188 modified by this method.
189 Repository-specific options that should not be obtained
190 from defaults when Butler instances are constructed
191 should be copied from ``full`` to ``config``.
192 overwrite : `bool`, optional
193 If `False`, do not modify a value in ``config`` if the value
194 already exists. Default is always to overwrite with the provided
195 ``root``.
197 Notes
198 -----
199 If a keyword is explicitly defined in the supplied ``config`` it
200 will not be overridden by this method if ``overwrite`` is `False`.
201 This allows explicit values set in external configs to be retained.
202 """
203 Config.updateParameters(
204 DatastoreConfig,
205 config,
206 full,
207 toUpdate={"root": root},
208 toCopy=("cls", ("records", "table")),
209 overwrite=overwrite,
210 )
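# Illustrative sketch (not part of this class) of how ``setConfigRoot`` is
# typically exercised when seeding a brand-new repository; the variable names
# and root URI below are hypothetical:
#
#     full = DatastoreConfig()            # defaults, fully expanded
#     config = Config()                   # seed config that will be written out
#     FileDatastore.setConfigRoot("file:///data/repo", config, full)
#     # ``config`` now carries the new "root" plus the "cls" and
#     # ("records", "table") entries copied from ``full``.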
212 @classmethod
213 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
214 return ddl.TableSpec(
215 fields=[
216 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
217 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
218 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
219 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
220 # Use empty string to indicate no component
221 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
222 # TODO: should checksum be Base64Bytes instead?
223 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
224 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
225 ],
226 unique=frozenset(),
227 indexes=[ddl.IndexSpec("path")],
228 )
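# For reference, a row in the opaque table described above has the shape
# sketched below (values are hypothetical; the empty ``component`` string is
# the "no component" sentinel noted in the spec):
#
#     record = {
#         "dataset_id": "<uuid>",
#         "path": "run/datasetType/datasetType_visit_123.json",
#         "formatter": "lsst.daf.butler.formatters.json.JsonFormatter",
#         "storage_class": "StructuredDataDict",
#         "component": "",
#         "checksum": None,
#         "file_size": 1024,
#     }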
230 def __init__(
231 self,
232 config: DatastoreConfig | ResourcePathExpression,
233 bridgeManager: DatastoreRegistryBridgeManager,
234 butlerRoot: str | None = None,
235 ):
236 super().__init__(config, bridgeManager)
237 if "root" not in self.config:
238 raise ValueError("No root directory specified in configuration")
240 # Name ourselves either using an explicit name or a name
241 # derived from the (unexpanded) root
242 if "name" in self.config:
243 self.name = self.config["name"]
244 else:
245 # We use the unexpanded root in the name to indicate that this
246 # datastore can be moved without having to update registry.
247 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
249 # Support repository relocation in config
250 # Existence of self.root is checked in subclass
251 self.root = ResourcePath(
252 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
253 )
255 self.locationFactory = LocationFactory(self.root)
256 self.formatterFactory = FormatterFactory()
258 # Now associate formatters with storage classes
259 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
261 # Read the file naming templates
262 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
264 # See if composites should be disassembled
265 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
267 self._opaque_table_name = self.config["records", "table"]
268 try:
269 # Storage of paths and formatters, keyed by dataset_id
270 self._table = bridgeManager.opaque.register(
271 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType)
272 )
273 # Interface to Registry.
274 self._bridge = bridgeManager.register(self.name)
275 except ReadOnlyDatabaseError:
276 # If the database is read only and we just tried and failed to
277 # create a table, it means someone is trying to create a read-only
278 # butler client for an empty repo. That should be okay, as long
279 # as they then try to get any datasets before some other client
280 # creates the table. Chances are they're just validating
281 # configuration.
282 pass
284 # Determine whether checksums should be used - default to False
285 self.useChecksum = self.config.get("checksum", False)
287 # Determine whether we can fall back to configuration if a
288 # requested dataset is not known to registry
289 self.trustGetRequest = self.config.get("trust_get_request", False)
291 # Create a cache manager
292 self.cacheManager: AbstractDatastoreCacheManager
293 if "cached" in self.config:
294 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
295 else:
296 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
298 # Check existence and create directory structure if necessary
299 if not self.root.exists():
300 if "create" not in self.config or not self.config["create"]:
301 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
302 try:
303 self.root.mkdir()
304 except Exception as e:
305 raise ValueError(
306 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
307 ) from e
309 def __str__(self) -> str:
310 return str(self.root)
312 @property
313 def bridge(self) -> DatastoreRegistryBridge:
314 return self._bridge
316 @property
317 def roots(self) -> dict[str, ResourcePath | None]:
318 # Docstring inherited.
319 return {self.name: self.root}
321 def _artifact_exists(self, location: Location) -> bool:
322 """Check that an artifact exists in this datastore at the specified
323 location.
325 Parameters
326 ----------
327 location : `Location`
328 Expected location of the artifact associated with this datastore.
330 Returns
331 -------
332 exists : `bool`
333 `True` if the location can be found, `False` otherwise.
334 """
335 log.debug("Checking if resource exists: %s", location.uri)
336 return location.uri.exists()
338 def _delete_artifact(self, location: Location) -> None:
339 """Delete the artifact from the datastore.
341 Parameters
342 ----------
343 location : `Location`
344 Location of the artifact associated with this datastore.
345 """
346 if location.pathInStore.isabs():
347 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
349 try:
350 location.uri.remove()
351 except FileNotFoundError:
352 log.debug("File %s did not exist and so could not be deleted.", location.uri)
353 raise
354 except Exception as e:
355 log.critical("Failed to delete file: %s (%s)", location.uri, e)
356 raise
357 log.debug("Successfully deleted file: %s", location.uri)
359 def addStoredItemInfo(
360 self,
361 refs: Iterable[DatasetRef],
362 infos: Iterable[StoredFileInfo],
363 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
364 ) -> None:
365 """Record internal storage information associated with one or more
366 datasets.
368 Parameters
369 ----------
370 refs : sequence of `DatasetRef`
371 The datasets that have been stored.
372 infos : sequence of `StoredDatastoreItemInfo`
373 Metadata associated with the stored datasets.
374 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`
375 Mode to use to insert the new records into the table. The
376 options are ``INSERT`` (error if pre-existing), ``REPLACE``
377 (replace content with new values), and ``ENSURE`` (skip if the row
378 already exists).
379 """
380 records = [
381 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True)
382 ]
383 match insert_mode:
384 case DatabaseInsertMode.INSERT:
385 self._table.insert(*records, transaction=self._transaction)
386 case DatabaseInsertMode.ENSURE:
387 self._table.ensure(*records, transaction=self._transaction)
388 case DatabaseInsertMode.REPLACE:
389 self._table.replace(*records, transaction=self._transaction)
390 case _:
391 raise ValueError(f"Unknown insert mode of '{insert_mode}'")
393 def getStoredItemsInfo(
394 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
395 ) -> list[StoredFileInfo]:
396 """Retrieve information associated with files stored in this
397 `Datastore` associated with this dataset ref.
399 Parameters
400 ----------
401 ref : `DatasetRef`
402 The dataset that is to be queried.
403 ignore_datastore_records : `bool`
404 If `True` then do not use datastore records stored in refs.
406 Returns
407 -------
408 items : `list` [`StoredFileInfo`]
409 Stored information about the files and formatters associated
410 with this dataset. Only one file will be returned
411 if the dataset has not been disassembled. Can return an empty
412 list if no matching datasets can be found.
413 """
414 # Try to get them from the ref first.
415 if ref._datastore_records is not None and not ignore_datastore_records:
416 if (ref_records := ref._datastore_records.get(self._table.name)) is not None:
417 # Need to make sure they have correct type.
418 for record in ref_records:
419 if not isinstance(record, StoredFileInfo):
420 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}")
421 return cast(list[StoredFileInfo], ref_records)
423 # Look for the dataset_id -- there might be multiple matches
424 # if we have disassembled the dataset.
425 records = self._table.fetch(dataset_id=ref.id)
426 return [StoredFileInfo.from_record(record) for record in records]
428 def _register_datasets(
429 self,
430 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]],
431 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
432 ) -> None:
433 """Update registry to indicate that one or more datasets have been
434 stored.
436 Parameters
437 ----------
438 refsAndInfos : sequence of `tuple` [`DatasetRef`,
439 `StoredDatastoreItemInfo`]
440 Datasets to register and the internal datastore metadata associated
441 with them.
442 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`, optional
443 Indicate whether the new records should be new ("insert", the
444 default), allowed to already exist ("ensure"), or replaced if
445 already present ("replace").
446 """
447 expandedRefs: list[DatasetRef] = []
448 expandedItemInfos: list[StoredFileInfo] = []
450 for ref, itemInfo in refsAndInfos:
451 expandedRefs.append(ref)
452 expandedItemInfos.append(itemInfo)
454 # Dataset location only cares about registry ID so if we have
455 # disassembled in datastore we have to deduplicate. Since they
456 # will have different datasetTypes we can't use a set
457 registryRefs = {r.id: r for r in expandedRefs}
458 if insert_mode == DatabaseInsertMode.INSERT:
459 self.bridge.insert(registryRefs.values())
460 else:
461 # There are only two columns and all that matters is the
462 # dataset ID.
463 self.bridge.ensure(registryRefs.values())
464 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode)
466 def _get_stored_records_associated_with_refs(
467 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False
468 ) -> dict[DatasetId, list[StoredFileInfo]]:
469 """Retrieve all records associated with the provided refs.
471 Parameters
472 ----------
473 refs : iterable of `DatasetIdRef`
474 The refs for which records are to be retrieved.
475 ignore_datastore_records : `bool`
476 If `True` then do not use datastore records stored in refs.
478 Returns
479 -------
480 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
481 The matching records indexed by the ref ID. The number of entries
482 in the dict can be smaller than the number of requested refs.
483 """
484 # Check datastore records in refs first.
485 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list)
486 refs_with_no_records = []
487 for ref in refs:
488 if ignore_datastore_records or ref._datastore_records is None:
489 refs_with_no_records.append(ref)
490 else:
491 if (ref_records := ref._datastore_records.get(self._table.name)) is not None:
492 # Need to make sure they have correct type.
493 for ref_record in ref_records:
494 if not isinstance(ref_record, StoredFileInfo):
495 raise TypeError(
496 f"Datastore record has unexpected type {ref_record.__class__.__name__}"
497 )
498 records_by_ref[ref.id].append(ref_record)
500 # If there were any refs without datastore records, check opaque table.
501 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records])
503 # Uniqueness is dataset_id + component so can have multiple records
504 # per ref.
505 for record in records:
506 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
507 return records_by_ref
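# Sketch of the returned mapping (hypothetical IDs and paths): a dataset that
# was disassembled on put contributes one StoredFileInfo per component under
# the same dataset ID, while a plain dataset contributes exactly one entry.
#
#     {
#         dataset_id_1: [StoredFileInfo(path="a/b.json", component=None, ...)],
#         dataset_id_2: [
#             StoredFileInfo(path="c/d_image.fits", component="image", ...),
#             StoredFileInfo(path="c/d_mask.fits", component="mask", ...),
#         ],
#     }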
509 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
510 """Return paths and associated dataset refs.
512 Parameters
513 ----------
514 paths : `list` of `str` or `lsst.resources.ResourcePath`
515 All the paths to include in search.
517 Returns
518 -------
519 mapping : `dict` of [`str`, `set` [`DatasetId`]]
520 Mapping of each path to a set of associated database IDs.
521 """
522 records = self._table.fetch(path=[str(path) for path in paths])
523 result = defaultdict(set)
524 for row in records:
525 result[row["path"]].add(row["dataset_id"])
526 return result
528 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
529 """Return all dataset refs associated with the supplied path.
531 Parameters
532 ----------
533 pathInStore : `lsst.resources.ResourcePath`
534 Path of interest in the data store.
536 Returns
537 -------
538 ids : `set` of `int`
539 All `DatasetRef` IDs associated with this path.
540 """
541 records = list(self._table.fetch(path=str(pathInStore)))
542 ids = {r["dataset_id"] for r in records}
543 return ids
545 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
546 """Remove information about the file associated with this dataset.
548 Parameters
549 ----------
550 ref : `DatasetRef`
551 The dataset that has been removed.
552 """
553 # Note that this method is actually not used by this implementation,
554 # we depend on bridge to delete opaque records. But there are some
555 # tests that check that this method works, so we keep it for now.
556 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
558 def _get_dataset_locations_info(
559 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
560 ) -> list[DatasetLocationInformation]:
561 r"""Find all the `Location`\ s of the requested dataset in the
562 `Datastore` and the associated stored file information.
564 Parameters
565 ----------
566 ref : `DatasetRef`
567 Reference to the required `Dataset`.
568 ignore_datastore_records : `bool`
569 If `True` then do not use datastore records stored in refs.
571 Returns
572 -------
573 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
574 Location of the dataset within the datastore and
575 stored information about each file and its formatter.
576 """
577 # Get the file information (this will fail if no file)
578 records = self.getStoredItemsInfo(ref, ignore_datastore_records)
580 # Use the path to determine the location -- we need to take
581 # into account absolute URIs in the datastore record
582 return [(r.file_location(self.locationFactory), r) for r in records]
584 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
585 """Check that there is only one dataset associated with the
586 specified artifact.
588 Parameters
589 ----------
590 ref : `DatasetRef` or `FakeDatasetRef`
591 Dataset to be removed.
592 location : `Location`
593 The location of the artifact to be removed.
595 Returns
596 -------
597 can_remove : `bool`
598 `True` if the artifact can be safely removed.
599 """
600 # Can't ever delete absolute URIs.
601 if location.pathInStore.isabs():
602 return False
604 # Get all entries associated with this path
605 allRefs = self._registered_refs_per_artifact(location.pathInStore)
606 if not allRefs:
607 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
609 # Remove these refs from all the refs and if there is nothing left
610 # then we can delete
611 remainingRefs = allRefs - {ref.id}
613 if remainingRefs:
614 return False
615 return True
617 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
618 """Predict the location and related file information of the requested
619 dataset in this datastore.
621 Parameters
622 ----------
623 ref : `DatasetRef`
624 Reference to the required `Dataset`.
626 Returns
627 -------
628 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
629 Expected Location of the dataset within the datastore and
630 placeholder information about each file and its formatter.
632 Notes
633 -----
634 Uses the current configuration to determine how we would expect the
635 datastore files to have been written if we couldn't ask registry.
636 This is safe so long as there has been no change to datastore
637 configuration between writing the dataset and wanting to read it.
638 Will not work for files that have been ingested without using the
639 standard file template or default formatter.
640 """
641 # If we have a component ref we always need to ask the questions
642 # of the composite. If the composite is disassembled this routine
643 # should return all components. If the composite was not
644 # disassembled the composite is what is stored regardless of
645 # component request. Note that if the caller has disassembled
646 # a composite there is no way for this guess to know that
647 # without trying both the composite and component ref and seeing
648 # if there is something at the component Location even without
649 # disassembly being enabled.
650 if ref.datasetType.isComponent():
651 ref = ref.makeCompositeRef()
653 # See if the ref is a composite that should be disassembled
654 doDisassembly = self.composites.shouldBeDisassembled(ref)
656 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
658 if doDisassembly:
659 for component, componentStorage in ref.datasetType.storageClass.components.items():
660 compRef = ref.makeComponentRef(component)
661 location, formatter = self._determine_put_formatter_location(compRef)
662 all_info.append((location, formatter, componentStorage, component))
664 else:
665 # Always use the composite ref if no disassembly
666 location, formatter = self._determine_put_formatter_location(ref)
667 all_info.append((location, formatter, ref.datasetType.storageClass, None))
669 # Convert the list of tuples to have StoredFileInfo as second element
670 return [
671 (
672 location,
673 StoredFileInfo(
674 formatter=formatter,
675 path=location.pathInStore.path,
676 storageClass=storageClass,
677 component=component,
678 checksum=None,
679 file_size=-1,
680 ),
681 )
682 for location, formatter, storageClass, component in all_info
683 ]
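# Worked example of the guess above (hypothetical names): for a disassembled
# "Exposure"-like composite this returns one (Location, StoredFileInfo) pair
# per storage-class component, each with ``file_size=-1`` and no checksum
# because nothing has actually been read from disk:
#
#     [(Location(".../a_image.fits"), StoredFileInfo(component="image", ...)),
#      (Location(".../a_mask.fits"),  StoredFileInfo(component="mask", ...)),
#      (Location(".../a_wcs.fits"),   StoredFileInfo(component="wcs", ...))]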
685 def _prepare_for_direct_get(
686 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
687 ) -> list[DatastoreFileGetInformation]:
688 """Check parameters for ``get`` and obtain formatter and
689 location.
691 Parameters
692 ----------
693 ref : `DatasetRef`
694 Reference to the required Dataset.
695 parameters : `dict`
696 `StorageClass`-specific parameters that specify, for example,
697 a slice of the dataset to be loaded.
699 Returns
700 -------
701 getInfo : `list` [`DatastoreFileGetInformation`]
702 Parameters needed to retrieve each file.
703 """
704 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
706 # The storage class we want to use eventually
707 refStorageClass = ref.datasetType.storageClass
709 # For trusted mode need to reset storage class.
710 ref = self._cast_storage_class(ref)
712 # Get file metadata and internal metadata
713 fileLocations = self._get_dataset_locations_info(ref)
714 if not fileLocations:
715 if not self.trustGetRequest:
716 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
717 # Assume the dataset is where we think it should be
718 fileLocations = self._get_expected_dataset_locations_info(ref)
720 if len(fileLocations) > 1:
721 # If trust is involved it is possible that there will be
722 # components listed here that do not exist in the datastore.
723 # Explicitly check for file artifact existence and filter out any
724 # that are missing.
725 if self.trustGetRequest:
726 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
728 # For now complain only if we have no components at all. One
729 # component is probably a problem but we can punt that to the
730 # assembler.
731 if not fileLocations:
732 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
734 return generate_datastore_get_information(
735 fileLocations,
736 readStorageClass=refStorageClass,
737 ref=ref,
738 parameters=parameters,
739 )
741 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
742 """Check the arguments for ``put`` and obtain formatter and
743 location.
745 Parameters
746 ----------
747 inMemoryDataset : `object`
748 The dataset to store.
749 ref : `DatasetRef`
750 Reference to the associated Dataset.
752 Returns
753 -------
754 location : `Location`
755 The location to write the dataset.
756 formatter : `Formatter`
757 The `Formatter` to use to write the dataset.
759 Raises
760 ------
761 TypeError
762 Supplied object and storage class are inconsistent.
763 DatasetTypeNotSupportedError
764 The associated `DatasetType` is not handled by this datastore.
765 """
766 self._validate_put_parameters(inMemoryDataset, ref)
767 return self._determine_put_formatter_location(ref)
769 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
770 """Calculate the formatter and output location to use for put.
772 Parameters
773 ----------
774 ref : `DatasetRef`
775 Reference to the associated Dataset.
777 Returns
778 -------
779 location : `Location`
780 The location to write the dataset.
781 formatter : `Formatter`
782 The `Formatter` to use to write the dataset.
783 """
784 # Work out output file name
785 try:
786 template = self.templates.getTemplate(ref)
787 except KeyError as e:
788 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
790 # Validate the template to protect against filenames from different
791 # dataIds returning the same and causing overwrite confusion.
792 template.validateTemplate(ref)
794 location = self.locationFactory.fromPath(template.format(ref))
796 # Get the formatter based on the storage class
797 storageClass = ref.datasetType.storageClass
798 try:
799 formatter = self.formatterFactory.getFormatter(
800 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
801 )
802 except KeyError as e:
803 raise DatasetTypeNotSupportedError(
804 f"Unable to find formatter for {ref} in datastore {self.name}"
805 ) from e
807 # Now that we know the formatter, update the location
808 location = formatter.makeUpdatedLocation(location)
810 return location, formatter
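# Rough walk-through of the steps above with a hypothetical template:
#
#     template : "{run}/{datasetType}/{datasetType}_{visit}"
#     formatted: "myrun/calexp/calexp_903334"        (Location within the root)
#     formatter: chosen from the storage class, then asked to update the
#                location, typically by appending its file extension:
#     final    : "myrun/calexp/calexp_903334.fits"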
812 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
813 # Docstring inherited from base class
814 if transfer != "auto":
815 return transfer
817 # See if the paths are within the datastore or not
818 inside = [self._pathInStore(d.path) is not None for d in datasets]
820 if all(inside):
821 transfer = None
822 elif not any(inside):
823 # Allow ResourcePath to use its own knowledge
824 transfer = "auto"
825 else:
826 # This can happen when importing from a datastore that
827 # has had some datasets ingested using "direct" mode.
828 # Also allow ResourcePath to sort it out, but warn about it.
831 log.warning(
832 "Some datasets are inside the datastore and some are outside. Using 'split' "
833 "transfer mode. This assumes that the files outside the datastore are "
834 "still accessible to the new butler since they will not be copied into "
835 "the target datastore."
836 )
837 transfer = "split"
839 return transfer
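# Summary of the "auto" resolution implemented above:
#
#     every path inside the root  -> transfer=None   (ingest in place)
#     no path inside the root     -> transfer="auto" (ResourcePath decides)
#     a mixture of the two        -> transfer="split" (in-root files are
#                                    referenced relatively; out-of-root files
#                                    are kept where they are, like "direct")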
841 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
842 """Return path relative to datastore root.
844 Parameters
845 ----------
846 path : `lsst.resources.ResourcePathExpression`
847 Path to dataset. Can be an absolute URI. If relative, it is
848 assumed to be relative to the datastore root. The path is
849 returned relative to that root, or `None` if it lies outside it.
851 Returns
852 -------
853 inStore : `str` or `None`
854 Path relative to datastore root. Returns `None` if the file is
855 outside the root.
856 """
857 # Relative path will always be relative to datastore
858 pathUri = ResourcePath(path, forceAbsolute=False)
859 return pathUri.relative_to(self.root)
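# Example of the ``relative_to`` behaviour relied on here (hypothetical
# paths); a `None` result is how "outside the datastore" is signalled:
#
#     root = ResourcePath("file:///repo/", forceDirectory=True)
#     ResourcePath("file:///repo/a/b.fits").relative_to(root)    # -> "a/b.fits"
#     ResourcePath("file:///other/b.fits").relative_to(root)     # -> None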
861 def _standardizeIngestPath(
862 self, path: str | ResourcePath, *, transfer: str | None = None
863 ) -> str | ResourcePath:
864 """Standardize the path of a to-be-ingested file.
866 Parameters
867 ----------
868 path : `str` or `lsst.resources.ResourcePath`
869 Path of a file to be ingested. This parameter is not expected
870 to accept all the types that can be used to construct a
871 `~lsst.resources.ResourcePath`.
872 transfer : `str`, optional
873 How (and whether) the dataset should be added to the datastore.
874 See `ingest` for details of transfer modes.
875 This implementation is provided only so
876 `NotImplementedError` can be raised if the mode is not supported;
877 actual transfers are deferred to `_extractIngestInfo`.
879 Returns
880 -------
881 path : `str` or `lsst.resources.ResourcePath`
882 New path in what the datastore considers standard form. If an
883 absolute URI was given that will be returned unchanged.
885 Notes
886 -----
887 Subclasses of `FileDatastore` can implement this method instead
888 of `_prepIngest`. It should not modify the data repository or given
889 file in any way.
891 Raises
892 ------
893 NotImplementedError
894 Raised if the datastore does not support the given transfer mode
895 (including the case where ingest is not supported at all).
896 FileNotFoundError
897 Raised if one of the given files does not exist.
898 """
899 if transfer not in (None, "direct", "split") + self.root.transferModes:
900 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
902 # A relative URI indicates relative to datastore root
903 srcUri = ResourcePath(path, forceAbsolute=False)
904 if not srcUri.isabs():
905 srcUri = self.root.join(path)
907 if not srcUri.exists():
908 raise FileNotFoundError(
909 f"Resource at {srcUri} does not exist; note that paths to ingest "
910 f"are assumed to be relative to {self.root} unless they are absolute."
911 )
913 if transfer is None:
914 relpath = srcUri.relative_to(self.root)
915 if not relpath:
916 raise RuntimeError(
917 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
918 )
920 # Return the relative path within the datastore for internal
921 # transfer
922 path = relpath
924 return path
926 def _extractIngestInfo(
927 self,
928 path: ResourcePathExpression,
929 ref: DatasetRef,
930 *,
931 formatter: Formatter | type[Formatter],
932 transfer: str | None = None,
933 record_validation_info: bool = True,
934 ) -> StoredFileInfo:
935 """Relocate (if necessary) and extract `StoredFileInfo` from a
936 to-be-ingested file.
938 Parameters
939 ----------
940 path : `lsst.resources.ResourcePathExpression`
941 URI or path of a file to be ingested.
942 ref : `DatasetRef`
943 Reference for the dataset being ingested. Guaranteed to have
944 ``dataset_id not None``.
945 formatter : `type` or `Formatter`
946 `Formatter` subclass to use for this dataset or an instance.
947 transfer : `str`, optional
948 How (and whether) the dataset should be added to the datastore.
949 See `ingest` for details of transfer modes.
950 record_validation_info : `bool`, optional
951 If `True`, the default, the datastore can record validation
952 information associated with the file. If `False` the datastore
953 will not attempt to track any information such as checksums
954 or file sizes. This can be useful if such information is tracked
955 in an external system or if the file is to be compressed in place.
956 It is up to the datastore whether this parameter is relevant.
958 Returns
959 -------
960 info : `StoredFileInfo`
961 Internal datastore record for this file. This will be inserted by
962 the caller; `_extractIngestInfo` is only responsible for
963 creating and populating the struct.
965 Raises
966 ------
967 FileNotFoundError
968 Raised if one of the given files does not exist.
969 FileExistsError
970 Raised if transfer is not `None` but the (internal) location the
971 file would be moved to is already occupied.
972 """
973 if self._transaction is None:
974 raise RuntimeError("Ingest called without transaction enabled")
976 # Create URI of the source path, do not need to force a relative
977 # path to absolute.
978 srcUri = ResourcePath(path, forceAbsolute=False)
980 # Track whether we have read the size of the source yet
981 have_sized = False
983 tgtLocation: Location | None
984 if transfer is None or transfer == "split":
985 # A relative path is assumed to be relative to the datastore
986 # in this context
987 if not srcUri.isabs():
988 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
989 else:
990 # Work out the path in the datastore from an absolute URI
991 # This is required to be within the datastore.
992 pathInStore = srcUri.relative_to(self.root)
993 if pathInStore is None and transfer is None:
994 raise RuntimeError(
995 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
996 )
997 if pathInStore:
998 tgtLocation = self.locationFactory.fromPath(pathInStore)
999 elif transfer == "split":
1000 # Outside the datastore but treat that as a direct ingest
1001 # instead.
1002 tgtLocation = None
1003 else:
1004 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
1005 elif transfer == "direct":
1006 # Want to store the full URI to the resource directly in
1007 # datastore. This is useful for referring to permanent archive
1008 # storage for raw data.
1009 # Trust that people know what they are doing.
1010 tgtLocation = None
1011 else:
1012 # Work out the name we want this ingested file to have
1013 # inside the datastore
1014 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
1015 if not tgtLocation.uri.dirname().exists():
1016 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
1017 tgtLocation.uri.dirname().mkdir()
1019 # if we are transferring from a local file to a remote location
1020 # it may be more efficient to get the size and checksum of the
1021 # local file rather than the transferred one
1022 if record_validation_info and srcUri.isLocal:
1023 size = srcUri.size()
1024 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
1025 have_sized = True
1027 # Transfer the resource to the destination.
1028 # Allow overwrite of an existing file. This matches the behavior
1029 # of datastore.put() in that it trusts that registry would not
1030 # be asking to overwrite unless registry thought that the
1031 # overwrite was allowed.
1032 tgtLocation.uri.transfer_from(
1033 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
1034 )
1036 if tgtLocation is None:
1037 # This means we are using direct mode
1038 targetUri = srcUri
1039 targetPath = str(srcUri)
1040 else:
1041 targetUri = tgtLocation.uri
1042 targetPath = tgtLocation.pathInStore.path
1044 # the file should exist in the datastore now
1045 if record_validation_info:
1046 if not have_sized:
1047 size = targetUri.size()
1048 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
1049 else:
1050 # Not recording any file information.
1051 size = -1
1052 checksum = None
1054 return StoredFileInfo(
1055 formatter=formatter,
1056 path=targetPath,
1057 storageClass=ref.datasetType.storageClass,
1058 component=ref.datasetType.component(),
1059 file_size=size,
1060 checksum=checksum,
1061 )
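# ``computeChecksum`` (defined later in this class) is only consulted when
# ``useChecksum`` is enabled. A minimal sketch of what such a helper usually
# looks like, using the ``hashlib`` import above -- an assumption for
# illustration, not this method's actual implementation:
#
#     def _sketch_checksum(uri: ResourcePath, algorithm: str = "blake2b") -> str:
#         hasher = hashlib.new(algorithm)
#         with uri.open("rb") as fd:
#             for chunk in iter(lambda: fd.read(8192), b""):
#                 hasher.update(chunk)
#         return hasher.hexdigest()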
1063 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
1064 # Docstring inherited from Datastore._prepIngest.
1065 filtered = []
1066 for dataset in datasets:
1067 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1068 if not acceptable:
1069 continue
1070 else:
1071 dataset.refs = acceptable
1072 if dataset.formatter is None:
1073 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1074 else:
1075 assert isinstance(dataset.formatter, type | str)
1076 formatter_class = get_class_of(dataset.formatter)
1077 if not issubclass(formatter_class, Formatter):
1078 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1079 dataset.formatter = formatter_class
1080 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1081 filtered.append(dataset)
1082 return _IngestPrepData(filtered)
1084 @transactional
1085 def _finishIngest(
1086 self,
1087 prepData: Datastore.IngestPrepData,
1088 *,
1089 transfer: str | None = None,
1090 record_validation_info: bool = True,
1091 ) -> None:
1092 # Docstring inherited from Datastore._finishIngest.
1093 refsAndInfos = []
1094 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1095 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1096 # Do ingest as if the first dataset ref is associated with the file
1097 info = self._extractIngestInfo(
1098 dataset.path,
1099 dataset.refs[0],
1100 formatter=dataset.formatter,
1101 transfer=transfer,
1102 record_validation_info=record_validation_info,
1103 )
1104 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1106 # In direct mode we can allow repeated ingests of the same thing
1107 # if we are sure that the external dataset is immutable. We use
1108 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are
1109 # separated.
1110 refs_and_infos_replace = []
1111 refs_and_infos_insert = []
1112 if transfer == "direct":
1113 for entry in refsAndInfos:
1114 if entry[0].id.version == 5:
1115 refs_and_infos_replace.append(entry)
1116 else:
1117 refs_and_infos_insert.append(entry)
1118 else:
1119 refs_and_infos_insert = refsAndInfos
1121 if refs_and_infos_insert:
1122 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT)
1123 if refs_and_infos_replace:
1124 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE)
1126 def _calculate_ingested_datastore_name(
1127 self,
1128 srcUri: ResourcePath,
1129 ref: DatasetRef,
1130 formatter: Formatter | type[Formatter] | None = None,
1131 ) -> Location:
1132 """Given a source URI and a DatasetRef, determine the name the
1133 dataset will have inside datastore.
1135 Parameters
1136 ----------
1137 srcUri : `lsst.resources.ResourcePath`
1138 URI to the source dataset file.
1139 ref : `DatasetRef`
1140 Ref associated with the newly-ingested dataset artifact. This
1141 is used to determine the name within the datastore.
1142 formatter : `Formatter` or `type` [`Formatter`], optional
1143 Formatter to use for validation. Can be a class or an instance.
1144 No validation of the file extension is performed if the
1145 ``formatter`` is `None`. This can be used if the caller knows
1146 that the source URI and target URI will use the same formatter.
1148 Returns
1149 -------
1150 location : `Location`
1151 Target location for the newly-ingested dataset.
1152 """
1153 # Ingesting a file from outside the datastore.
1154 # This involves a new name.
1155 template = self.templates.getTemplate(ref)
1156 location = self.locationFactory.fromPath(template.format(ref))
1158 # Get the extension
1159 ext = srcUri.getExtension()
1161 # Update the destination to include that extension
1162 location.updateExtension(ext)
1164 # Ask the formatter to validate this extension
1165 if formatter is not None:
1166 formatter.validateExtension(location)
1168 return location
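# Illustration (hypothetical template): ingesting "/staging/exp_01.fits" for a
# ref whose template formats to "raw/exp_01" yields the in-store location
# "raw/exp_01.fits" -- the source file's extension is preserved and, if a
# formatter was supplied, validated against it.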
1170 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1171 """Write out in memory dataset to datastore.
1173 Parameters
1174 ----------
1175 inMemoryDataset : `object`
1176 Dataset to write to datastore.
1177 ref : `DatasetRef`
1178 Registry information associated with this dataset.
1180 Returns
1181 -------
1182 info : `StoredFileInfo`
1183 Information describing the artifact written to the datastore.
1184 """
1185 # May need to coerce the in memory dataset to the correct
1186 # python type, but first we need to make sure the storage class
1187 # reflects the one defined in the data repository.
1188 ref = self._cast_storage_class(ref)
1189 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1191 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1192 uri = location.uri
1194 if not uri.dirname().exists():
1195 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1196 uri.dirname().mkdir()
1198 if self._transaction is None:
1199 raise RuntimeError("Attempting to write artifact without transaction enabled")
1201 def _removeFileExists(uri: ResourcePath) -> None:
1202 """Remove a file and do not complain if it is not there.
1204 This is important since a formatter might fail before the file
1205 is written and we should not confuse people by writing spurious
1206 error messages to the log.
1207 """
1208 with contextlib.suppress(FileNotFoundError):
1209 uri.remove()
1211 # Register a callback to try to delete the uploaded data if
1212 # something fails below
1213 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1215 data_written = False
1217 # For remote URIs some datasets can be serialized directly
1218 # to bytes and sent to the remote datastore without writing a
1219 # file. If the dataset is intended to be saved to the cache
1220 # a file is always written and direct write to the remote
1221 # datastore is bypassed.
1222 if not uri.isLocal and not self.cacheManager.should_be_cached(ref):
1223 # Remote URI that is not cached so can write directly.
1224 try:
1225 serializedDataset = formatter.toBytes(inMemoryDataset)
1226 except NotImplementedError:
1227 # Fallback to the file writing option.
1228 pass
1229 except Exception as e:
1230 raise RuntimeError(
1231 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1232 ) from e
1233 else:
1234 log.debug("Writing bytes directly to %s", uri)
1235 uri.write(serializedDataset, overwrite=True)
1236 log.debug("Successfully wrote bytes directly to %s", uri)
1237 data_written = True
1239 if not data_written:
1240 # Did not write the bytes directly to object store so instead
1241 # write to temporary file. Always write to a temporary even if
1242 # using a local file system -- that gives us atomic writes.
1243 # If a process is killed as the file is being written we do not
1244 # want it to remain in the correct place but in corrupt state.
1245 # For local files write to the output directory not temporary dir.
1246 prefix = uri.dirname() if uri.isLocal else None
1247 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1248 # Need to configure the formatter to write to a different
1249 # location and that needs us to overwrite internals
1250 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1251 with formatter._updateLocation(Location(None, temporary_uri)):
1252 try:
1253 formatter.write(inMemoryDataset)
1254 except Exception as e:
1255 raise RuntimeError(
1256 f"Failed to serialize dataset {ref} of type"
1257 f" {type(inMemoryDataset)} to "
1258 f"temporary location {temporary_uri}"
1259 ) from e
1261 # Use move for a local file since that becomes an efficient
1262 # os.rename. For remote resources we use copy to allow the
1263 # file to be cached afterwards.
1264 transfer = "move" if uri.isLocal else "copy"
1266 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1268 if transfer == "copy":
1269 # Cache if required
1270 self.cacheManager.move_to_cache(temporary_uri, ref)
1272 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1274 # URI is needed to resolve what ingest case are we dealing with
1275 return self._extractIngestInfo(uri, ref, formatter=formatter)
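# Write-path summary for the method above:
#
#     remote URI and not cacheable -> formatter.toBytes() + uri.write()
#                                     (falls back to the file path if the
#                                     formatter cannot serialize to bytes)
#     everything else              -> write to a temporary file, then "move"
#                                     (local) or "copy" (remote, so the
#                                     temporary can still feed the cache)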
1277 def knows(self, ref: DatasetRef) -> bool:
1278 """Check if the dataset is known to the datastore.
1280 Does not check for existence of any artifact.
1282 Parameters
1283 ----------
1284 ref : `DatasetRef`
1285 Reference to the required dataset.
1287 Returns
1288 -------
1289 exists : `bool`
1290 `True` if the dataset is known to the datastore.
1291 """
1292 # We cannot trust datastore records from ref, as many unit tests delete
1293 # datasets and check their existence.
1294 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True)
1295 if fileLocations:
1296 return True
1297 return False
1299 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1300 # Docstring inherited from the base class.
1302 # The records themselves. Could be missing some entries.
1303 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
1305 return {ref: ref.id in records for ref in refs}
1307 def _process_mexists_records(
1308 self,
1309 id_to_ref: dict[DatasetId, DatasetRef],
1310 records: dict[DatasetId, list[StoredFileInfo]],
1311 all_required: bool,
1312 artifact_existence: dict[ResourcePath, bool] | None = None,
1313 ) -> dict[DatasetRef, bool]:
1314 """Check given records for existence.
1316 Helper function for `mexists()`.
1318 Parameters
1319 ----------
1320 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1321 Mapping of the dataset ID to the dataset ref itself.
1322 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1323 Records as generally returned by
1324 ``_get_stored_records_associated_with_refs``.
1325 all_required : `bool`
1326 Flag to indicate whether all artifacts associated with a
1327 dataset ID must exist for the dataset to be considered to exist.
1328 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1329 Optional mapping of datastore artifact to existence. Updated by
1330 this method with details of all artifacts tested. Can be `None`
1331 if the caller is not interested.
1333 Returns
1334 -------
1335 existence : `dict` of [`DatasetRef`, `bool`]
1336 Mapping from dataset to boolean indicating existence.
1337 """
1338 # The URIs to be checked and a mapping of those URIs to
1339 # the dataset ID.
1340 uris_to_check: list[ResourcePath] = []
1341 location_map: dict[ResourcePath, DatasetId] = {}
1343 location_factory = self.locationFactory
1345 uri_existence: dict[ResourcePath, bool] = {}
1346 for ref_id, infos in records.items():
1347 # Key is the dataset ID, value is a list of StoredFileInfo.
1348 uris = [info.file_location(location_factory).uri for info in infos]
1349 location_map.update({uri: ref_id for uri in uris})
1351 # Check the local cache directly for a dataset corresponding
1352 # to the remote URI.
1353 if self.cacheManager.file_count > 0:
1354 ref = id_to_ref[ref_id]
1355 for uri, storedFileInfo in zip(uris, infos, strict=True):
1356 check_ref = ref
1357 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1358 check_ref = ref.makeComponentRef(component)
1359 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1360 # Proxy for URI existence.
1361 uri_existence[uri] = True
1362 else:
1363 uris_to_check.append(uri)
1364 else:
1365 # Check all of them.
1366 uris_to_check.extend(uris)
1368 if artifact_existence is not None:
1369 # If a URI has already been checked remove it from the list
1370 # and immediately add the status to the output dict.
1371 filtered_uris_to_check = []
1372 for uri in uris_to_check:
1373 if uri in artifact_existence:
1374 uri_existence[uri] = artifact_existence[uri]
1375 else:
1376 filtered_uris_to_check.append(uri)
1377 uris_to_check = filtered_uris_to_check
1379 # Results.
1380 dataset_existence: dict[DatasetRef, bool] = {}
1382 uri_existence.update(ResourcePath.mexists(uris_to_check))
1383 for uri, exists in uri_existence.items():
1384 dataset_id = location_map[uri]
1385 ref = id_to_ref[dataset_id]
1387 # Disassembled composite needs to check all locations.
1388 # all_required indicates whether all need to exist or not.
1389 if ref in dataset_existence:
1390 if all_required:
1391 exists = dataset_existence[ref] and exists
1392 else:
1393 exists = dataset_existence[ref] or exists
1394 dataset_existence[ref] = exists
1396 if artifact_existence is not None:
1397 artifact_existence.update(uri_existence)
1399 return dataset_existence
1401 def mexists(
1402 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1403 ) -> dict[DatasetRef, bool]:
1404 """Check the existence of multiple datasets at once.
1406 Parameters
1407 ----------
1408 refs : iterable of `DatasetRef`
1409 The datasets to be checked.
1410 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1411 Optional mapping of datastore artifact to existence. Updated by
1412 this method with details of all artifacts tested. Can be `None`
1413 if the caller is not interested.
1415 Returns
1416 -------
1417 existence : `dict` of [`DatasetRef`, `bool`]
1418 Mapping from dataset to boolean indicating existence.
1420 Notes
1421 -----
1422 To minimize potentially costly remote existence checks, the local
1423 cache is checked as a proxy for existence. If a file for this
1424 `DatasetRef` does exist no check is done for the actual URI. This
1425 could result in possibly unexpected behavior if the dataset itself
1426 has been removed from the datastore by another process whilst it is
1427 still in the cache.
1428 """
1429 chunk_size = 10_000
1430 dataset_existence: dict[DatasetRef, bool] = {}
1431 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1432 n_found_total = 0
1433 n_checked = 0
1434 n_chunks = 0
1435 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1436 chunk_result = self._mexists(chunk, artifact_existence)
1438 # The log message level and content depend on how many
1439 # datasets we are processing.
1440 n_results = len(chunk_result)
1442 # Use verbose logging to ensure that messages can be seen
1443 # easily if many refs are being checked.
1444 log_threshold = VERBOSE
1445 n_checked += n_results
1447 # This sum can take some time so only do it if we know the
1448 # result is going to be used.
1449 n_found = 0
1450 if log.isEnabledFor(log_threshold):
1451 # Can treat the booleans as 0, 1 integers and sum them.
1452 n_found = sum(chunk_result.values())
1453 n_found_total += n_found
1455 # We are deliberately not trying to count the number of refs
1456 # provided in case it's in the millions. This means there is a
1457 # situation where the number of refs exactly matches the chunk
1458 # size and we will switch to the multi-chunk path even though
1459 # we only have a single chunk.
1460 if n_results < chunk_size and n_chunks == 0:
1461 # Single chunk will be processed so we can provide more detail.
1462 if n_results == 1:
1463 ref = list(chunk_result)[0]
1464 # Use debug logging to be consistent with `exists()`.
1465 log.debug(
1466 "Calling mexists() with single ref that does%s exist (%s).",
1467 "" if chunk_result[ref] else " not",
1468 ref,
1469 )
1470 else:
1471 # Single chunk but multiple files. Summarize.
1472 log.log(
1473 log_threshold,
1474 "Number of datasets found in datastore: %d out of %d datasets checked.",
1475 n_found,
1476 n_checked,
1477 )
1479 else:
1480 # Use incremental verbose logging when we have multiple chunks.
1481 log.log(
1482 log_threshold,
1483 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1484 "(running total from all chunks so far: %d found out of %d checked)",
1485 n_chunks,
1486 n_found,
1487 n_results,
1488 n_found_total,
1489 n_checked,
1490 )
1491 dataset_existence.update(chunk_result)
1492 n_chunks += 1
1494 return dataset_existence
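# For reference, ``chunk_iterable`` (imported above) lazily yields batches of
# at most ``chunk_size`` elements, so an arbitrarily large ``refs`` iterable
# is never materialized in full; hypothetical usage:
#
#     for batch in chunk_iterable(refs, chunk_size=10_000):
#         ...  # each ``batch`` holds up to 10_000 refs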
1496 def _mexists(
1497 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1498 ) -> dict[DatasetRef, bool]:
1499 """Check the existence of multiple datasets at once.
1501 Parameters
1502 ----------
1503 refs : iterable of `DatasetRef`
1504 The datasets to be checked.
1505 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1506 Optional mapping of datastore artifact to existence. Updated by
1507 this method with details of all artifacts tested. Can be `None`
1508 if the caller is not interested.
1510 Returns
1511 -------
1512 existence : `dict` of [`DatasetRef`, `bool`]
1513 Mapping from dataset to boolean indicating existence.
1514 """
1515 # Make a mapping from refs with the internal storage class to the given
1516 # refs that may have a different one. We'll use the internal refs
1517 # throughout this method and convert back at the very end.
1518 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1520 # Need a mapping of dataset_id to (internal) dataset ref since some
1521 # internal APIs work with dataset_id.
1522 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1524 # Set of all IDs we are checking for.
1525 requested_ids = set(id_to_ref.keys())
1527 # The records themselves. Could be missing some entries.
1528 records = self._get_stored_records_associated_with_refs(
1529 id_to_ref.values(), ignore_datastore_records=True
1530 )
1532 dataset_existence = self._process_mexists_records(
1533 id_to_ref, records, True, artifact_existence=artifact_existence
1534 )
1536 # Set of IDs that have been handled.
1537 handled_ids = {ref.id for ref in dataset_existence}
1539 missing_ids = requested_ids - handled_ids
1540 if missing_ids:
1541 dataset_existence.update(
1542 self._mexists_check_expected(
1543 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1544 )
1545 )
1547 return {
1548 internal_ref_to_input_ref[internal_ref]: existence
1549 for internal_ref, existence in dataset_existence.items()
1550 }
1552 def _mexists_check_expected(
1553 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1554 ) -> dict[DatasetRef, bool]:
1555 """Check existence of refs that are not known to datastore.
1557 Parameters
1558 ----------
1559 refs : iterable of `DatasetRef`
1560 The datasets to be checked. These are assumed not to be known
1561 to datastore.
1562 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1563 Optional mapping of datastore artifact to existence. Updated by
1564 this method with details of all artifacts tested. Can be `None`
1565 if the caller is not interested.
1567 Returns
1568 -------
1569 existence : `dict` [`DatasetRef`, `bool`]
1570 Mapping from dataset to boolean indicating existence.
1571 """
1572 dataset_existence: dict[DatasetRef, bool] = {}
1573 if not self.trustGetRequest:
1574 # Must assume these do not exist
1575 for ref in refs:
1576 dataset_existence[ref] = False
1577 else:
1578 log.debug(
1579 "%d datasets were not known to datastore during initial existence check.",
1580 len(refs),
1581 )
1583 # Construct data structure identical to that returned
1584 # by _get_stored_records_associated_with_refs() but using
1585 # guessed names.
1586 records = {}
1587 id_to_ref = {}
1588 for missing_ref in refs:
1589 expected = self._get_expected_dataset_locations_info(missing_ref)
1590 dataset_id = missing_ref.id
1591 records[dataset_id] = [info for _, info in expected]
1592 id_to_ref[dataset_id] = missing_ref
1594 dataset_existence.update(
1595 self._process_mexists_records(
1596 id_to_ref,
1597 records,
1598 False,
1599 artifact_existence=artifact_existence,
1600 )
1601 )
1603 return dataset_existence
1605 def exists(self, ref: DatasetRef) -> bool:
1606 """Check if the dataset exists in the datastore.
1608 Parameters
1609 ----------
1610 ref : `DatasetRef`
1611 Reference to the required dataset.
1613 Returns
1614 -------
1615 exists : `bool`
1616 `True` if the entity exists in the `Datastore`.
1618 Notes
1619 -----
1620 The local cache is checked as a proxy for existence in the remote
1621 object store. It is possible that another process on a different
1622 compute node could remove the file from the object store even
1623 though it is present in the local cache.
1624 """
1625 ref = self._cast_storage_class(ref)
1626 # We cannot trust datastore records from ref, as many unit tests delete
1627 # datasets and check their existence.
1628 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True)
1630 # If we are being asked to trust that the registry might not be correct
1631 # we ask for the expected locations and check them explicitly.
1632 if not fileLocations:
1633 if not self.trustGetRequest:
1634 return False
1636 # First check the cache. If it is not found we must check
1637 # the datastore itself. Assume that any component in the cache
1638 # means that the dataset does exist somewhere.
1639 if self.cacheManager.known_to_cache(ref):
1640 return True
1642 # When we are guessing a dataset location we can not check
1643 # for the existence of every component since we can not
1644 # know if every component was written. Instead we check
1645 # for the existence of any of the expected locations.
1646 for location, _ in self._get_expected_dataset_locations_info(ref):
1647 if self._artifact_exists(location):
1648 return True
1649 return False
1651 # All listed artifacts must exist.
1652 for location, storedFileInfo in fileLocations:
1653 # Checking in cache needs the component ref.
1654 check_ref = ref
1655 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1656 check_ref = ref.makeComponentRef(component)
1657 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1658 continue
1660 if not self._artifact_exists(location):
1661 return False
1663 return True
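# Illustrative usage sketch, not part of the original source; ``datastore``
# and ``ref`` are assumed to exist. For many refs, mexists() above is
# preferred because it batches the per-dataset record lookups.
#
#     if not datastore.exists(ref):
#         raise FileNotFoundError(f"Dataset {ref} is not known to this datastore.")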
1665 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1666 """Return URIs associated with dataset.
1668 Parameters
1669 ----------
1670 ref : `DatasetRef`
1671 Reference to the required dataset.
1672 predict : `bool`, optional
1673 If the datastore does not know about the dataset, controls whether
1674 it should return a predicted URI or not.
1676 Returns
1677 -------
1678 uris : `DatasetRefURIs`
1679 The URI to the primary artifact associated with this dataset (if
1680 the dataset was disassembled within the datastore this may be
1681 `None`), and the URIs to any components associated with the dataset
1682 artifact (this mapping can be empty if there are no components).
1683 """
1684 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1685 return many[ref]
1687 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1688 """URI to the Dataset.
1690 Parameters
1691 ----------
1692 ref : `DatasetRef`
1693 Reference to the required Dataset.
1694 predict : `bool`
1695 If `True`, allow URIs to be returned of datasets that have not
1696 been written.
1698 Returns
1699 -------
1700 uri : `lsst.resources.ResourcePath`
1701 URI pointing to the dataset within the datastore. If the
1702 dataset does not exist in the datastore, and if ``predict`` is
1703 `True`, the URI will be a prediction and will include a URI
1704 fragment "#predicted".
1705 If the datastore does not have entities that relate well
1706 to the concept of a URI the returned URI will be
1707 descriptive. The returned URI is not guaranteed to be obtainable.
1709 Raises
1710 ------
1711 FileNotFoundError
1712 Raised if a URI has been requested for a dataset that does not
1713 exist and guessing is not allowed.
1714 RuntimeError
1715 Raised if a request is made for a single URI but multiple URIs
1716 are associated with this dataset.
1718 Notes
1719 -----
1720 When a predicted URI is requested an attempt will be made to form
1721 a reasonable URI based on file templates and the expected formatter.
1722 """
1723 primary, components = self.getURIs(ref, predict)
1724 if primary is None or components:
1725 raise RuntimeError(
1726 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1727 )
1728 return primary
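# Illustrative usage sketch, not part of the original source; ``datastore``
# and ``ref`` are assumed to exist. getURI() only works for single-artifact
# datasets, so callers that may encounter disassembled composites should
# fall back to getURIs().
#
#     try:
#         uri = datastore.getURI(ref, predict=True)
#     except RuntimeError:
#         primary, components = datastore.getURIs(ref, predict=True)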
1730 def _predict_URIs(
1731 self,
1732 ref: DatasetRef,
1733 ) -> DatasetRefURIs:
1734 """Predict the URIs of a dataset ref.
1736 Parameters
1737 ----------
1738 ref : `DatasetRef`
1739 Reference to the required Dataset.
1741 Returns
1742 -------
1743 uris : `DatasetRefURIs`
1744 Primary and component URIs. URIs will contain a URI fragment
1745 "#predicted".
1746 """
1747 uris = DatasetRefURIs()
1749 if self.composites.shouldBeDisassembled(ref):
1750 for component, _ in ref.datasetType.storageClass.components.items():
1751 comp_ref = ref.makeComponentRef(component)
1752 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1754 # Add the "#predicted" URI fragment to indicate this is a
1755 # guess
1756 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1758 else:
1759 location, _ = self._determine_put_formatter_location(ref)
1761 # Add the "#predicted" URI fragment to indicate this is a guess
1762 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1764 return uris
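# Illustrative sketch, not part of the original source: a predicted location
# can be recognised by the "#predicted" fragment appended above. ``uri`` is
# assumed to come from getURI()/getURIs() with ``predict=True``.
#
#     if uri.geturl().endswith("#predicted"):
#         ...  # location is a guess; the artifact may not have been written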
1766 def getManyURIs(
1767 self,
1768 refs: Iterable[DatasetRef],
1769 predict: bool = False,
1770 allow_missing: bool = False,
1771 ) -> dict[DatasetRef, DatasetRefURIs]:
1772 # Docstring inherited
1774 uris: dict[DatasetRef, DatasetRefURIs] = {}
1776 records = self._get_stored_records_associated_with_refs(refs)
1777 records_keys = records.keys()
1779 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1780 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1782 # Have to handle trustGetRequest mode by checking for the existence
1783 # of the missing refs on disk.
1784 if missing_refs:
1785 dataset_existence = self._mexists_check_expected(missing_refs, None)
1786 really_missing = set()
1787 not_missing = set()
1788 for ref, exists in dataset_existence.items():
1789 if exists:
1790 not_missing.add(ref)
1791 else:
1792 really_missing.add(ref)
1794 if not_missing:
1795 # Need to recalculate the missing/existing split.
1796 existing_refs = existing_refs + tuple(not_missing)
1797 missing_refs = tuple(really_missing)
1799 for ref in missing_refs:
1800 # if this has never been written then we have to guess
1801 if not predict:
1802 if not allow_missing:
1803 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1804 else:
1805 uris[ref] = self._predict_URIs(ref)
1807 for ref in existing_refs:
1808 file_infos = records[ref.id]
1809 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1810 uris[ref] = self._locations_to_URI(ref, file_locations)
1812 return uris
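# Illustrative usage sketch, not part of the original source; ``datastore``
# and ``refs`` are assumed to exist. Missing datasets either raise, are
# omitted, or get predicted URIs depending on the flags.
#
#     uris = datastore.getManyURIs(refs, predict=True, allow_missing=True)
#     for ref, ref_uris in uris.items():
#         print(ref, ref_uris.primaryURI, ref_uris.componentURIs)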
1814 def _locations_to_URI(
1815 self,
1816 ref: DatasetRef,
1817 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1818 ) -> DatasetRefURIs:
1819 """Convert one or more file locations associated with a DatasetRef
1820 to a DatasetRefURIs.
1822 Parameters
1823 ----------
1824 ref : `DatasetRef`
1825 Reference to the dataset.
1826 file_locations : `Sequence` [ `tuple` [ `Location`, `StoredFileInfo` ] ]
1827 Each item in the sequence is the location of the dataset within the
1828 datastore and stored information about the file and its formatter.
1829 If there is only one item in the sequence then it is treated as the
1830 primary URI. If there is more than one item then they are treated
1831 as component URIs. If there are no items then an error is raised
1832 unless ``self.trustGetRequest`` is `True`.
1834 Returns
1835 -------
1836 uris : `DatasetRefURIs`
1837 Represents the primary URI or component URIs described by the
1838 inputs.
1840 Raises
1841 ------
1842 RuntimeError
1843 If no file locations are passed in and ``self.trustGetRequest`` is
1844 `False`.
1845 FileNotFoundError
1846 If a passed-in URI does not exist and ``self.trustGetRequest``
1847 is `False`.
1848 RuntimeError
1849 If a passed-in `StoredFileInfo`'s ``component`` is `None` (this is
1850 unexpected).
1851 """
1852 guessing = False
1853 uris = DatasetRefURIs()
1855 if not file_locations:
1856 if not self.trustGetRequest:
1857 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1858 file_locations = self._get_expected_dataset_locations_info(ref)
1859 guessing = True
1861 if len(file_locations) == 1:
1862 # No disassembly so this is the primary URI
1863 uris.primaryURI = file_locations[0][0].uri
1864 if guessing and not uris.primaryURI.exists():
1865 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1866 else:
1867 for location, file_info in file_locations:
1868 if file_info.component is None:
1869 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1870 if guessing and not location.uri.exists():
1871 # If we are trusting then it is entirely possible for
1872 # some components to be missing. In that case we skip
1873 # to the next component.
1874 if self.trustGetRequest:
1875 continue
1876 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1877 uris.componentURIs[file_info.component] = location.uri
1879 return uris
1881 def retrieveArtifacts(
1882 self,
1883 refs: Iterable[DatasetRef],
1884 destination: ResourcePath,
1885 transfer: str = "auto",
1886 preserve_path: bool = True,
1887 overwrite: bool = False,
1888 ) -> list[ResourcePath]:
1889 """Retrieve the file artifacts associated with the supplied refs.
1891 Parameters
1892 ----------
1893 refs : iterable of `DatasetRef`
1894 The datasets for which file artifacts are to be retrieved.
1895 A single ref can result in multiple files. The refs must
1896 be resolved.
1897 destination : `lsst.resources.ResourcePath`
1898 Location to write the file artifacts.
1899 transfer : `str`, optional
1900 Method to use to transfer the artifacts. Must be one of the options
1901 supported by `lsst.resources.ResourcePath.transfer_from()`.
1902 "move" is not allowed.
1903 preserve_path : `bool`, optional
1904 If `True` the full path of the file artifact within the datastore
1905 is preserved. If `False` the final file component of the path
1906 is used.
1907 overwrite : `bool`, optional
1908 If `True` allow transfers to overwrite existing files at the
1909 destination.
1911 Returns
1912 -------
1913 targets : `list` of `lsst.resources.ResourcePath`
1914 URIs of file artifacts in destination location. Order is not
1915 preserved.
1916 """
1917 if not destination.isdir():
1918 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1920 if transfer == "move":
1921 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1923 # Source -> Destination
1924 # This also helps filter out duplicate DatasetRefs in the request
1925 # that will map to the same underlying file transfer.
1926 to_transfer: dict[ResourcePath, ResourcePath] = {}
1928 for ref in refs:
1929 locations = self._get_dataset_locations_info(ref)
1930 for location, _ in locations:
1931 source_uri = location.uri
1932 target_path: ResourcePathExpression
1933 if preserve_path:
1934 target_path = location.pathInStore
1935 if target_path.isabs():
1936 # This is an absolute path to an external file.
1937 # Use the full path.
1938 target_path = target_path.relativeToPathRoot
1939 else:
1940 target_path = source_uri.basename()
1941 target_uri = destination.join(target_path)
1942 to_transfer[source_uri] = target_uri
1944 # In theory can now parallelize the transfer
1945 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1946 for source_uri, target_uri in to_transfer.items():
1947 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1949 return list(to_transfer.values())
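# Illustrative usage sketch, not part of the original source: copying the
# file artifacts for some refs into a local directory. ``datastore`` and
# ``refs`` are assumed to exist and the destination path is hypothetical.
#
#     destination = ResourcePath("/tmp/artifact_dump/", forceDirectory=True)
#     targets = datastore.retrieveArtifacts(
#         refs, destination, transfer="copy", preserve_path=False
#     )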
1951 def get(
1952 self,
1953 ref: DatasetRef,
1954 parameters: Mapping[str, Any] | None = None,
1955 storageClass: StorageClass | str | None = None,
1956 ) -> Any:
1957 """Load an InMemoryDataset from the store.
1959 Parameters
1960 ----------
1961 ref : `DatasetRef`
1962 Reference to the required Dataset.
1963 parameters : `dict`
1964 `StorageClass`-specific parameters that specify, for example,
1965 a slice of the dataset to be loaded.
1966 storageClass : `StorageClass` or `str`, optional
1967 The storage class to be used to override the Python type
1968 returned by this method. By default the returned type matches
1969 the dataset type definition for this dataset. Specifying a
1970 read `StorageClass` can force a different type to be returned.
1971 This type must be compatible with the original type.
1973 Returns
1974 -------
1975 inMemoryDataset : `object`
1976 Requested dataset or slice thereof as an InMemoryDataset.
1978 Raises
1979 ------
1980 FileNotFoundError
1981 Requested dataset can not be retrieved.
1982 TypeError
1983 Return value from formatter has unexpected type.
1984 ValueError
1985 Formatter failed to process the dataset.
1986 """
1987 # Supplied storage class for the component being read is either
1988 # from the ref itself or an override if we want to force
1989 # type conversion.
1990 if storageClass is not None:
1991 ref = ref.overrideStorageClass(storageClass)
1993 allGetInfo = self._prepare_for_direct_get(ref, parameters)
1994 return get_dataset_as_python_object_from_get_info(
1995 allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager
1996 )
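# Illustrative usage sketch, not part of the original source; ``datastore``
# and ``ref`` are assumed to exist, and the parameter and storage class
# names below are hypothetical examples.
#
#     subset = datastore.get(ref, parameters={"bbox": bbox})
#     converted = datastore.get(ref, storageClass="SomeCompatibleStorageClass")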
1998 def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload:
1999 # Docstring inherited
2001 # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the
2002 # client should have time to download a large file with retries if
2003 # needed, but short enough that it will become obvious quickly that
2004 # these URLs expire.
2005 # From a strictly technical standpoint there is no reason this
2006 # shouldn't be a day or more, but there is a perceived policy risk
2007 # that end users could post presigned URLs that let people without
2008 # access rights download the data.
2009 url_expiration_time_seconds = 1 * 60 * 60
2011 def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo:
2012 location, file_info = info
2013 return FileDatastoreGetPayloadFileInfo(
2014 url=location.uri.generate_presigned_get_url(
2015 expiration_time_seconds=url_expiration_time_seconds
2016 ),
2017 datastoreRecords=file_info.to_simple(),
2018 )
2020 return FileDatastoreGetPayload(
2021 datastore_type="file",
2022 dataset_ref=ref.to_simple(),
2023 file_info=[to_file_info_payload(info) for info in self._get_dataset_locations_info(ref)],
2024 )
2026 @transactional
2027 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2028 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2030 Parameters
2031 ----------
2032 inMemoryDataset : `object`
2033 The dataset to store.
2034 ref : `DatasetRef`
2035 Reference to the associated Dataset.
2037 Raises
2038 ------
2039 TypeError
2040 Supplied object and storage class are inconsistent.
2041 DatasetTypeNotSupportedError
2042 The associated `DatasetType` is not handled by this datastore.
2044 Notes
2045 -----
2046 If the datastore is configured to reject certain dataset types it
2047 is possible that the put will fail and raise a
2048 `DatasetTypeNotSupportedError`. The main use case for this is to
2049 allow `ChainedDatastore` to put to multiple datastores without
2050 requiring that every datastore accepts the dataset.
2051 """
2052 doDisassembly = self.composites.shouldBeDisassembled(ref)
2053 # doDisassembly = True
2055 artifacts = []
2056 if doDisassembly:
2057 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2058 if components is None:
2059 raise RuntimeError(
2060 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2061 f"with storage class {ref.datasetType.storageClass.name} "
2062 "is configured to be disassembled, but cannot be."
2063 )
2064 for component, componentInfo in components.items():
2065 # Don't recurse because we want to take advantage of
2066 # bulk insert -- we need a new DatasetRef that refers to the
2067 # same dataset_id but has the component DatasetType.
2068 # A parent DatasetType does not describe the types of its
2069 # components, so we construct the component ref ourselves.
2070 compRef = ref.makeComponentRef(component)
2071 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2072 artifacts.append((compRef, storedInfo))
2073 else:
2074 # Write the entire thing out
2075 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2076 artifacts.append((ref, storedInfo))
2078 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT)
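# Illustrative usage sketch, not part of the original source; ``datastore``,
# ``obj`` and ``ref`` are assumed to exist. A ChainedDatastore relies on
# this exception to skip member datastores that reject a dataset type.
#
#     try:
#         datastore.put(obj, ref)
#     except DatasetTypeNotSupportedError:
#         ...  # this datastore is configured not to accept the dataset type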
2080 @transactional
2081 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
2082 doDisassembly = self.composites.shouldBeDisassembled(ref)
2083 # doDisassembly = True
2085 artifacts = []
2086 if doDisassembly:
2087 components = ref.datasetType.storageClass.delegate().disassemble(in_memory_dataset)
2088 if components is None:
2089 raise RuntimeError(
2090 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2091 f"with storage class {ref.datasetType.storageClass.name} "
2092 "is configured to be disassembled, but cannot be."
2093 )
2094 for component, componentInfo in components.items():
2095 # Don't recurse because we want to take advantage of
2096 # bulk insert -- we need a new DatasetRef that refers to the
2097 # same dataset_id but has the component DatasetType.
2098 # A parent DatasetType does not describe the types of its
2099 # components, so we construct the component ref ourselves.
2100 compRef = ref.makeComponentRef(component)
2101 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2102 artifacts.append((compRef, storedInfo))
2103 else:
2104 # Write the entire thing out
2105 storedInfo = self._write_in_memory_to_artifact(in_memory_dataset, ref)
2106 artifacts.append((ref, storedInfo))
2108 ref_records = {self._opaque_table_name: [info for _, info in artifacts]}
2109 ref = ref.replace(datastore_records=ref_records)
2110 return {self.name: ref}
2112 @transactional
2113 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2114 # At this point we can safely remove these datasets from the cache
2115 # to avoid confusion later on. If they are not trashed later
2116 # the cache will simply be refilled.
2117 self.cacheManager.remove_from_cache(ref)
2119 # If we are in trust mode there will be nothing to move to
2120 # the trash table and we will have to try to delete the file
2121 # immediately.
2122 if self.trustGetRequest:
2123 # Try to keep the logic below for a single file trash.
2124 if isinstance(ref, DatasetRef):
2125 refs = {ref}
2126 else:
2127 # Will recreate ref at the end of this branch.
2128 refs = set(ref)
2130 # Determine which datasets are known to datastore directly.
2131 id_to_ref = {ref.id: ref for ref in refs}
2132 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2133 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2135 missing = refs - existing_refs
2136 if missing:
2137 # Do an explicit existence check on these refs.
2138 # We only care about the artifacts at this point and not
2139 # the dataset existence.
2140 artifact_existence: dict[ResourcePath, bool] = {}
2141 _ = self.mexists(missing, artifact_existence)
2142 uris = [uri for uri, exists in artifact_existence.items() if exists]
2144 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2145 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2146 for uri in uris:
2147 try:
2148 uri.remove()
2149 except Exception as e:
2150 if ignore_errors:
2151 log.debug("Artifact %s could not be removed: %s", uri, e)
2152 continue
2153 raise
2155 # There is no point asking the code below to remove refs we
2156 # know are missing so update it with the list of existing
2157 # records. Try to retain one vs many logic.
2158 if not existing_refs:
2159 # Nothing more to do since none of the datasets were
2160 # known to the datastore record table.
2161 return
2162 ref = list(existing_refs)
2163 if len(ref) == 1:
2164 ref = ref[0]
2166 # Get file metadata and internal metadata
2167 if not isinstance(ref, DatasetRef):
2168 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2169 # Assumed to be an iterable of refs so bulk mode enabled.
2170 try:
2171 self.bridge.moveToTrash(ref, transaction=self._transaction)
2172 except Exception as e:
2173 if ignore_errors:
2174 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2175 else:
2176 raise
2177 return
2179 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2181 fileLocations = self._get_dataset_locations_info(ref)
2183 if not fileLocations:
2184 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2185 if ignore_errors:
2186 log.warning(err_msg)
2187 return
2188 else:
2189 raise FileNotFoundError(err_msg)
2191 for location, _ in fileLocations:
2192 if not self._artifact_exists(location):
2193 err_msg = (
2194 f"Dataset is known to datastore {self.name} but "
2195 f"associated artifact ({location.uri}) is missing"
2196 )
2197 if ignore_errors:
2198 log.warning(err_msg)
2199 return
2200 else:
2201 raise FileNotFoundError(err_msg)
2203 # Mark dataset as trashed
2204 try:
2205 self.bridge.moveToTrash([ref], transaction=self._transaction)
2206 except Exception as e:
2207 if ignore_errors:
2208 log.warning(
2209 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2210 "but encountered an error: %s",
2211 ref,
2212 self.name,
2213 e,
2214 )
2215 pass
2216 else:
2217 raise
2219 @transactional
2220 def emptyTrash(self, ignore_errors: bool = True) -> None:
2221 """Remove all datasets from the trash.
2223 Parameters
2224 ----------
2225 ignore_errors : `bool`
2226 If `True` return without error even if something went wrong.
2227 Problems could occur if another process is simultaneously trying
2228 to delete.
2229 """
2230 log.debug("Emptying trash in datastore %s", self.name)
2232 # Context manager will empty trash iff we finish it without raising.
2233 # It will also automatically delete the relevant rows from the
2234 # trash table and the records table.
2235 with self.bridge.emptyTrash(
2236 self._table, record_class=StoredFileInfo, record_column="path"
2237 ) as trash_data:
2238 # Removing the artifacts themselves requires that the files are
2239 # not also associated with refs that are not to be trashed.
2240 # Therefore need to do a query with the file paths themselves
2241 # and return all the refs associated with them. Can only delete
2242 # a file if the refs to be trashed are the only refs associated
2243 # with the file.
2244 # This requires multiple copies of the trashed items
2245 trashed, artifacts_to_keep = trash_data
2247 if artifacts_to_keep is None:
2248 # The bridge is not helping us so have to work it out
2249 # ourselves. This is not going to be as efficient.
2250 trashed = list(trashed)
2252 # The instance check is for mypy since up to this point it
2253 # does not know the type of info.
2254 path_map = self._refs_associated_with_artifacts(
2255 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2256 )
2258 for ref, info in trashed:
2259 # Mypy needs to know this is not the base class
2260 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2262 path_map[info.path].remove(ref.id)
2263 if not path_map[info.path]:
2264 del path_map[info.path]
2266 artifacts_to_keep = set(path_map)
2268 for ref, info in trashed:
2269 # Should not happen for this implementation but need
2270 # to keep mypy happy.
2271 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2273 # Mypy needs to know this is not the base class
2274 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2276 if info.path in artifacts_to_keep:
2277 # This is a multi-dataset artifact and we are not
2278 # removing all associated refs.
2279 continue
2281 # Only trashed refs still known to datastore will be returned.
2282 location = info.file_location(self.locationFactory)
2284 # Point of no return for this artifact
2285 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2286 try:
2287 self._delete_artifact(location)
2288 except FileNotFoundError:
2289 # If the file itself has been deleted there is nothing
2290 # we can do about it. It is possible that trash has
2291 # been run in parallel in another process or someone
2292 # decided to delete the file. It is unlikely to come
2293 # back and so we should still continue with the removal
2294 # of the entry from the trash table. It is also possible
2295 # we removed it in a previous iteration if it was
2296 # a multi-dataset artifact. The delete artifact method
2297 # will log a debug message in this scenario.
2298 # Distinguishing a file that was missing before the trash
2299 # started from one already removed earlier in this trash run
2300 # is not worth the extra bookkeeping and its potential
2301 # memory cost.
2302 pass
2303 except Exception as e:
2304 if ignore_errors:
2305 # Use a debug message here even though it's not
2306 # a good situation. In some cases this can be
2307 # caused by a race between user A and user B
2308 # and neither of them has permissions for the
2309 # other's files. Butler does not know about users
2310 # and trash has no idea what collections these
2311 # files were in (without guessing from a path).
2312 log.debug(
2313 "Encountered error removing artifact %s from datastore %s: %s",
2314 location.uri,
2315 self.name,
2316 e,
2317 )
2318 else:
2319 raise
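# Illustrative usage sketch, not part of the original source; ``datastore``
# and ``refs`` are assumed to exist. Deletion is a two-step process: trash()
# marks the datasets and emptyTrash() removes the file artifacts.
#
#     datastore.trash(refs, ignore_errors=False)
#     datastore.emptyTrash()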
2321 @transactional
2322 def transfer_from(
2323 self,
2324 source_datastore: Datastore,
2325 refs: Iterable[DatasetRef],
2326 transfer: str = "auto",
2327 artifact_existence: dict[ResourcePath, bool] | None = None,
2328 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2329 # Docstring inherited
2330 if type(self) is not type(source_datastore):
2331 raise TypeError(
2332 f"Datastore mismatch between this datastore ({type(self)}) and the "
2333 f"source datastore ({type(source_datastore)})."
2334 )
2336 # Be explicit for mypy
2337 if not isinstance(source_datastore, FileDatastore):
2338 raise TypeError(
2339 "Can only transfer to a FileDatastore from another FileDatastore, not"
2340 f" {type(source_datastore)}"
2341 )
2343 # Stop early if "direct" or "split" transfer mode is requested. That
2344 # would require that the URI inside the source datastore be stored
2345 # directly in the target datastore, which seems unlikely to be useful
2346 # since at any moment the source datastore could delete the file.
2347 if transfer in ("direct", "split"):
2348 raise ValueError(
2349 f"Can not transfer from a source datastore using {transfer} mode since"
2350 " those files are controlled by the other datastore."
2351 )
2353 # Empty existence lookup if none given.
2354 if artifact_existence is None:
2355 artifact_existence = {}
2357 # We will go through the list multiple times so must convert
2358 # generators to lists.
2359 refs = list(refs)
2361 # In order to handle disassembled composites the code works
2362 # at the records level since it can assume that internal APIs
2363 # can be used.
2364 # - If the record already exists in the destination this is assumed
2365 # to be okay.
2366 # - If there is no record but the source and destination URIs are
2367 # identical no transfer is done but the record is added.
2368 # - If the source record refers to an absolute URI currently assume
2369 # that that URI should remain absolute and will be visible to the
2370 # destination butler. May need to have a flag to indicate whether
2371 # the dataset should be transferred. This will only happen if
2372 # the detached Butler has had a local ingest.
2374 # What we really want is all the records in the source datastore
2375 # associated with these refs. Or derived ones if they don't exist
2376 # in the source.
2377 source_records = source_datastore._get_stored_records_associated_with_refs(
2378 refs, ignore_datastore_records=True
2379 )
2381 # The source dataset_ids are the keys in these records
2382 source_ids = set(source_records)
2383 log.debug("Number of datastore records found in source: %d", len(source_ids))
2385 requested_ids = {ref.id for ref in refs}
2386 missing_ids = requested_ids - source_ids
2388 # Missing IDs can be okay if that datastore has allowed
2389 # gets based on file existence. Should we transfer what we can
2390 # or complain about it and warn?
2391 if missing_ids and not source_datastore.trustGetRequest:
2392 raise ValueError(
2393 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2394 )
2396 # Need to map these missing IDs to a DatasetRef so we can guess
2397 # the details.
2398 if missing_ids:
2399 log.info(
2400 "Number of expected datasets missing from source datastore records: %d out of %d",
2401 len(missing_ids),
2402 len(requested_ids),
2403 )
2404 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2406 # This should be chunked in case we end up having to check
2407 # the file store since we need some log output to show
2408 # progress.
2409 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2410 records = {}
2411 for missing in missing_ids_chunk:
2412 # Ask the source datastore where the missing artifacts
2413 # should be. An execution butler might not know about the
2414 # artifacts even if they are there.
2415 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2416 records[missing] = [info for _, info in expected]
2418 # Call the mexists helper method in case we have not already
2419 # checked these artifacts such that artifact_existence is
2420 # empty. This allows us to benefit from parallelism.
2421 # datastore.mexists() itself does not give us access to the
2422 # derived datastore record.
2423 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2424 ref_exists = source_datastore._process_mexists_records(
2425 id_to_ref, records, False, artifact_existence=artifact_existence
2426 )
2428 # Now go through the records and propagate the ones that exist.
2429 location_factory = source_datastore.locationFactory
2430 for missing, record_list in records.items():
2431 # Skip completely if the ref does not exist.
2432 ref = id_to_ref[missing]
2433 if not ref_exists[ref]:
2434 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2435 continue
2436 # Check for file artifact existence to decide which parts of a
2437 # disassembled composite do exist. If there is only a
2438 # single record we don't even need to look because it can't
2439 # be a composite and must exist.
2440 if len(record_list) == 1:
2441 dataset_records = record_list
2442 else:
2443 dataset_records = [
2444 record
2445 for record in record_list
2446 if artifact_existence[record.file_location(location_factory).uri]
2447 ]
2448 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2450 # Rely on source_records being a defaultdict.
2451 source_records[missing].extend(dataset_records)
2453 # See if we already have these records
2454 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2456 # The artifacts to register
2457 artifacts = []
2459 # Refs that already exist
2460 already_present = []
2462 # Refs that were rejected by this datastore.
2463 rejected = set()
2465 # Refs that were transferred successfully.
2466 accepted = set()
2468 # Record each time we have done a "direct" transfer.
2469 direct_transfers = []
2471 # Now can transfer the artifacts
2472 for ref in refs:
2473 if not self.constraints.isAcceptable(ref):
2474 # This datastore should not be accepting this dataset.
2475 rejected.add(ref)
2476 continue
2478 accepted.add(ref)
2480 if ref.id in target_records:
2481 # Already have an artifact for this.
2482 already_present.append(ref)
2483 continue
2485 # mypy needs to know these are always resolved refs
2486 for info in source_records[ref.id]:
2487 source_location = info.file_location(source_datastore.locationFactory)
2488 target_location = info.file_location(self.locationFactory)
2489 if source_location == target_location and not source_location.pathInStore.isabs():
2490 # Artifact is already in the target location.
2491 # (which is how execution butler currently runs)
2492 pass
2493 else:
2494 if target_location.pathInStore.isabs():
2495 # Just because we can see the artifact when running
2496 # the transfer doesn't mean it will be generally
2497 # accessible to a user of this butler. Need to decide
2498 # what to do about an absolute path.
2499 if transfer == "auto":
2500 # For "auto" transfers we allow the absolute URI
2501 # to be recorded in the target datastore.
2502 direct_transfers.append(source_location)
2503 else:
2504 # The user is explicitly requesting a transfer
2505 # even for an absolute URI. This requires us to
2506 # calculate the target path.
2507 template_ref = ref
2508 if info.component:
2509 template_ref = ref.makeComponentRef(info.component)
2510 target_location = self._calculate_ingested_datastore_name(
2511 source_location.uri,
2512 template_ref,
2513 )
2515 info = info.update(path=target_location.pathInStore.path)
2517 # Need to transfer it to the new location.
2518 # Assume we should always overwrite. If the artifact
2519 # is there this might indicate that a previous transfer
2520 # was interrupted but was not able to be rolled back
2521 # completely (e.g. pre-emption) so follow the Datastore default
2522 # and overwrite.
2523 target_location.uri.transfer_from(
2524 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2525 )
2527 artifacts.append((ref, info))
2529 if direct_transfers:
2530 log.info(
2531 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2532 len(direct_transfers),
2533 "" if len(direct_transfers) == 1 else "s",
2534 )
2536 # We are overwriting previous datasets that may have already
2537 # existed. We therefore should ensure that we force the
2538 # datastore records to agree. Note that this can potentially lead
2539 # to difficulties if the dataset has previously been ingested
2540 # disassembled and is somehow now assembled, or vice versa.
2541 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE)
2543 if already_present:
2544 n_skipped = len(already_present)
2545 log.info(
2546 "Skipped transfer of %d dataset%s already present in datastore",
2547 n_skipped,
2548 "" if n_skipped == 1 else "s",
2549 )
2551 return accepted, rejected
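# Illustrative usage sketch, not part of the original source;
# ``source_datastore`` and ``target_datastore`` are assumed to be two
# FileDatastore instances with compatible configuration, and ``refs``
# already exist in the source.
#
#     accepted, rejected = target_datastore.transfer_from(
#         source_datastore, refs, transfer="copy"
#     )
#     if rejected:
#         log.warning("%d datasets were rejected by constraints", len(rejected))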
2553 @transactional
2554 def forget(self, refs: Iterable[DatasetRef]) -> None:
2555 # Docstring inherited.
2556 refs = list(refs)
2557 self.bridge.forget(refs)
2558 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2560 def validateConfiguration(
2561 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2562 ) -> None:
2563 """Validate some of the configuration for this datastore.
2565 Parameters
2566 ----------
2567 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2568 Entities to test against this configuration. Can be differing
2569 types.
2570 logFailures : `bool`, optional
2571 If `True`, output a log message for every validation error
2572 detected.
2574 Raises
2575 ------
2576 DatastoreValidationError
2577 Raised if there is a validation problem with a configuration.
2578 All the problems are reported in a single exception.
2580 Notes
2581 -----
2582 This method checks that all the supplied entities have valid file
2583 templates and also have formatters defined.
2584 """
2585 templateFailed = None
2586 try:
2587 self.templates.validateTemplates(entities, logFailures=logFailures)
2588 except FileTemplateValidationError as e:
2589 templateFailed = str(e)
2591 formatterFailed = []
2592 for entity in entities:
2593 try:
2594 self.formatterFactory.getFormatterClass(entity)
2595 except KeyError as e:
2596 formatterFailed.append(str(e))
2597 if logFailures:
2598 log.critical("Formatter failure: %s", e)
2600 if templateFailed or formatterFailed:
2601 messages = []
2602 if templateFailed:
2603 messages.append(templateFailed)
2604 if formatterFailed:
2605 messages.append(",".join(formatterFailed))
2606 msg = ";\n".join(messages)
2607 raise DatastoreValidationError(msg)
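# Illustrative usage sketch, not part of the original source; ``datastore``
# and ``dataset_types`` are assumed to exist. All template and formatter
# problems are reported together in a single exception.
#
#     try:
#         datastore.validateConfiguration(dataset_types, logFailures=True)
#     except DatastoreValidationError as err:
#         log.error("Datastore configuration problems: %s", err)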
2609 def getLookupKeys(self) -> set[LookupKey]:
2610 # Docstring is inherited from base class
2611 return (
2612 self.templates.getLookupKeys()
2613 | self.formatterFactory.getLookupKeys()
2614 | self.constraints.getLookupKeys()
2615 )
2617 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2618 # Docstring is inherited from base class
2619 # The key can be valid in either formatters or templates so we can
2620 # only check the template if it exists
2621 if lookupKey in self.templates:
2622 try:
2623 self.templates[lookupKey].validateTemplate(entity)
2624 except FileTemplateValidationError as e:
2625 raise DatastoreValidationError(e) from e
2627 def export(
2628 self,
2629 refs: Iterable[DatasetRef],
2630 *,
2631 directory: ResourcePathExpression | None = None,
2632 transfer: str | None = "auto",
2633 ) -> Iterable[FileDataset]:
2634 # Docstring inherited from Datastore.export.
2635 if transfer == "auto" and directory is None:
2636 transfer = None
2638 if transfer is not None and directory is None:
2639 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2641 if transfer == "move":
2642 raise TypeError("Can not export by moving files out of datastore.")
2643 elif transfer == "direct":
2644 # For an export, treat this as equivalent to None. We do not
2645 # want an import to risk using absolute URIs to datasets owned
2646 # by another datastore.
2647 log.info("Treating 'direct' transfer mode as in-place export.")
2648 transfer = None
2650 # Force the directory to be a URI object
2651 directoryUri: ResourcePath | None = None
2652 if directory is not None:
2653 directoryUri = ResourcePath(directory, forceDirectory=True)
2655 if transfer is not None and directoryUri is not None and not directoryUri.exists():
2656 # mypy needs the second test
2657 raise FileNotFoundError(f"Export location {directory} does not exist")
2659 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2660 for ref in progress.wrap(refs, "Exporting dataset files"):
2661 fileLocations = self._get_dataset_locations_info(ref)
2662 if not fileLocations:
2663 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2664 # For now we can not export disassembled datasets
2665 if len(fileLocations) > 1:
2666 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2667 location, storedFileInfo = fileLocations[0]
2669 pathInStore = location.pathInStore.path
2670 if transfer is None:
2671 # TODO: do we also need to return the readStorageClass somehow?
2672 # We will use the path in store directly. If this is an
2673 # absolute URI, preserve it.
2674 if location.pathInStore.isabs():
2675 pathInStore = str(location.uri)
2676 elif transfer == "direct":
2677 # Use full URIs to the remote store in the export
2678 pathInStore = str(location.uri)
2679 else:
2680 # mypy needs help
2681 assert directoryUri is not None, "directoryUri must be defined to get here"
2682 storeUri = ResourcePath(location.uri)
2684 # if the datastore has an absolute URI to a resource, we
2685 # have two options:
2686 # 1. Keep the absolute URI in the exported YAML
2687 # 2. Allocate a new name in the local datastore and transfer
2688 # it.
2689 # For now go with option 2
2690 if location.pathInStore.isabs():
2691 template = self.templates.getTemplate(ref)
2692 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2693 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2695 exportUri = directoryUri.join(pathInStore)
2696 exportUri.transfer_from(storeUri, transfer=transfer)
2698 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
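# Illustrative usage sketch, not part of the original source; ``datastore``
# and ``refs`` are assumed to exist and the directory name is hypothetical.
# export() is a generator, so it must be iterated for the transfers to run.
#
#     file_datasets = list(
#         datastore.export(refs, directory="exports", transfer="copy")
#     )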
2700 @staticmethod
2701 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2702 """Compute the checksum of the supplied file.
2704 Parameters
2705 ----------
2706 uri : `lsst.resources.ResourcePath`
2707 Name of resource to calculate checksum from.
2708 algorithm : `str`, optional
2709 Name of algorithm to use. Must be one of the algorithms supported
2710 by the :py:mod:`hashlib` module.
2711 block_size : `int`, optional
2712 Number of bytes to read from file at one time.
2714 Returns
2715 -------
2716 hexdigest : `str`
2717 Hex digest of the file.
2719 Notes
2720 -----
2721 Currently returns `None` if the URI refers to a remote resource.
2722 """
2723 if algorithm not in hashlib.algorithms_guaranteed:
2724 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2726 if not uri.isLocal:
2727 return None
2729 hasher = hashlib.new(algorithm)
2731 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f:
2732 for chunk in iter(lambda: f.read(block_size), b""):
2733 hasher.update(chunk)
2735 return hasher.hexdigest()
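# Illustrative usage sketch, not part of the original source; the file path
# is hypothetical. Remote URIs return `None` rather than a digest.
#
#     digest = FileDatastore.computeChecksum(
#         ResourcePath("/tmp/example.fits"), algorithm="sha256"
#     )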
2737 def needs_expanded_data_ids(
2738 self,
2739 transfer: str | None,
2740 entity: DatasetRef | DatasetType | StorageClass | None = None,
2741 ) -> bool:
2742 # Docstring inherited.
2743 # This _could_ also use entity to inspect whether the filename template
2744 # involves placeholders other than the required dimensions for its
2745 # dataset type, but that's not necessary for correctness; it just
2746 # enables more optimizations (perhaps only in theory).
2747 return transfer not in ("direct", None)
2749 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2750 # Docstring inherited from the base class.
2751 record_data = data.get(self.name)
2752 if not record_data:
2753 return
2755 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records)
2757 # TODO: Verify that there are no unexpected table names in the dict?
2758 unpacked_records = []
2759 for dataset_id, dataset_data in record_data.records.items():
2760 records = dataset_data.get(self._table.name)
2761 if records:
2762 for info in records:
2763 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2764 unpacked_records.append(info.to_record(dataset_id=dataset_id))
2765 if unpacked_records:
2766 self._table.insert(*unpacked_records, transaction=self._transaction)
2768 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2769 # Docstring inherited from the base class.
2770 exported_refs = list(self._bridge.check(refs))
2771 ids = {ref.id for ref in exported_refs}
2772 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
2773 for row in self._table.fetch(dataset_id=ids):
2774 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2775 dataset_records = records.setdefault(row["dataset_id"], {})
2776 dataset_records.setdefault(self._table.name, []).append(info)
2778 record_data = DatastoreRecordData(records=records)
2779 return {self.name: record_data}
2781 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
2782 # Docstring inherited from the base class.
2783 self._retrieve_dataset_method = method
2785 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
2786 """Update dataset reference to use the storage class from registry."""
2787 if self._retrieve_dataset_method is None:
2788 # We could raise an exception here but unit tests do not define
2789 # this method.
2790 return ref
2791 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
2792 if dataset_type is not None:
2793 ref = ref.overrideStorageClass(dataset_type.storageClass)
2794 return ref
2796 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
2797 # Docstring inherited from the base class.
2798 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}