Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%
923 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-25 10:50 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-25 10:50 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Generic file-based datastore code."""
30from __future__ import annotations
32__all__ = ("FileDatastore",)
34import contextlib
35import hashlib
36import logging
37from collections import defaultdict
38from collections.abc import Callable, Iterable, Mapping, Sequence
39from typing import TYPE_CHECKING, Any, ClassVar, cast
41from lsst.daf.butler import (
42 Config,
43 DatasetId,
44 DatasetRef,
45 DatasetType,
46 DatasetTypeNotSupportedError,
47 FileDataset,
48 FileDescriptor,
49 Formatter,
50 FormatterFactory,
51 Location,
52 LocationFactory,
53 Progress,
54 StorageClass,
55 ddl,
56)
57from lsst.daf.butler.datastore import (
58 DatasetRefURIs,
59 Datastore,
60 DatastoreConfig,
61 DatastoreOpaqueTable,
62 DatastoreValidationError,
63)
64from lsst.daf.butler.datastore.cache_manager import (
65 AbstractDatastoreCacheManager,
66 DatastoreCacheManager,
67 DatastoreDisabledCacheManager,
68)
69from lsst.daf.butler.datastore.composites import CompositesMap
70from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError
71from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore
72from lsst.daf.butler.datastore.record_data import DatastoreRecordData
73from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo
74from lsst.daf.butler.datastores.file_datastore.get import (
75 DatasetLocationInformation,
76 DatastoreFileGetInformation,
77 generate_datastore_get_information,
78 get_dataset_as_python_object_from_get_info,
79)
80from lsst.daf.butler.datastores.fileDatastoreClient import (
81 FileDatastoreGetPayload,
82 FileDatastoreGetPayloadFileInfo,
83)
84from lsst.daf.butler.registry.interfaces import (
85 DatabaseInsertMode,
86 DatastoreRegistryBridge,
87 FakeDatasetRef,
88 ReadOnlyDatabaseError,
89)
90from lsst.daf.butler.repo_relocation import replaceRoot
91from lsst.daf.butler.utils import transactional
92from lsst.resources import ResourcePath, ResourcePathExpression
93from lsst.utils.introspection import get_class_of
94from lsst.utils.iteration import chunk_iterable
96# For VERBOSE logging usage.
97from lsst.utils.logging import VERBOSE, getLogger
98from sqlalchemy import BigInteger, String
100if TYPE_CHECKING:
101 from lsst.daf.butler import LookupKey
102 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
104log = getLogger(__name__)
107class _IngestPrepData(Datastore.IngestPrepData):
108 """Helper class for FileDatastore ingest implementation.
110 Parameters
111 ----------
112 datasets : `~collections.abc.Iterable` of `FileDataset`
113 Files to be ingested by this datastore.
114 """
116 def __init__(self, datasets: Iterable[FileDataset]):
117 super().__init__(ref for dataset in datasets for ref in dataset.refs)
118 self.datasets = datasets
121class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
122 """Generic Datastore for file-based implementations.
124 Should always be sub-classed since key abstract methods are missing.
126 Parameters
127 ----------
128 config : `DatastoreConfig` or `str`
129 Configuration as either a `Config` object or URI to file.
130 bridgeManager : `DatastoreRegistryBridgeManager`
131 Object that manages the interface between `Registry` and datastores.
132 root : `ResourcePath`
133 Root directory URI of this `Datastore`.
134 formatterFactory : `FormatterFactory`
135 Factory for creating instances of formatters.
136 templates : `FileTemplates`
137 File templates that can be used by this `Datastore`.
138 composites : `CompositesMap`
139 Determines whether a dataset should be disassembled on put.
140 trustGetRequest : `bool`
141 Determine whether we can fall back to configuration if a requested
142 dataset is not known to registry.
144 Raises
145 ------
146 ValueError
147 If root location does not exist and ``create`` is `False` in the
148 configuration.
149 """
151 defaultConfigFile: ClassVar[str | None] = None
152 """Path to configuration defaults. Accessed within the ``config`` resource
153 or relative to a search path. Can be None if no defaults specified.
154 """
156 root: ResourcePath
157 """Root directory URI of this `Datastore`."""
159 locationFactory: LocationFactory
160 """Factory for creating locations relative to the datastore root."""
162 formatterFactory: FormatterFactory
163 """Factory for creating instances of formatters."""
165 templates: FileTemplates
166 """File templates that can be used by this `Datastore`."""
168 composites: CompositesMap
169 """Determines whether a dataset should be disassembled on put."""
171 defaultConfigFile = "datastores/fileDatastore.yaml"
172 """Path to configuration defaults. Accessed within the ``config`` resource
173 or relative to a search path. Can be None if no defaults specified.
174 """
176 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
177 """Callable that is used in trusted mode to retrieve registry definition
178 of a named dataset type.
179 """
181 @classmethod
182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
183 """Set any filesystem-dependent config options for this Datastore to
184 be appropriate for a new empty repository with the given root.
186 Parameters
187 ----------
188 root : `str`
189 URI to the root of the data repository.
190 config : `Config`
191 A `Config` to update. Only the subset understood by
192 this component will be updated. Will not expand
193 defaults.
194 full : `Config`
195 A complete config with all defaults expanded that can be
196 converted to a `DatastoreConfig`. Read-only and will not be
197 modified by this method.
198 Repository-specific options that should not be obtained
199 from defaults when Butler instances are constructed
200 should be copied from ``full`` to ``config``.
201 overwrite : `bool`, optional
202 If `False`, do not modify a value in ``config`` if the value
203 already exists. Default is always to overwrite with the provided
204 ``root``.
206 Notes
207 -----
208 If a keyword is explicitly defined in the supplied ``config`` it
209 will not be overridden by this method if ``overwrite`` is `False`.
210 This allows explicit values set in external configs to be retained.
211 """
212 Config.updateParameters(
213 DatastoreConfig,
214 config,
215 full,
216 toUpdate={"root": root},
217 toCopy=("cls", ("records", "table")),
218 overwrite=overwrite,
219 )
221 @classmethod
222 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
223 return ddl.TableSpec(
224 fields=[
225 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
226 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
227 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
228 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
229 # Use empty string to indicate no component
230 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
231 # TODO: should checksum be Base64Bytes instead?
232 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
233 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
234 ],
235 unique=frozenset(),
236 indexes=[ddl.IndexSpec("path")],
237 )
239 def __init__(
240 self,
241 config: DatastoreConfig,
242 bridgeManager: DatastoreRegistryBridgeManager,
243 root: ResourcePath,
244 formatterFactory: FormatterFactory,
245 templates: FileTemplates,
246 composites: CompositesMap,
247 trustGetRequest: bool,
248 ):
249 super().__init__(config, bridgeManager)
250 self.root = ResourcePath(root)
251 self.formatterFactory = formatterFactory
252 self.templates = templates
253 self.composites = composites
254 self.trustGetRequest = trustGetRequest
256 # Name ourselves either using an explicit name or a name
257 # derived from the (unexpanded) root
258 if "name" in self.config:
259 self.name = self.config["name"]
260 else:
261 # We use the unexpanded root in the name to indicate that this
262 # datastore can be moved without having to update registry.
263 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
265 self.locationFactory = LocationFactory(self.root)
267 self._opaque_table_name = self.config["records", "table"]
268 try:
269 # Storage of paths and formatters, keyed by dataset_id
270 self._table = bridgeManager.opaque.register(
271 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType)
272 )
273 # Interface to Registry.
274 self._bridge = bridgeManager.register(self.name)
275 except ReadOnlyDatabaseError:
276 # If the database is read only and we just tried and failed to
277 # create a table, it means someone is trying to create a read-only
278 # butler client for an empty repo. That should be okay, as long
279 # as they then try to get any datasets before some other client
280 # creates the table. Chances are they're just validating
281 # configuration.
282 pass
284 # Determine whether checksums should be used - default to False
285 self.useChecksum = self.config.get("checksum", False)
287 # Create a cache manager
288 self.cacheManager: AbstractDatastoreCacheManager
289 if "cached" in self.config:
290 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
291 else:
292 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
294 @classmethod
295 def _create_from_config(
296 cls,
297 config: DatastoreConfig,
298 bridgeManager: DatastoreRegistryBridgeManager,
299 butlerRoot: ResourcePathExpression | None,
300 ) -> FileDatastore:
301 if "root" not in config:
302 raise ValueError("No root directory specified in configuration")
304 # Support repository relocation in config
305 # Existence of self.root is checked in subclass
306 root = ResourcePath(replaceRoot(config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True)
308 # Now associate formatters with storage classes
309 formatterFactory = FormatterFactory()
310 formatterFactory.registerFormatters(config["formatters"], universe=bridgeManager.universe)
312 # Read the file naming templates
313 templates = FileTemplates(config["templates"], universe=bridgeManager.universe)
315 # See if composites should be disassembled
316 composites = CompositesMap(config["composites"], universe=bridgeManager.universe)
318 # Determine whether we can fall back to configuration if a
319 # requested dataset is not known to registry
320 trustGetRequest = config.get("trust_get_request", False)
322 self = FileDatastore(
323 config, bridgeManager, root, formatterFactory, templates, composites, trustGetRequest
324 )
326 # Check existence and create directory structure if necessary
327 if not self.root.exists():
328 if "create" not in self.config or not self.config["create"]:
329 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
330 try:
331 self.root.mkdir()
332 except Exception as e:
333 raise ValueError(
334 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
335 ) from e
337 return self
339 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore:
340 return FileDatastore(
341 self.config,
342 bridgeManager,
343 self.root,
344 self.formatterFactory,
345 self.templates,
346 self.composites,
347 self.trustGetRequest,
348 )
350 def __str__(self) -> str:
351 return str(self.root)
353 @property
354 def bridge(self) -> DatastoreRegistryBridge:
355 return self._bridge
357 @property
358 def roots(self) -> dict[str, ResourcePath | None]:
359 # Docstring inherited.
360 return {self.name: self.root}
362 def _artifact_exists(self, location: Location) -> bool:
363 """Check that an artifact exists in this datastore at the specified
364 location.
366 Parameters
367 ----------
368 location : `Location`
369 Expected location of the artifact associated with this datastore.
371 Returns
372 -------
373 exists : `bool`
374 True if the location can be found, false otherwise.
375 """
376 log.debug("Checking if resource exists: %s", location.uri)
377 return location.uri.exists()
379 def _delete_artifact(self, location: Location) -> None:
380 """Delete the artifact from the datastore.
382 Parameters
383 ----------
384 location : `Location`
385 Location of the artifact associated with this datastore.
386 """
387 if location.pathInStore.isabs():
388 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
390 try:
391 location.uri.remove()
392 except FileNotFoundError:
393 log.debug("File %s did not exist and so could not be deleted.", location.uri)
394 raise
395 except Exception as e:
396 log.critical("Failed to delete file: %s (%s)", location.uri, e)
397 raise
398 log.debug("Successfully deleted file: %s", location.uri)
400 def addStoredItemInfo(
401 self,
402 refs: Iterable[DatasetRef],
403 infos: Iterable[StoredFileInfo],
404 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
405 ) -> None:
406 """Record internal storage information associated with one or more
407 datasets.
409 Parameters
410 ----------
411 refs : sequence of `DatasetRef`
412 The datasets that have been stored.
413 infos : sequence of `StoredDatastoreItemInfo`
414 Metadata associated with the stored datasets.
415 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`
416 Mode to use to insert the new records into the table. The
417 options are ``INSERT`` (error if pre-existing), ``REPLACE``
418 (replace content with new values), and ``ENSURE`` (skip if the row
419 already exists).
420 """
421 records = [
422 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True)
423 ]
424 match insert_mode:
425 case DatabaseInsertMode.INSERT:
426 self._table.insert(*records, transaction=self._transaction)
427 case DatabaseInsertMode.ENSURE:
428 self._table.ensure(*records, transaction=self._transaction)
429 case DatabaseInsertMode.REPLACE:
430 self._table.replace(*records, transaction=self._transaction)
431 case _:
432 raise ValueError(f"Unknown insert mode of '{insert_mode}'")
434 def getStoredItemsInfo(
435 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
436 ) -> list[StoredFileInfo]:
437 """Retrieve information associated with files stored in this
438 `Datastore` associated with this dataset ref.
440 Parameters
441 ----------
442 ref : `DatasetRef`
443 The dataset that is to be queried.
444 ignore_datastore_records : `bool`
445 If `True` then do not use datastore records stored in refs.
447 Returns
448 -------
449 items : `~collections.abc.Iterable` [`StoredDatastoreItemInfo`]
450 Stored information about the files and associated formatters
451 associated with this dataset. Only one file will be returned
452 if the dataset has not been disassembled. Can return an empty
453 list if no matching datasets can be found.
454 """
455 # Try to get them from the ref first.
456 if ref._datastore_records is not None and not ignore_datastore_records:
457 if (ref_records := ref._datastore_records.get(self._table.name)) is not None:
458 # Need to make sure they have correct type.
459 for record in ref_records:
460 if not isinstance(record, StoredFileInfo):
461 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}")
462 return cast(list[StoredFileInfo], ref_records)
464 # Look for the dataset_id -- there might be multiple matches
465 # if we have disassembled the dataset.
466 records = self._table.fetch(dataset_id=ref.id)
467 return [StoredFileInfo.from_record(record) for record in records]
469 def _register_datasets(
470 self,
471 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]],
472 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
473 ) -> None:
474 """Update registry to indicate that one or more datasets have been
475 stored.
477 Parameters
478 ----------
479 refsAndInfos : sequence `tuple` [`DatasetRef`,
480 `StoredDatastoreItemInfo`]
481 Datasets to register and the internal datastore metadata associated
482 with them.
483 insert_mode : `str`, optional
484 Indicate whether the new records should be new ("insert", default),
485 or allowed to exists ("ensure") or be replaced if already present
486 ("replace").
487 """
488 expandedRefs: list[DatasetRef] = []
489 expandedItemInfos: list[StoredFileInfo] = []
491 for ref, itemInfo in refsAndInfos:
492 expandedRefs.append(ref)
493 expandedItemInfos.append(itemInfo)
495 # Dataset location only cares about registry ID so if we have
496 # disassembled in datastore we have to deduplicate. Since they
497 # will have different datasetTypes we can't use a set
498 registryRefs = {r.id: r for r in expandedRefs}
499 if insert_mode == DatabaseInsertMode.INSERT:
500 self.bridge.insert(registryRefs.values())
501 else:
502 # There are only two columns and all that matters is the
503 # dataset ID.
504 self.bridge.ensure(registryRefs.values())
505 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode)
507 def _get_stored_records_associated_with_refs(
508 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False
509 ) -> dict[DatasetId, list[StoredFileInfo]]:
510 """Retrieve all records associated with the provided refs.
512 Parameters
513 ----------
514 refs : iterable of `DatasetIdRef`
515 The refs for which records are to be retrieved.
516 ignore_datastore_records : `bool`
517 If `True` then do not use datastore records stored in refs.
519 Returns
520 -------
521 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
522 The matching records indexed by the ref ID. The number of entries
523 in the dict can be smaller than the number of requested refs.
524 """
525 # Check datastore records in refs first.
526 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list)
527 refs_with_no_records = []
528 for ref in refs:
529 if ignore_datastore_records or ref._datastore_records is None:
530 refs_with_no_records.append(ref)
531 else:
532 if (ref_records := ref._datastore_records.get(self._table.name)) is not None:
533 # Need to make sure they have correct type.
534 for ref_record in ref_records:
535 if not isinstance(ref_record, StoredFileInfo):
536 raise TypeError(
537 f"Datastore record has unexpected type {ref_record.__class__.__name__}"
538 )
539 records_by_ref[ref.id].append(ref_record)
541 # If there were any refs without datastore records, check opaque table.
542 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records])
544 # Uniqueness is dataset_id + component so can have multiple records
545 # per ref.
546 for record in records:
547 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
548 return records_by_ref
550 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
551 """Return paths and associated dataset refs.
553 Parameters
554 ----------
555 paths : `list` of `str` or `lsst.resources.ResourcePath`
556 All the paths to include in search.
558 Returns
559 -------
560 mapping : `dict` of [`str`, `set` [`DatasetId`]]
561 Mapping of each path to a set of associated database IDs.
562 """
563 records = self._table.fetch(path=[str(path) for path in paths])
564 result = defaultdict(set)
565 for row in records:
566 result[row["path"]].add(row["dataset_id"])
567 return result
569 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
570 """Return all dataset refs associated with the supplied path.
572 Parameters
573 ----------
574 pathInStore : `lsst.resources.ResourcePath`
575 Path of interest in the data store.
577 Returns
578 -------
579 ids : `set` of `int`
580 All `DatasetRef` IDs associated with this path.
581 """
582 records = list(self._table.fetch(path=str(pathInStore)))
583 ids = {r["dataset_id"] for r in records}
584 return ids
586 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
587 """Remove information about the file associated with this dataset.
589 Parameters
590 ----------
591 ref : `DatasetRef`
592 The dataset that has been removed.
593 """
594 # Note that this method is actually not used by this implementation,
595 # we depend on bridge to delete opaque records. But there are some
596 # tests that check that this method works, so we keep it for now.
597 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
599 def _get_dataset_locations_info(
600 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
601 ) -> list[DatasetLocationInformation]:
602 r"""Find all the `Location`\ s of the requested dataset in the
603 `Datastore` and the associated stored file information.
605 Parameters
606 ----------
607 ref : `DatasetRef`
608 Reference to the required `Dataset`.
609 ignore_datastore_records : `bool`
610 If `True` then do not use datastore records stored in refs.
612 Returns
613 -------
614 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
615 Location of the dataset within the datastore and
616 stored information about each file and its formatter.
617 """
618 # Get the file information (this will fail if no file)
619 records = self.getStoredItemsInfo(ref, ignore_datastore_records)
621 # Use the path to determine the location -- we need to take
622 # into account absolute URIs in the datastore record
623 return [(r.file_location(self.locationFactory), r) for r in records]
625 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
626 """Check that there is only one dataset associated with the
627 specified artifact.
629 Parameters
630 ----------
631 ref : `DatasetRef` or `FakeDatasetRef`
632 Dataset to be removed.
633 location : `Location`
634 The location of the artifact to be removed.
636 Returns
637 -------
638 can_remove : `Bool`
639 True if the artifact can be safely removed.
640 """
641 # Can't ever delete absolute URIs.
642 if location.pathInStore.isabs():
643 return False
645 # Get all entries associated with this path
646 allRefs = self._registered_refs_per_artifact(location.pathInStore)
647 if not allRefs:
648 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
650 # Remove these refs from all the refs and if there is nothing left
651 # then we can delete
652 remainingRefs = allRefs - {ref.id}
654 if remainingRefs:
655 return False
656 return True
658 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
659 """Predict the location and related file information of the requested
660 dataset in this datastore.
662 Parameters
663 ----------
664 ref : `DatasetRef`
665 Reference to the required `Dataset`.
667 Returns
668 -------
669 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
670 Expected Location of the dataset within the datastore and
671 placeholder information about each file and its formatter.
673 Notes
674 -----
675 Uses the current configuration to determine how we would expect the
676 datastore files to have been written if we couldn't ask registry.
677 This is safe so long as there has been no change to datastore
678 configuration between writing the dataset and wanting to read it.
679 Will not work for files that have been ingested without using the
680 standard file template or default formatter.
681 """
682 # If we have a component ref we always need to ask the questions
683 # of the composite. If the composite is disassembled this routine
684 # should return all components. If the composite was not
685 # disassembled the composite is what is stored regardless of
686 # component request. Note that if the caller has disassembled
687 # a composite there is no way for this guess to know that
688 # without trying both the composite and component ref and seeing
689 # if there is something at the component Location even without
690 # disassembly being enabled.
691 if ref.datasetType.isComponent():
692 ref = ref.makeCompositeRef()
694 # See if the ref is a composite that should be disassembled
695 doDisassembly = self.composites.shouldBeDisassembled(ref)
697 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
699 if doDisassembly:
700 for component, componentStorage in ref.datasetType.storageClass.components.items():
701 compRef = ref.makeComponentRef(component)
702 location, formatter = self._determine_put_formatter_location(compRef)
703 all_info.append((location, formatter, componentStorage, component))
705 else:
706 # Always use the composite ref if no disassembly
707 location, formatter = self._determine_put_formatter_location(ref)
708 all_info.append((location, formatter, ref.datasetType.storageClass, None))
710 # Convert the list of tuples to have StoredFileInfo as second element
711 return [
712 (
713 location,
714 StoredFileInfo(
715 formatter=formatter,
716 path=location.pathInStore.path,
717 storageClass=storageClass,
718 component=component,
719 checksum=None,
720 file_size=-1,
721 ),
722 )
723 for location, formatter, storageClass, component in all_info
724 ]
726 def _prepare_for_direct_get(
727 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
728 ) -> list[DatastoreFileGetInformation]:
729 """Check parameters for ``get`` and obtain formatter and
730 location.
732 Parameters
733 ----------
734 ref : `DatasetRef`
735 Reference to the required Dataset.
736 parameters : `dict`
737 `StorageClass`-specific parameters that specify, for example,
738 a slice of the dataset to be loaded.
740 Returns
741 -------
742 getInfo : `list` [`DatastoreFileGetInformation`]
743 Parameters needed to retrieve each file.
744 """
745 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
747 # The storage class we want to use eventually
748 refStorageClass = ref.datasetType.storageClass
750 # For trusted mode need to reset storage class.
751 ref = self._cast_storage_class(ref)
753 # Get file metadata and internal metadata
754 fileLocations = self._get_dataset_locations_info(ref)
755 if not fileLocations:
756 if not self.trustGetRequest:
757 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
758 # Assume the dataset is where we think it should be
759 fileLocations = self._get_expected_dataset_locations_info(ref)
761 if len(fileLocations) > 1:
762 # If trust is involved it is possible that there will be
763 # components listed here that do not exist in the datastore.
764 # Explicitly check for file artifact existence and filter out any
765 # that are missing.
766 if self.trustGetRequest:
767 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
769 # For now complain only if we have no components at all. One
770 # component is probably a problem but we can punt that to the
771 # assembler.
772 if not fileLocations:
773 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
775 return generate_datastore_get_information(
776 fileLocations,
777 readStorageClass=refStorageClass,
778 ref=ref,
779 parameters=parameters,
780 )
782 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
783 """Check the arguments for ``put`` and obtain formatter and
784 location.
786 Parameters
787 ----------
788 inMemoryDataset : `object`
789 The dataset to store.
790 ref : `DatasetRef`
791 Reference to the associated Dataset.
793 Returns
794 -------
795 location : `Location`
796 The location to write the dataset.
797 formatter : `Formatter`
798 The `Formatter` to use to write the dataset.
800 Raises
801 ------
802 TypeError
803 Supplied object and storage class are inconsistent.
804 DatasetTypeNotSupportedError
805 The associated `DatasetType` is not handled by this datastore.
806 """
807 self._validate_put_parameters(inMemoryDataset, ref)
808 return self._determine_put_formatter_location(ref)
810 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
811 """Calculate the formatter and output location to use for put.
813 Parameters
814 ----------
815 ref : `DatasetRef`
816 Reference to the associated Dataset.
818 Returns
819 -------
820 location : `Location`
821 The location to write the dataset.
822 formatter : `Formatter`
823 The `Formatter` to use to write the dataset.
824 """
825 # Work out output file name
826 try:
827 template = self.templates.getTemplate(ref)
828 except KeyError as e:
829 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
831 # Validate the template to protect against filenames from different
832 # dataIds returning the same and causing overwrite confusion.
833 template.validateTemplate(ref)
835 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True)
837 # Get the formatter based on the storage class
838 storageClass = ref.datasetType.storageClass
839 try:
840 formatter = self.formatterFactory.getFormatter(
841 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
842 )
843 except KeyError as e:
844 raise DatasetTypeNotSupportedError(
845 f"Unable to find formatter for {ref} in datastore {self.name}"
846 ) from e
848 # Now that we know the formatter, update the location
849 location = formatter.makeUpdatedLocation(location)
851 return location, formatter
853 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
854 # Docstring inherited from base class
855 if transfer != "auto":
856 return transfer
858 # See if the paths are within the datastore or not
859 inside = [self._pathInStore(d.path) is not None for d in datasets]
861 if all(inside):
862 transfer = None
863 elif not any(inside):
864 # Allow ResourcePath to use its own knowledge
865 transfer = "auto"
866 else:
867 # This can happen when importing from a datastore that
868 # has had some datasets ingested using "direct" mode.
869 # Also allow ResourcePath to sort it out but warn about it.
870 # This can happen if you are importing from a datastore
871 # that had some direct transfer datasets.
872 log.warning(
873 "Some datasets are inside the datastore and some are outside. Using 'split' "
874 "transfer mode. This assumes that the files outside the datastore are "
875 "still accessible to the new butler since they will not be copied into "
876 "the target datastore."
877 )
878 transfer = "split"
880 return transfer
882 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
883 """Return path relative to datastore root.
885 Parameters
886 ----------
887 path : `lsst.resources.ResourcePathExpression`
888 Path to dataset. Can be absolute URI. If relative assumed to
889 be relative to the datastore. Returns path in datastore
890 or raises an exception if the path it outside.
892 Returns
893 -------
894 inStore : `str`
895 Path relative to datastore root. Returns `None` if the file is
896 outside the root.
897 """
898 # Relative path will always be relative to datastore
899 pathUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)
900 return pathUri.relative_to(self.root)
902 def _standardizeIngestPath(
903 self, path: str | ResourcePath, *, transfer: str | None = None
904 ) -> str | ResourcePath:
905 """Standardize the path of a to-be-ingested file.
907 Parameters
908 ----------
909 path : `str` or `lsst.resources.ResourcePath`
910 Path of a file to be ingested. This parameter is not expected
911 to be all the types that can be used to construct a
912 `~lsst.resources.ResourcePath`.
913 transfer : `str`, optional
914 How (and whether) the dataset should be added to the datastore.
915 See `ingest` for details of transfer modes.
916 This implementation is provided only so
917 `NotImplementedError` can be raised if the mode is not supported;
918 actual transfers are deferred to `_extractIngestInfo`.
920 Returns
921 -------
922 path : `str` or `lsst.resources.ResourcePath`
923 New path in what the datastore considers standard form. If an
924 absolute URI was given that will be returned unchanged.
926 Notes
927 -----
928 Subclasses of `FileDatastore` can implement this method instead
929 of `_prepIngest`. It should not modify the data repository or given
930 file in any way.
932 Raises
933 ------
934 NotImplementedError
935 Raised if the datastore does not support the given transfer mode
936 (including the case where ingest is not supported at all).
937 FileNotFoundError
938 Raised if one of the given files does not exist.
939 """
940 if transfer not in (None, "direct", "split") + self.root.transferModes:
941 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
943 # A relative URI indicates relative to datastore root
944 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)
945 if not srcUri.isabs():
946 srcUri = self.root.join(path)
948 if not srcUri.exists():
949 raise FileNotFoundError(
950 f"Resource at {srcUri} does not exist; note that paths to ingest "
951 f"are assumed to be relative to {self.root} unless they are absolute."
952 )
954 if transfer is None:
955 relpath = srcUri.relative_to(self.root)
956 if not relpath:
957 raise RuntimeError(
958 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
959 )
961 # Return the relative path within the datastore for internal
962 # transfer
963 path = relpath
965 return path
967 def _extractIngestInfo(
968 self,
969 path: ResourcePathExpression,
970 ref: DatasetRef,
971 *,
972 formatter: Formatter | type[Formatter],
973 transfer: str | None = None,
974 record_validation_info: bool = True,
975 ) -> StoredFileInfo:
976 """Relocate (if necessary) and extract `StoredFileInfo` from a
977 to-be-ingested file.
979 Parameters
980 ----------
981 path : `lsst.resources.ResourcePathExpression`
982 URI or path of a file to be ingested.
983 ref : `DatasetRef`
984 Reference for the dataset being ingested. Guaranteed to have
985 ``dataset_id not None`.
986 formatter : `type` or `Formatter`
987 `Formatter` subclass to use for this dataset or an instance.
988 transfer : `str`, optional
989 How (and whether) the dataset should be added to the datastore.
990 See `ingest` for details of transfer modes.
991 record_validation_info : `bool`, optional
992 If `True`, the default, the datastore can record validation
993 information associated with the file. If `False` the datastore
994 will not attempt to track any information such as checksums
995 or file sizes. This can be useful if such information is tracked
996 in an external system or if the file is to be compressed in place.
997 It is up to the datastore whether this parameter is relevant.
999 Returns
1000 -------
1001 info : `StoredFileInfo`
1002 Internal datastore record for this file. This will be inserted by
1003 the caller; the `_extractIngestInfo` is only responsible for
1004 creating and populating the struct.
1006 Raises
1007 ------
1008 FileNotFoundError
1009 Raised if one of the given files does not exist.
1010 FileExistsError
1011 Raised if transfer is not `None` but the (internal) location the
1012 file would be moved to is already occupied.
1013 """
1014 if self._transaction is None:
1015 raise RuntimeError("Ingest called without transaction enabled")
1017 # Create URI of the source path, do not need to force a relative
1018 # path to absolute.
1019 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)
1021 # Track whether we have read the size of the source yet
1022 have_sized = False
1024 tgtLocation: Location | None
1025 if transfer is None or transfer == "split":
1026 # A relative path is assumed to be relative to the datastore
1027 # in this context
1028 if not srcUri.isabs():
1029 tgtLocation = self.locationFactory.fromPath(srcUri.ospath, trusted_path=False)
1030 else:
1031 # Work out the path in the datastore from an absolute URI
1032 # This is required to be within the datastore.
1033 pathInStore = srcUri.relative_to(self.root)
1034 if pathInStore is None and transfer is None:
1035 raise RuntimeError(
1036 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
1037 )
1038 if pathInStore:
1039 tgtLocation = self.locationFactory.fromPath(pathInStore, trusted_path=True)
1040 elif transfer == "split":
1041 # Outside the datastore but treat that as a direct ingest
1042 # instead.
1043 tgtLocation = None
1044 else:
1045 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
1046 elif transfer == "direct":
1047 # Want to store the full URI to the resource directly in
1048 # datastore. This is useful for referring to permanent archive
1049 # storage for raw data.
1050 # Trust that people know what they are doing.
1051 tgtLocation = None
1052 else:
1053 # Work out the name we want this ingested file to have
1054 # inside the datastore
1055 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
1056 if not tgtLocation.uri.dirname().exists():
1057 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
1058 tgtLocation.uri.dirname().mkdir()
1060 # if we are transferring from a local file to a remote location
1061 # it may be more efficient to get the size and checksum of the
1062 # local file rather than the transferred one
1063 if record_validation_info and srcUri.isLocal:
1064 size = srcUri.size()
1065 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
1066 have_sized = True
1068 # Transfer the resource to the destination.
1069 # Allow overwrite of an existing file. This matches the behavior
1070 # of datastore.put() in that it trusts that registry would not
1071 # be asking to overwrite unless registry thought that the
1072 # overwrite was allowed.
1073 tgtLocation.uri.transfer_from(
1074 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
1075 )
1077 if tgtLocation is None:
1078 # This means we are using direct mode
1079 targetUri = srcUri
1080 targetPath = str(srcUri)
1081 else:
1082 targetUri = tgtLocation.uri
1083 targetPath = tgtLocation.pathInStore.path
1085 # the file should exist in the datastore now
1086 if record_validation_info:
1087 if not have_sized:
1088 size = targetUri.size()
1089 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
1090 else:
1091 # Not recording any file information.
1092 size = -1
1093 checksum = None
1095 return StoredFileInfo(
1096 formatter=formatter,
1097 path=targetPath,
1098 storageClass=ref.datasetType.storageClass,
1099 component=ref.datasetType.component(),
1100 file_size=size,
1101 checksum=checksum,
1102 )
1104 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
1105 # Docstring inherited from Datastore._prepIngest.
1106 filtered = []
1107 for dataset in datasets:
1108 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1109 if not acceptable:
1110 continue
1111 else:
1112 dataset.refs = acceptable
1113 if dataset.formatter is None:
1114 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1115 else:
1116 assert isinstance(dataset.formatter, type | str)
1117 formatter_class = get_class_of(dataset.formatter)
1118 if not issubclass(formatter_class, Formatter):
1119 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1120 dataset.formatter = formatter_class
1121 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1122 filtered.append(dataset)
1123 return _IngestPrepData(filtered)
1125 @transactional
1126 def _finishIngest(
1127 self,
1128 prepData: Datastore.IngestPrepData,
1129 *,
1130 transfer: str | None = None,
1131 record_validation_info: bool = True,
1132 ) -> None:
1133 # Docstring inherited from Datastore._finishIngest.
1134 refsAndInfos = []
1135 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1136 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1137 # Do ingest as if the first dataset ref is associated with the file
1138 info = self._extractIngestInfo(
1139 dataset.path,
1140 dataset.refs[0],
1141 formatter=dataset.formatter,
1142 transfer=transfer,
1143 record_validation_info=record_validation_info,
1144 )
1145 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1147 # In direct mode we can allow repeated ingests of the same thing
1148 # if we are sure that the external dataset is immutable. We use
1149 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are
1150 # separated.
1151 refs_and_infos_replace = []
1152 refs_and_infos_insert = []
1153 if transfer == "direct":
1154 for entry in refsAndInfos:
1155 if entry[0].id.version == 5:
1156 refs_and_infos_replace.append(entry)
1157 else:
1158 refs_and_infos_insert.append(entry)
1159 else:
1160 refs_and_infos_insert = refsAndInfos
1162 if refs_and_infos_insert:
1163 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT)
1164 if refs_and_infos_replace:
1165 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE)
1167 def _calculate_ingested_datastore_name(
1168 self,
1169 srcUri: ResourcePath,
1170 ref: DatasetRef,
1171 formatter: Formatter | type[Formatter] | None = None,
1172 ) -> Location:
1173 """Given a source URI and a DatasetRef, determine the name the
1174 dataset will have inside datastore.
1176 Parameters
1177 ----------
1178 srcUri : `lsst.resources.ResourcePath`
1179 URI to the source dataset file.
1180 ref : `DatasetRef`
1181 Ref associated with the newly-ingested dataset artifact. This
1182 is used to determine the name within the datastore.
1183 formatter : `Formatter` or Formatter class.
1184 Formatter to use for validation. Can be a class or an instance.
1185 No validation of the file extension is performed if the
1186 ``formatter`` is `None`. This can be used if the caller knows
1187 that the source URI and target URI will use the same formatter.
1189 Returns
1190 -------
1191 location : `Location`
1192 Target location for the newly-ingested dataset.
1193 """
1194 # Ingesting a file from outside the datastore.
1195 # This involves a new name.
1196 template = self.templates.getTemplate(ref)
1197 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True)
1199 # Get the extension
1200 ext = srcUri.getExtension()
1202 # Update the destination to include that extension
1203 location.updateExtension(ext)
1205 # Ask the formatter to validate this extension
1206 if formatter is not None:
1207 formatter.validateExtension(location)
1209 return location
1211 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1212 """Write out in memory dataset to datastore.
1214 Parameters
1215 ----------
1216 inMemoryDataset : `object`
1217 Dataset to write to datastore.
1218 ref : `DatasetRef`
1219 Registry information associated with this dataset.
1221 Returns
1222 -------
1223 info : `StoredFileInfo`
1224 Information describing the artifact written to the datastore.
1225 """
1226 # May need to coerce the in memory dataset to the correct
1227 # python type, but first we need to make sure the storage class
1228 # reflects the one defined in the data repository.
1229 ref = self._cast_storage_class(ref)
1230 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1232 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1233 uri = location.uri
1235 if not uri.dirname().exists():
1236 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1237 uri.dirname().mkdir()
1239 if self._transaction is None:
1240 raise RuntimeError("Attempting to write artifact without transaction enabled")
1242 def _removeFileExists(uri: ResourcePath) -> None:
1243 """Remove a file and do not complain if it is not there.
1245 This is important since a formatter might fail before the file
1246 is written and we should not confuse people by writing spurious
1247 error messages to the log.
1248 """
1249 with contextlib.suppress(FileNotFoundError):
1250 uri.remove()
1252 # Register a callback to try to delete the uploaded data if
1253 # something fails below
1254 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1256 data_written = False
1258 # For remote URIs some datasets can be serialized directly
1259 # to bytes and sent to the remote datastore without writing a
1260 # file. If the dataset is intended to be saved to the cache
1261 # a file is always written and direct write to the remote
1262 # datastore is bypassed.
1263 if not uri.isLocal and not self.cacheManager.should_be_cached(ref):
1264 # Remote URI that is not cached so can write directly.
1265 try:
1266 serializedDataset = formatter.toBytes(inMemoryDataset)
1267 except NotImplementedError:
1268 # Fallback to the file writing option.
1269 pass
1270 except Exception as e:
1271 raise RuntimeError(
1272 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1273 ) from e
1274 else:
1275 log.debug("Writing bytes directly to %s", uri)
1276 uri.write(serializedDataset, overwrite=True)
1277 log.debug("Successfully wrote bytes directly to %s", uri)
1278 data_written = True
1280 if not data_written:
1281 # Did not write the bytes directly to object store so instead
1282 # write to temporary file. Always write to a temporary even if
1283 # using a local file system -- that gives us atomic writes.
1284 # If a process is killed as the file is being written we do not
1285 # want it to remain in the correct place but in corrupt state.
1286 # For local files write to the output directory not temporary dir.
1287 prefix = uri.dirname() if uri.isLocal else None
1288 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1289 # Need to configure the formatter to write to a different
1290 # location and that needs us to overwrite internals
1291 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1292 with formatter._updateLocation(Location(None, temporary_uri)):
1293 try:
1294 formatter.write(inMemoryDataset)
1295 except Exception as e:
1296 raise RuntimeError(
1297 f"Failed to serialize dataset {ref} of type"
1298 f" {type(inMemoryDataset)} to "
1299 f"temporary location {temporary_uri}"
1300 ) from e
1302 # Use move for a local file since that becomes an efficient
1303 # os.rename. For remote resources we use copy to allow the
1304 # file to be cached afterwards.
1305 transfer = "move" if uri.isLocal else "copy"
1307 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1309 if transfer == "copy":
1310 # Cache if required
1311 self.cacheManager.move_to_cache(temporary_uri, ref)
1313 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1315 # URI is needed to resolve what ingest case are we dealing with
1316 return self._extractIngestInfo(uri, ref, formatter=formatter)
1318 def knows(self, ref: DatasetRef) -> bool:
1319 """Check if the dataset is known to the datastore.
1321 Does not check for existence of any artifact.
1323 Parameters
1324 ----------
1325 ref : `DatasetRef`
1326 Reference to the required dataset.
1328 Returns
1329 -------
1330 exists : `bool`
1331 `True` if the dataset is known to the datastore.
1332 """
1333 # We cannot trust datastore records from ref, as many unit tests delete
1334 # datasets and check their existence.
1335 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True)
1336 if fileLocations:
1337 return True
1338 return False
1340 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1341 # Docstring inherited from the base class.
1343 # The records themselves. Could be missing some entries.
1344 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
1346 return {ref: ref.id in records for ref in refs}
1348 def _process_mexists_records(
1349 self,
1350 id_to_ref: dict[DatasetId, DatasetRef],
1351 records: dict[DatasetId, list[StoredFileInfo]],
1352 all_required: bool,
1353 artifact_existence: dict[ResourcePath, bool] | None = None,
1354 ) -> dict[DatasetRef, bool]:
1355 """Check given records for existence.
1357 Helper function for `mexists()`.
1359 Parameters
1360 ----------
1361 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1362 Mapping of the dataset ID to the dataset ref itself.
1363 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1364 Records as generally returned by
1365 ``_get_stored_records_associated_with_refs``.
1366 all_required : `bool`
1367 Flag to indicate whether existence requires all artifacts
1368 associated with a dataset ID to exist or not for existence.
1369 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1370 Optional mapping of datastore artifact to existence. Updated by
1371 this method with details of all artifacts tested. Can be `None`
1372 if the caller is not interested.
1374 Returns
1375 -------
1376 existence : `dict` of [`DatasetRef`, `bool`]
1377 Mapping from dataset to boolean indicating existence.
1378 """
1379 # The URIs to be checked and a mapping of those URIs to
1380 # the dataset ID.
1381 uris_to_check: list[ResourcePath] = []
1382 location_map: dict[ResourcePath, DatasetId] = {}
1384 location_factory = self.locationFactory
1386 uri_existence: dict[ResourcePath, bool] = {}
1387 for ref_id, infos in records.items():
1388 # Key is the dataset Id, value is list of StoredItemInfo
1389 uris = [info.file_location(location_factory).uri for info in infos]
1390 location_map.update({uri: ref_id for uri in uris})
1392 # Check the local cache directly for a dataset corresponding
1393 # to the remote URI.
1394 if self.cacheManager.file_count > 0:
1395 ref = id_to_ref[ref_id]
1396 for uri, storedFileInfo in zip(uris, infos, strict=True):
1397 check_ref = ref
1398 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1399 check_ref = ref.makeComponentRef(component)
1400 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1401 # Proxy for URI existence.
1402 uri_existence[uri] = True
1403 else:
1404 uris_to_check.append(uri)
1405 else:
1406 # Check all of them.
1407 uris_to_check.extend(uris)
1409 if artifact_existence is not None:
1410 # If a URI has already been checked remove it from the list
1411 # and immediately add the status to the output dict.
1412 filtered_uris_to_check = []
1413 for uri in uris_to_check:
1414 if uri in artifact_existence:
1415 uri_existence[uri] = artifact_existence[uri]
1416 else:
1417 filtered_uris_to_check.append(uri)
1418 uris_to_check = filtered_uris_to_check
1420 # Results.
1421 dataset_existence: dict[DatasetRef, bool] = {}
1423 uri_existence.update(ResourcePath.mexists(uris_to_check))
1424 for uri, exists in uri_existence.items():
1425 dataset_id = location_map[uri]
1426 ref = id_to_ref[dataset_id]
1428 # Disassembled composite needs to check all locations.
1429 # all_required indicates whether all need to exist or not.
1430 if ref in dataset_existence:
1431 if all_required:
1432 exists = dataset_existence[ref] and exists
1433 else:
1434 exists = dataset_existence[ref] or exists
1435 dataset_existence[ref] = exists
1437 if artifact_existence is not None:
1438 artifact_existence.update(uri_existence)
1440 return dataset_existence
1442 def mexists(
1443 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1444 ) -> dict[DatasetRef, bool]:
1445 """Check the existence of multiple datasets at once.
1447 Parameters
1448 ----------
1449 refs : iterable of `DatasetRef`
1450 The datasets to be checked.
1451 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1452 Optional mapping of datastore artifact to existence. Updated by
1453 this method with details of all artifacts tested. Can be `None`
1454 if the caller is not interested.
1456 Returns
1457 -------
1458 existence : `dict` of [`DatasetRef`, `bool`]
1459 Mapping from dataset to boolean indicating existence.
1461 Notes
1462 -----
1463 To minimize potentially costly remote existence checks, the local
1464 cache is checked as a proxy for existence. If a file for this
1465 `DatasetRef` does exist no check is done for the actual URI. This
1466 could result in possibly unexpected behavior if the dataset itself
1467 has been removed from the datastore by another process whilst it is
1468 still in the cache.
1469 """
1470 chunk_size = 10_000
1471 dataset_existence: dict[DatasetRef, bool] = {}
1472 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1473 n_found_total = 0
1474 n_checked = 0
1475 n_chunks = 0
1476 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1477 chunk_result = self._mexists(chunk, artifact_existence)
1479 # The log message level and content depend on how many
1480 # datasets we are processing.
1481 n_results = len(chunk_result)
1483 # Use verbose logging to ensure that messages can be seen
1484 # easily if many refs are being checked.
1485 log_threshold = VERBOSE
1486 n_checked += n_results
1488 # This sum can take some time so only do it if we know the
1489 # result is going to be used.
1490 n_found = 0
1491 if log.isEnabledFor(log_threshold):
1492 # Can treat the booleans as 0, 1 integers and sum them.
1493 n_found = sum(chunk_result.values())
1494 n_found_total += n_found
1496 # We are deliberately not trying to count the number of refs
1497 # provided in case it's in the millions. This means there is a
1498 # situation where the number of refs exactly matches the chunk
1499 # size and we will switch to the multi-chunk path even though
1500 # we only have a single chunk.
1501 if n_results < chunk_size and n_chunks == 0:
1502 # Single chunk will be processed so we can provide more detail.
1503 if n_results == 1:
1504 ref = list(chunk_result)[0]
1505 # Use debug logging to be consistent with `exists()`.
1506 log.debug(
1507 "Calling mexists() with single ref that does%s exist (%s).",
1508 "" if chunk_result[ref] else " not",
1509 ref,
1510 )
1511 else:
1512 # Single chunk but multiple files. Summarize.
1513 log.log(
1514 log_threshold,
1515 "Number of datasets found in datastore: %d out of %d datasets checked.",
1516 n_found,
1517 n_checked,
1518 )
1520 else:
1521 # Use incremental verbose logging when we have multiple chunks.
1522 log.log(
1523 log_threshold,
1524 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1525 "(running total from all chunks so far: %d found out of %d checked)",
1526 n_chunks,
1527 n_found,
1528 n_results,
1529 n_found_total,
1530 n_checked,
1531 )
1532 dataset_existence.update(chunk_result)
1533 n_chunks += 1
1535 return dataset_existence
1537 def _mexists(
1538 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1539 ) -> dict[DatasetRef, bool]:
1540 """Check the existence of multiple datasets at once.
1542 Parameters
1543 ----------
1544 refs : iterable of `DatasetRef`
1545 The datasets to be checked.
1546 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1547 Optional mapping of datastore artifact to existence. Updated by
1548 this method with details of all artifacts tested. Can be `None`
1549 if the caller is not interested.
1551 Returns
1552 -------
1553 existence : `dict` of [`DatasetRef`, `bool`]
1554 Mapping from dataset to boolean indicating existence.
1555 """
1556 # Make a mapping from refs with the internal storage class to the given
1557 # refs that may have a different one. We'll use the internal refs
1558 # throughout this method and convert back at the very end.
1559 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1561 # Need a mapping of dataset_id to (internal) dataset ref since some
1562 # internal APIs work with dataset_id.
1563 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1565 # Set of all IDs we are checking for.
1566 requested_ids = set(id_to_ref.keys())
1568 # The records themselves. Could be missing some entries.
1569 records = self._get_stored_records_associated_with_refs(
1570 id_to_ref.values(), ignore_datastore_records=True
1571 )
1573 dataset_existence = self._process_mexists_records(
1574 id_to_ref, records, True, artifact_existence=artifact_existence
1575 )
1577 # Set of IDs that have been handled.
1578 handled_ids = {ref.id for ref in dataset_existence}
1580 missing_ids = requested_ids - handled_ids
1581 if missing_ids:
1582 dataset_existence.update(
1583 self._mexists_check_expected(
1584 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1585 )
1586 )
1588 return {
1589 internal_ref_to_input_ref[internal_ref]: existence
1590 for internal_ref, existence in dataset_existence.items()
1591 }
1593 def _mexists_check_expected(
1594 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1595 ) -> dict[DatasetRef, bool]:
1596 """Check existence of refs that are not known to datastore.
1598 Parameters
1599 ----------
1600 refs : iterable of `DatasetRef`
1601 The datasets to be checked. These are assumed not to be known
1602 to datastore.
1603 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1604 Optional mapping of datastore artifact to existence. Updated by
1605 this method with details of all artifacts tested. Can be `None`
1606 if the caller is not interested.
1608 Returns
1609 -------
1610 existence : `dict` of [`DatasetRef`, `bool`]
1611 Mapping from dataset to boolean indicating existence.
1612 """
1613 dataset_existence: dict[DatasetRef, bool] = {}
1614 if not self.trustGetRequest:
1615 # Must assume these do not exist
1616 for ref in refs:
1617 dataset_existence[ref] = False
1618 else:
1619 log.debug(
1620 "%d datasets were not known to datastore during initial existence check.",
1621 len(refs),
1622 )
1624 # Construct data structure identical to that returned
1625 # by _get_stored_records_associated_with_refs() but using
1626 # guessed names.
1627 records = {}
1628 id_to_ref = {}
1629 for missing_ref in refs:
1630 expected = self._get_expected_dataset_locations_info(missing_ref)
1631 dataset_id = missing_ref.id
1632 records[dataset_id] = [info for _, info in expected]
1633 id_to_ref[dataset_id] = missing_ref
1635 dataset_existence.update(
1636 self._process_mexists_records(
1637 id_to_ref,
1638 records,
1639 False,
1640 artifact_existence=artifact_existence,
1641 )
1642 )
1644 return dataset_existence
1646 def exists(self, ref: DatasetRef) -> bool:
1647 """Check if the dataset exists in the datastore.
1649 Parameters
1650 ----------
1651 ref : `DatasetRef`
1652 Reference to the required dataset.
1654 Returns
1655 -------
1656 exists : `bool`
1657 `True` if the entity exists in the `Datastore`.
1659 Notes
1660 -----
1661 The local cache is checked as a proxy for existence in the remote
1662 object store. It is possible that another process on a different
1663 compute node could remove the file from the object store even
1664 though it is present in the local cache.
1665 """
1666 ref = self._cast_storage_class(ref)
1667 # We cannot trust datastore records from ref, as many unit tests delete
1668 # datasets and check their existence.
1669 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True)
1671 # if we are being asked to trust that registry might not be correct
1672 # we ask for the expected locations and check them explicitly
1673 if not fileLocations:
1674 if not self.trustGetRequest:
1675 return False
1677 # First check the cache. If it is not found we must check
1678 # the datastore itself. Assume that any component in the cache
1679 # means that the dataset does exist somewhere.
1680 if self.cacheManager.known_to_cache(ref):
1681 return True
1683 # When we are guessing a dataset location we can not check
1684 # for the existence of every component since we can not
1685 # know if every component was written. Instead we check
1686 # for the existence of any of the expected locations.
1687 for location, _ in self._get_expected_dataset_locations_info(ref):
1688 if self._artifact_exists(location):
1689 return True
1690 return False
1692 # All listed artifacts must exist.
1693 for location, storedFileInfo in fileLocations:
1694 # Checking in cache needs the component ref.
1695 check_ref = ref
1696 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1697 check_ref = ref.makeComponentRef(component)
1698 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1699 continue
1701 if not self._artifact_exists(location):
1702 return False
1704 return True
1706 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1707 """Return URIs associated with dataset.
1709 Parameters
1710 ----------
1711 ref : `DatasetRef`
1712 Reference to the required dataset.
1713 predict : `bool`, optional
1714 If the datastore does not know about the dataset, controls whether
1715 it should return a predicted URI or not.
1717 Returns
1718 -------
1719 uris : `DatasetRefURIs`
1720 The URI to the primary artifact associated with this dataset (if
1721 the dataset was disassembled within the datastore this may be
1722 `None`), and the URIs to any components associated with the dataset
1723 artifact. (can be empty if there are no components).
1724 """
1725 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1726 return many[ref]
1728 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1729 """URI to the Dataset.
1731 Parameters
1732 ----------
1733 ref : `DatasetRef`
1734 Reference to the required Dataset.
1735 predict : `bool`
1736 If `True`, allow URIs to be returned of datasets that have not
1737 been written.
1739 Returns
1740 -------
1741 uri : `str`
1742 URI pointing to the dataset within the datastore. If the
1743 dataset does not exist in the datastore, and if ``predict`` is
1744 `True`, the URI will be a prediction and will include a URI
1745 fragment "#predicted".
1746 If the datastore does not have entities that relate well
1747 to the concept of a URI the returned URI will be
1748 descriptive. The returned URI is not guaranteed to be obtainable.
1750 Raises
1751 ------
1752 FileNotFoundError
1753 Raised if a URI has been requested for a dataset that does not
1754 exist and guessing is not allowed.
1755 RuntimeError
1756 Raised if a request is made for a single URI but multiple URIs
1757 are associated with this dataset.
1759 Notes
1760 -----
1761 When a predicted URI is requested an attempt will be made to form
1762 a reasonable URI based on file templates and the expected formatter.
1763 """
1764 primary, components = self.getURIs(ref, predict)
1765 if primary is None or components:
1766 raise RuntimeError(
1767 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1768 )
1769 return primary
1771 def _predict_URIs(
1772 self,
1773 ref: DatasetRef,
1774 ) -> DatasetRefURIs:
1775 """Predict the URIs of a dataset ref.
1777 Parameters
1778 ----------
1779 ref : `DatasetRef`
1780 Reference to the required Dataset.
1782 Returns
1783 -------
1784 URI : DatasetRefUris
1785 Primary and component URIs. URIs will contain a URI fragment
1786 "#predicted".
1787 """
1788 uris = DatasetRefURIs()
1790 if self.composites.shouldBeDisassembled(ref):
1791 for component, _ in ref.datasetType.storageClass.components.items():
1792 comp_ref = ref.makeComponentRef(component)
1793 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1795 # Add the "#predicted" URI fragment to indicate this is a
1796 # guess
1797 uris.componentURIs[component] = ResourcePath(
1798 comp_location.uri.geturl() + "#predicted", forceDirectory=comp_location.uri.dirLike
1799 )
1801 else:
1802 location, _ = self._determine_put_formatter_location(ref)
1804 # Add the "#predicted" URI fragment to indicate this is a guess
1805 uris.primaryURI = ResourcePath(
1806 location.uri.geturl() + "#predicted", forceDirectory=location.uri.dirLike
1807 )
1809 return uris
1811 def getManyURIs(
1812 self,
1813 refs: Iterable[DatasetRef],
1814 predict: bool = False,
1815 allow_missing: bool = False,
1816 ) -> dict[DatasetRef, DatasetRefURIs]:
1817 # Docstring inherited
1819 uris: dict[DatasetRef, DatasetRefURIs] = {}
1821 records = self._get_stored_records_associated_with_refs(refs)
1822 records_keys = records.keys()
1824 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1825 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1827 # Have to handle trustGetRequest mode by checking for the existence
1828 # of the missing refs on disk.
1829 if missing_refs:
1830 dataset_existence = self._mexists_check_expected(missing_refs, None)
1831 really_missing = set()
1832 not_missing = set()
1833 for ref, exists in dataset_existence.items():
1834 if exists:
1835 not_missing.add(ref)
1836 else:
1837 really_missing.add(ref)
1839 if not_missing:
1840 # Need to recalculate the missing/existing split.
1841 existing_refs = existing_refs + tuple(not_missing)
1842 missing_refs = tuple(really_missing)
1844 for ref in missing_refs:
1845 # if this has never been written then we have to guess
1846 if not predict:
1847 if not allow_missing:
1848 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1849 else:
1850 uris[ref] = self._predict_URIs(ref)
1852 for ref in existing_refs:
1853 file_infos = records[ref.id]
1854 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1855 uris[ref] = self._locations_to_URI(ref, file_locations)
1857 return uris
1859 def _locations_to_URI(
1860 self,
1861 ref: DatasetRef,
1862 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1863 ) -> DatasetRefURIs:
1864 """Convert one or more file locations associated with a DatasetRef
1865 to a DatasetRefURIs.
1867 Parameters
1868 ----------
1869 ref : `DatasetRef`
1870 Reference to the dataset.
1871 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
1872 Each item in the sequence is the location of the dataset within the
1873 datastore and stored information about the file and its formatter.
1874 If there is only one item in the sequence then it is treated as the
1875 primary URI. If there is more than one item then they are treated
1876 as component URIs. If there are no items then an error is raised
1877 unless ``self.trustGetRequest`` is `True`.
1879 Returns
1880 -------
1881 uris: DatasetRefURIs
1882 Represents the primary URI or component URIs described by the
1883 inputs.
1885 Raises
1886 ------
1887 RuntimeError
1888 If no file locations are passed in and ``self.trustGetRequest`` is
1889 `False`.
1890 FileNotFoundError
1891 If the a passed-in URI does not exist, and ``self.trustGetRequest``
1892 is `False`.
1893 RuntimeError
1894 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is
1895 unexpected).
1896 """
1897 guessing = False
1898 uris = DatasetRefURIs()
1900 if not file_locations:
1901 if not self.trustGetRequest:
1902 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1903 file_locations = self._get_expected_dataset_locations_info(ref)
1904 guessing = True
1906 if len(file_locations) == 1:
1907 # No disassembly so this is the primary URI
1908 uris.primaryURI = file_locations[0][0].uri
1909 if guessing and not uris.primaryURI.exists():
1910 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1911 else:
1912 for location, file_info in file_locations:
1913 if file_info.component is None:
1914 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1915 if guessing and not location.uri.exists():
1916 # If we are trusting then it is entirely possible for
1917 # some components to be missing. In that case we skip
1918 # to the next component.
1919 if self.trustGetRequest:
1920 continue
1921 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1922 uris.componentURIs[file_info.component] = location.uri
1924 return uris
1926 def retrieveArtifacts(
1927 self,
1928 refs: Iterable[DatasetRef],
1929 destination: ResourcePath,
1930 transfer: str = "auto",
1931 preserve_path: bool = True,
1932 overwrite: bool = False,
1933 ) -> list[ResourcePath]:
1934 """Retrieve the file artifacts associated with the supplied refs.
1936 Parameters
1937 ----------
1938 refs : iterable of `DatasetRef`
1939 The datasets for which file artifacts are to be retrieved.
1940 A single ref can result in multiple files. The refs must
1941 be resolved.
1942 destination : `lsst.resources.ResourcePath`
1943 Location to write the file artifacts.
1944 transfer : `str`, optional
1945 Method to use to transfer the artifacts. Must be one of the options
1946 supported by `lsst.resources.ResourcePath.transfer_from()`.
1947 "move" is not allowed.
1948 preserve_path : `bool`, optional
1949 If `True` the full path of the file artifact within the datastore
1950 is preserved. If `False` the final file component of the path
1951 is used.
1952 overwrite : `bool`, optional
1953 If `True` allow transfers to overwrite existing files at the
1954 destination.
1956 Returns
1957 -------
1958 targets : `list` of `lsst.resources.ResourcePath`
1959 URIs of file artifacts in destination location. Order is not
1960 preserved.
1961 """
1962 if not destination.isdir():
1963 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1965 if transfer == "move":
1966 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1968 # Source -> Destination
1969 # This also helps filter out duplicate DatasetRef in the request
1970 # that will map to the same underlying file transfer.
1971 to_transfer: dict[ResourcePath, ResourcePath] = {}
1973 for ref in refs:
1974 locations = self._get_dataset_locations_info(ref)
1975 for location, _ in locations:
1976 source_uri = location.uri
1977 target_path: ResourcePathExpression
1978 if preserve_path:
1979 target_path = location.pathInStore
1980 if target_path.isabs():
1981 # This is an absolute path to an external file.
1982 # Use the full path.
1983 target_path = target_path.relativeToPathRoot
1984 else:
1985 target_path = source_uri.basename()
1986 target_uri = destination.join(target_path)
1987 to_transfer[source_uri] = target_uri
1989 # In theory can now parallelize the transfer
1990 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1991 for source_uri, target_uri in to_transfer.items():
1992 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1994 return list(to_transfer.values())
1996 def get(
1997 self,
1998 ref: DatasetRef,
1999 parameters: Mapping[str, Any] | None = None,
2000 storageClass: StorageClass | str | None = None,
2001 ) -> Any:
2002 """Load an InMemoryDataset from the store.
2004 Parameters
2005 ----------
2006 ref : `DatasetRef`
2007 Reference to the required Dataset.
2008 parameters : `dict`
2009 `StorageClass`-specific parameters that specify, for example,
2010 a slice of the dataset to be loaded.
2011 storageClass : `StorageClass` or `str`, optional
2012 The storage class to be used to override the Python type
2013 returned by this method. By default the returned type matches
2014 the dataset type definition for this dataset. Specifying a
2015 read `StorageClass` can force a different type to be returned.
2016 This type must be compatible with the original type.
2018 Returns
2019 -------
2020 inMemoryDataset : `object`
2021 Requested dataset or slice thereof as an InMemoryDataset.
2023 Raises
2024 ------
2025 FileNotFoundError
2026 Requested dataset can not be retrieved.
2027 TypeError
2028 Return value from formatter has unexpected type.
2029 ValueError
2030 Formatter failed to process the dataset.
2031 """
2032 # Supplied storage class for the component being read is either
2033 # from the ref itself or some an override if we want to force
2034 # type conversion.
2035 if storageClass is not None:
2036 ref = ref.overrideStorageClass(storageClass)
2038 allGetInfo = self._prepare_for_direct_get(ref, parameters)
2039 return get_dataset_as_python_object_from_get_info(
2040 allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager
2041 )
2043 def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload:
2044 # Docstring inherited
2046 # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the
2047 # client should have time to download a large file with retries if
2048 # needed, but short enough that it will become obvious quickly that
2049 # these URLs expire.
2050 # From a strictly technical standpoint there is no reason this
2051 # shouldn't be a day or more, but there seems to be a political issue
2052 # where people think there is a risk of end users posting presigned
2053 # URLs for people without access rights to download.
2054 url_expiration_time_seconds = 1 * 60 * 60
2056 def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo:
2057 location, file_info = info
2058 return FileDatastoreGetPayloadFileInfo(
2059 url=location.uri.generate_presigned_get_url(
2060 expiration_time_seconds=url_expiration_time_seconds
2061 ),
2062 datastoreRecords=file_info.to_simple(),
2063 )
2065 return FileDatastoreGetPayload(
2066 datastore_type="file",
2067 dataset_ref=ref.to_simple(),
2068 file_info=[to_file_info_payload(info) for info in self._get_dataset_locations_info(ref)],
2069 )
2071 @transactional
2072 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2073 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2075 Parameters
2076 ----------
2077 inMemoryDataset : `object`
2078 The dataset to store.
2079 ref : `DatasetRef`
2080 Reference to the associated Dataset.
2082 Raises
2083 ------
2084 TypeError
2085 Supplied object and storage class are inconsistent.
2086 DatasetTypeNotSupportedError
2087 The associated `DatasetType` is not handled by this datastore.
2089 Notes
2090 -----
2091 If the datastore is configured to reject certain dataset types it
2092 is possible that the put will fail and raise a
2093 `DatasetTypeNotSupportedError`. The main use case for this is to
2094 allow `ChainedDatastore` to put to multiple datastores without
2095 requiring that every datastore accepts the dataset.
2096 """
2097 doDisassembly = self.composites.shouldBeDisassembled(ref)
2098 # doDisassembly = True
2100 artifacts = []
2101 if doDisassembly:
2102 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2103 if components is None:
2104 raise RuntimeError(
2105 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2106 f"with storage class {ref.datasetType.storageClass.name} "
2107 "is configured to be disassembled, but cannot be."
2108 )
2109 for component, componentInfo in components.items():
2110 # Don't recurse because we want to take advantage of
2111 # bulk insert -- need a new DatasetRef that refers to the
2112 # same dataset_id but has the component DatasetType
2113 # DatasetType does not refer to the types of components
2114 # So we construct one ourselves.
2115 compRef = ref.makeComponentRef(component)
2116 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2117 artifacts.append((compRef, storedInfo))
2118 else:
2119 # Write the entire thing out
2120 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2121 artifacts.append((ref, storedInfo))
2123 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT)
2125 @transactional
2126 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
2127 doDisassembly = self.composites.shouldBeDisassembled(ref)
2128 # doDisassembly = True
2130 artifacts = []
2131 if doDisassembly:
2132 components = ref.datasetType.storageClass.delegate().disassemble(in_memory_dataset)
2133 if components is None:
2134 raise RuntimeError(
2135 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2136 f"with storage class {ref.datasetType.storageClass.name} "
2137 "is configured to be disassembled, but cannot be."
2138 )
2139 for component, componentInfo in components.items():
2140 # Don't recurse because we want to take advantage of
2141 # bulk insert -- need a new DatasetRef that refers to the
2142 # same dataset_id but has the component DatasetType
2143 # DatasetType does not refer to the types of components
2144 # So we construct one ourselves.
2145 compRef = ref.makeComponentRef(component)
2146 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2147 artifacts.append((compRef, storedInfo))
2148 else:
2149 # Write the entire thing out
2150 storedInfo = self._write_in_memory_to_artifact(in_memory_dataset, ref)
2151 artifacts.append((ref, storedInfo))
2153 ref_records = {self._opaque_table_name: [info for _, info in artifacts]}
2154 ref = ref.replace(datastore_records=ref_records)
2155 return {self.name: ref}
2157 @transactional
2158 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2159 # At this point can safely remove these datasets from the cache
2160 # to avoid confusion later on. If they are not trashed later
2161 # the cache will simply be refilled.
2162 self.cacheManager.remove_from_cache(ref)
2164 # If we are in trust mode there will be nothing to move to
2165 # the trash table and we will have to try to delete the file
2166 # immediately.
2167 if self.trustGetRequest:
2168 # Try to keep the logic below for a single file trash.
2169 if isinstance(ref, DatasetRef):
2170 refs = {ref}
2171 else:
2172 # Will recreate ref at the end of this branch.
2173 refs = set(ref)
2175 # Determine which datasets are known to datastore directly.
2176 id_to_ref = {ref.id: ref for ref in refs}
2177 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2178 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2180 missing = refs - existing_refs
2181 if missing:
2182 # Do an explicit existence check on these refs.
2183 # We only care about the artifacts at this point and not
2184 # the dataset existence.
2185 artifact_existence: dict[ResourcePath, bool] = {}
2186 _ = self.mexists(missing, artifact_existence)
2187 uris = [uri for uri, exists in artifact_existence.items() if exists]
2189 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2190 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2191 for uri in uris:
2192 try:
2193 uri.remove()
2194 except Exception as e:
2195 if ignore_errors:
2196 log.debug("Artifact %s could not be removed: %s", uri, e)
2197 continue
2198 raise
2200 # There is no point asking the code below to remove refs we
2201 # know are missing so update it with the list of existing
2202 # records. Try to retain one vs many logic.
2203 if not existing_refs:
2204 # Nothing more to do since none of the datasets were
2205 # known to the datastore record table.
2206 return
2207 ref = list(existing_refs)
2208 if len(ref) == 1:
2209 ref = ref[0]
2211 # Get file metadata and internal metadata
2212 if not isinstance(ref, DatasetRef):
2213 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2214 # Assumed to be an iterable of refs so bulk mode enabled.
2215 try:
2216 self.bridge.moveToTrash(ref, transaction=self._transaction)
2217 except Exception as e:
2218 if ignore_errors:
2219 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2220 else:
2221 raise
2222 return
2224 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2226 fileLocations = self._get_dataset_locations_info(ref)
2228 if not fileLocations:
2229 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2230 if ignore_errors:
2231 log.warning(err_msg)
2232 return
2233 else:
2234 raise FileNotFoundError(err_msg)
2236 for location, _ in fileLocations:
2237 if not self._artifact_exists(location):
2238 err_msg = (
2239 f"Dataset is known to datastore {self.name} but "
2240 f"associated artifact ({location.uri}) is missing"
2241 )
2242 if ignore_errors:
2243 log.warning(err_msg)
2244 return
2245 else:
2246 raise FileNotFoundError(err_msg)
2248 # Mark dataset as trashed
2249 try:
2250 self.bridge.moveToTrash([ref], transaction=self._transaction)
2251 except Exception as e:
2252 if ignore_errors:
2253 log.warning(
2254 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2255 "but encountered an error: %s",
2256 ref,
2257 self.name,
2258 e,
2259 )
2260 pass
2261 else:
2262 raise
2264 @transactional
2265 def emptyTrash(self, ignore_errors: bool = True) -> None:
2266 """Remove all datasets from the trash.
2268 Parameters
2269 ----------
2270 ignore_errors : `bool`
2271 If `True` return without error even if something went wrong.
2272 Problems could occur if another process is simultaneously trying
2273 to delete.
2274 """
2275 log.debug("Emptying trash in datastore %s", self.name)
2277 # Context manager will empty trash iff we finish it without raising.
2278 # It will also automatically delete the relevant rows from the
2279 # trash table and the records table.
2280 with self.bridge.emptyTrash(
2281 self._table, record_class=StoredFileInfo, record_column="path"
2282 ) as trash_data:
2283 # Removing the artifacts themselves requires that the files are
2284 # not also associated with refs that are not to be trashed.
2285 # Therefore need to do a query with the file paths themselves
2286 # and return all the refs associated with them. Can only delete
2287 # a file if the refs to be trashed are the only refs associated
2288 # with the file.
2289 # This requires multiple copies of the trashed items
2290 trashed, artifacts_to_keep = trash_data
2292 if artifacts_to_keep is None:
2293 # The bridge is not helping us so have to work it out
2294 # ourselves. This is not going to be as efficient.
2295 trashed = list(trashed)
2297 # The instance check is for mypy since up to this point it
2298 # does not know the type of info.
2299 path_map = self._refs_associated_with_artifacts(
2300 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2301 )
2303 for ref, info in trashed:
2304 # Mypy needs to know this is not the base class
2305 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2307 path_map[info.path].remove(ref.id)
2308 if not path_map[info.path]:
2309 del path_map[info.path]
2311 artifacts_to_keep = set(path_map)
2313 for ref, info in trashed:
2314 # Should not happen for this implementation but need
2315 # to keep mypy happy.
2316 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2318 # Mypy needs to know this is not the base class
2319 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2321 if info.path in artifacts_to_keep:
2322 # This is a multi-dataset artifact and we are not
2323 # removing all associated refs.
2324 continue
2326 # Only trashed refs still known to datastore will be returned.
2327 location = info.file_location(self.locationFactory)
2329 # Point of no return for this artifact
2330 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2331 try:
2332 self._delete_artifact(location)
2333 except FileNotFoundError:
2334 # If the file itself has been deleted there is nothing
2335 # we can do about it. It is possible that trash has
2336 # been run in parallel in another process or someone
2337 # decided to delete the file. It is unlikely to come
2338 # back and so we should still continue with the removal
2339 # of the entry from the trash table. It is also possible
2340 # we removed it in a previous iteration if it was
2341 # a multi-dataset artifact. The delete artifact method
2342 # will log a debug message in this scenario.
2343 # Distinguishing file missing before trash started and
2344 # file already removed previously as part of this trash
2345 # is not worth the distinction with regards to potential
2346 # memory cost.
2347 pass
2348 except Exception as e:
2349 if ignore_errors:
2350 # Use a debug message here even though it's not
2351 # a good situation. In some cases this can be
2352 # caused by a race between user A and user B
2353 # and neither of them has permissions for the
2354 # other's files. Butler does not know about users
2355 # and trash has no idea what collections these
2356 # files were in (without guessing from a path).
2357 log.debug(
2358 "Encountered error removing artifact %s from datastore %s: %s",
2359 location.uri,
2360 self.name,
2361 e,
2362 )
2363 else:
2364 raise
2366 @transactional
2367 def transfer_from(
2368 self,
2369 source_datastore: Datastore,
2370 refs: Iterable[DatasetRef],
2371 transfer: str = "auto",
2372 artifact_existence: dict[ResourcePath, bool] | None = None,
2373 dry_run: bool = False,
2374 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2375 # Docstring inherited
2376 if type(self) is not type(source_datastore):
2377 raise TypeError(
2378 f"Datastore mismatch between this datastore ({type(self)}) and the "
2379 f"source datastore ({type(source_datastore)})."
2380 )
2382 # Be explicit for mypy
2383 if not isinstance(source_datastore, FileDatastore):
2384 raise TypeError(
2385 "Can only transfer to a FileDatastore from another FileDatastore, not"
2386 f" {type(source_datastore)}"
2387 )
2389 # Stop early if "direct" transfer mode is requested. That would
2390 # require that the URI inside the source datastore should be stored
2391 # directly in the target datastore, which seems unlikely to be useful
2392 # since at any moment the source datastore could delete the file.
2393 if transfer in ("direct", "split"):
2394 raise ValueError(
2395 f"Can not transfer from a source datastore using {transfer} mode since"
2396 " those files are controlled by the other datastore."
2397 )
2399 # Empty existence lookup if none given.
2400 if artifact_existence is None:
2401 artifact_existence = {}
2403 # We will go through the list multiple times so must convert
2404 # generators to lists.
2405 refs = list(refs)
2407 # In order to handle disassembled composites the code works
2408 # at the records level since it can assume that internal APIs
2409 # can be used.
2410 # - If the record already exists in the destination this is assumed
2411 # to be okay.
2412 # - If there is no record but the source and destination URIs are
2413 # identical no transfer is done but the record is added.
2414 # - If the source record refers to an absolute URI currently assume
2415 # that that URI should remain absolute and will be visible to the
2416 # destination butler. May need to have a flag to indicate whether
2417 # the dataset should be transferred. This will only happen if
2418 # the detached Butler has had a local ingest.
2420 # What we really want is all the records in the source datastore
2421 # associated with these refs. Or derived ones if they don't exist
2422 # in the source.
2423 source_records = source_datastore._get_stored_records_associated_with_refs(
2424 refs, ignore_datastore_records=True
2425 )
2427 # The source dataset_ids are the keys in these records
2428 source_ids = set(source_records)
2429 log.debug("Number of datastore records found in source: %d", len(source_ids))
2431 requested_ids = {ref.id for ref in refs}
2432 missing_ids = requested_ids - source_ids
2434 # Missing IDs can be okay if that datastore has allowed
2435 # gets based on file existence. Should we transfer what we can
2436 # or complain about it and warn?
2437 if missing_ids and not source_datastore.trustGetRequest:
2438 raise ValueError(
2439 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2440 )
2442 # Need to map these missing IDs to a DatasetRef so we can guess
2443 # the details.
2444 if missing_ids:
2445 log.info(
2446 "Number of expected datasets missing from source datastore records: %d out of %d",
2447 len(missing_ids),
2448 len(requested_ids),
2449 )
2450 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2452 # This should be chunked in case we end up having to check
2453 # the file store since we need some log output to show
2454 # progress.
2455 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2456 records = {}
2457 for missing in missing_ids_chunk:
2458 # Ask the source datastore where the missing artifacts
2459 # should be. An execution butler might not know about the
2460 # artifacts even if they are there.
2461 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2462 records[missing] = [info for _, info in expected]
2464 # Call the mexist helper method in case we have not already
2465 # checked these artifacts such that artifact_existence is
2466 # empty. This allows us to benefit from parallelism.
2467 # datastore.mexists() itself does not give us access to the
2468 # derived datastore record.
2469 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2470 ref_exists = source_datastore._process_mexists_records(
2471 id_to_ref, records, False, artifact_existence=artifact_existence
2472 )
2474 # Now go through the records and propagate the ones that exist.
2475 location_factory = source_datastore.locationFactory
2476 for missing, record_list in records.items():
2477 # Skip completely if the ref does not exist.
2478 ref = id_to_ref[missing]
2479 if not ref_exists[ref]:
2480 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2481 continue
2482 # Check for file artifact to decide which parts of a
2483 # disassembled composite do exist. If there is only a
2484 # single record we don't even need to look because it can't
2485 # be a composite and must exist.
2486 if len(record_list) == 1:
2487 dataset_records = record_list
2488 else:
2489 dataset_records = [
2490 record
2491 for record in record_list
2492 if artifact_existence[record.file_location(location_factory).uri]
2493 ]
2494 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2496 # Rely on source_records being a defaultdict.
2497 source_records[missing].extend(dataset_records)
2498 log.verbose("Completed scan for missing data files")
2500 # See if we already have these records
2501 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2503 # The artifacts to register
2504 artifacts = []
2506 # Refs that already exist
2507 already_present = []
2509 # Refs that were rejected by this datastore.
2510 rejected = set()
2512 # Refs that were transferred successfully.
2513 accepted = set()
2515 # Record each time we have done a "direct" transfer.
2516 direct_transfers = []
2518 # Now can transfer the artifacts
2519 for ref in refs:
2520 if not self.constraints.isAcceptable(ref):
2521 # This datastore should not be accepting this dataset.
2522 rejected.add(ref)
2523 continue
2525 accepted.add(ref)
2527 if ref.id in target_records:
2528 # Already have an artifact for this.
2529 already_present.append(ref)
2530 continue
2532 # mypy needs to know these are always resolved refs
2533 for info in source_records[ref.id]:
2534 source_location = info.file_location(source_datastore.locationFactory)
2535 target_location = info.file_location(self.locationFactory)
2536 if source_location == target_location and not source_location.pathInStore.isabs():
2537 # Artifact is already in the target location.
2538 # (which is how execution butler currently runs)
2539 pass
2540 else:
2541 if target_location.pathInStore.isabs():
2542 # Just because we can see the artifact when running
2543 # the transfer doesn't mean it will be generally
2544 # accessible to a user of this butler. Need to decide
2545 # what to do about an absolute path.
2546 if transfer == "auto":
2547 # For "auto" transfers we allow the absolute URI
2548 # to be recorded in the target datastore.
2549 direct_transfers.append(source_location)
2550 else:
2551 # The user is explicitly requesting a transfer
2552 # even for an absolute URI. This requires us to
2553 # calculate the target path.
2554 template_ref = ref
2555 if info.component:
2556 template_ref = ref.makeComponentRef(info.component)
2557 target_location = self._calculate_ingested_datastore_name(
2558 source_location.uri,
2559 template_ref,
2560 )
2562 info = info.update(path=target_location.pathInStore.path)
2564 # Need to transfer it to the new location.
2565 # Assume we should always overwrite. If the artifact
2566 # is there this might indicate that a previous transfer
2567 # was interrupted but was not able to be rolled back
2568 # completely (eg pre-emption) so follow Datastore default
2569 # and overwrite. Do not copy if we are in dry-run mode.
2570 if not dry_run:
2571 target_location.uri.transfer_from(
2572 source_location.uri,
2573 transfer=transfer,
2574 overwrite=True,
2575 transaction=self._transaction,
2576 )
2578 artifacts.append((ref, info))
2580 if direct_transfers:
2581 log.info(
2582 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2583 len(direct_transfers),
2584 "" if len(direct_transfers) == 1 else "s",
2585 )
2587 # We are overwriting previous datasets that may have already
2588 # existed. We therefore should ensure that we force the
2589 # datastore records to agree. Note that this can potentially lead
2590 # to difficulties if the dataset has previously been ingested
2591 # disassembled and is somehow now assembled, or vice versa.
2592 if not dry_run:
2593 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE)
2595 if already_present:
2596 n_skipped = len(already_present)
2597 log.info(
2598 "Skipped transfer of %d dataset%s already present in datastore",
2599 n_skipped,
2600 "" if n_skipped == 1 else "s",
2601 )
2603 return accepted, rejected
2605 @transactional
2606 def forget(self, refs: Iterable[DatasetRef]) -> None:
2607 # Docstring inherited.
2608 refs = list(refs)
2609 self.bridge.forget(refs)
2610 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2612 def validateConfiguration(
2613 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2614 ) -> None:
2615 """Validate some of the configuration for this datastore.
2617 Parameters
2618 ----------
2619 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2620 Entities to test against this configuration. Can be differing
2621 types.
2622 logFailures : `bool`, optional
2623 If `True`, output a log message for every validation error
2624 detected.
2626 Raises
2627 ------
2628 DatastoreValidationError
2629 Raised if there is a validation problem with a configuration.
2630 All the problems are reported in a single exception.
2632 Notes
2633 -----
2634 This method checks that all the supplied entities have valid file
2635 templates and also have formatters defined.
2636 """
2637 templateFailed = None
2638 try:
2639 self.templates.validateTemplates(entities, logFailures=logFailures)
2640 except FileTemplateValidationError as e:
2641 templateFailed = str(e)
2643 formatterFailed = []
2644 for entity in entities:
2645 try:
2646 self.formatterFactory.getFormatterClass(entity)
2647 except KeyError as e:
2648 formatterFailed.append(str(e))
2649 if logFailures:
2650 log.critical("Formatter failure: %s", e)
2652 if templateFailed or formatterFailed:
2653 messages = []
2654 if templateFailed:
2655 messages.append(templateFailed)
2656 if formatterFailed:
2657 messages.append(",".join(formatterFailed))
2658 msg = ";\n".join(messages)
2659 raise DatastoreValidationError(msg)
2661 def getLookupKeys(self) -> set[LookupKey]:
2662 # Docstring is inherited from base class
2663 return (
2664 self.templates.getLookupKeys()
2665 | self.formatterFactory.getLookupKeys()
2666 | self.constraints.getLookupKeys()
2667 )
2669 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2670 # Docstring is inherited from base class
2671 # The key can be valid in either formatters or templates so we can
2672 # only check the template if it exists
2673 if lookupKey in self.templates:
2674 try:
2675 self.templates[lookupKey].validateTemplate(entity)
2676 except FileTemplateValidationError as e:
2677 raise DatastoreValidationError(e) from e
2679 def export(
2680 self,
2681 refs: Iterable[DatasetRef],
2682 *,
2683 directory: ResourcePathExpression | None = None,
2684 transfer: str | None = "auto",
2685 ) -> Iterable[FileDataset]:
2686 # Docstring inherited from Datastore.export.
2687 if transfer == "auto" and directory is None:
2688 transfer = None
2690 if transfer is not None and directory is None:
2691 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2693 if transfer == "move":
2694 raise TypeError("Can not export by moving files out of datastore.")
2695 elif transfer == "direct":
2696 # For an export, treat this as equivalent to None. We do not
2697 # want an import to risk using absolute URIs to datasets owned
2698 # by another datastore.
2699 log.info("Treating 'direct' transfer mode as in-place export.")
2700 transfer = None
2702 # Force the directory to be a URI object
2703 directoryUri: ResourcePath | None = None
2704 if directory is not None:
2705 directoryUri = ResourcePath(directory, forceDirectory=True)
2707 if transfer is not None and directoryUri is not None and not directoryUri.exists():
2708 # mypy needs the second test
2709 raise FileNotFoundError(f"Export location {directory} does not exist")
2711 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2712 for ref in progress.wrap(refs, "Exporting dataset files"):
2713 fileLocations = self._get_dataset_locations_info(ref)
2714 if not fileLocations:
2715 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2716 # For now we can not export disassembled datasets
2717 if len(fileLocations) > 1:
2718 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2719 location, storedFileInfo = fileLocations[0]
2721 pathInStore = location.pathInStore.path
2722 if transfer is None:
2723 # TODO: do we also need to return the readStorageClass somehow?
2724 # We will use the path in store directly. If this is an
2725 # absolute URI, preserve it.
2726 if location.pathInStore.isabs():
2727 pathInStore = str(location.uri)
2728 elif transfer == "direct":
2729 # Use full URIs to the remote store in the export
2730 pathInStore = str(location.uri)
2731 else:
2732 # mypy needs help
2733 assert directoryUri is not None, "directoryUri must be defined to get here"
2734 storeUri = ResourcePath(location.uri, forceDirectory=False)
2736 # if the datastore has an absolute URI to a resource, we
2737 # have two options:
2738 # 1. Keep the absolute URI in the exported YAML
2739 # 2. Allocate a new name in the local datastore and transfer
2740 # it.
2741 # For now go with option 2
2742 if location.pathInStore.isabs():
2743 template = self.templates.getTemplate(ref)
2744 newURI = ResourcePath(template.format(ref), forceAbsolute=False, forceDirectory=False)
2745 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2747 exportUri = directoryUri.join(pathInStore)
2748 exportUri.transfer_from(storeUri, transfer=transfer)
2750 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2752 @staticmethod
2753 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2754 """Compute the checksum of the supplied file.
2756 Parameters
2757 ----------
2758 uri : `lsst.resources.ResourcePath`
2759 Name of resource to calculate checksum from.
2760 algorithm : `str`, optional
2761 Name of algorithm to use. Must be one of the algorithms supported
2762 by :py:class`hashlib`.
2763 block_size : `int`
2764 Number of bytes to read from file at one time.
2766 Returns
2767 -------
2768 hexdigest : `str`
2769 Hex digest of the file.
2771 Notes
2772 -----
2773 Currently returns None if the URI is for a remote resource.
2774 """
2775 if algorithm not in hashlib.algorithms_guaranteed:
2776 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2778 if not uri.isLocal:
2779 return None
2781 hasher = hashlib.new(algorithm)
2783 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f:
2784 for chunk in iter(lambda: f.read(block_size), b""):
2785 hasher.update(chunk)
2787 return hasher.hexdigest()
2789 def needs_expanded_data_ids(
2790 self,
2791 transfer: str | None,
2792 entity: DatasetRef | DatasetType | StorageClass | None = None,
2793 ) -> bool:
2794 # Docstring inherited.
2795 # This _could_ also use entity to inspect whether the filename template
2796 # involves placeholders other than the required dimensions for its
2797 # dataset type, but that's not necessary for correctness; it just
2798 # enables more optimizations (perhaps only in theory).
2799 return transfer not in ("direct", None)
2801 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2802 # Docstring inherited from the base class.
2803 record_data = data.get(self.name)
2804 if not record_data:
2805 return
2807 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records)
2809 # TODO: Verify that there are no unexpected table names in the dict?
2810 unpacked_records = []
2811 for dataset_id, dataset_data in record_data.records.items():
2812 records = dataset_data.get(self._table.name)
2813 if records:
2814 for info in records:
2815 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2816 unpacked_records.append(info.to_record(dataset_id=dataset_id))
2817 if unpacked_records:
2818 self._table.insert(*unpacked_records, transaction=self._transaction)
2820 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2821 # Docstring inherited from the base class.
2822 exported_refs = list(self._bridge.check(refs))
2823 ids = {ref.id for ref in exported_refs}
2824 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
2825 for row in self._table.fetch(dataset_id=ids):
2826 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2827 dataset_records = records.setdefault(row["dataset_id"], {})
2828 dataset_records.setdefault(self._table.name, []).append(info)
2830 record_data = DatastoreRecordData(records=records)
2831 return {self.name: record_data}
2833 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
2834 # Docstring inherited from the base class.
2835 self._retrieve_dataset_method = method
2837 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
2838 """Update dataset reference to use the storage class from registry."""
2839 if self._retrieve_dataset_method is None:
2840 # We could raise an exception here but unit tests do not define
2841 # this method.
2842 return ref
2843 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
2844 if dataset_type is not None:
2845 ref = ref.overrideStorageClass(dataset_type.storageClass)
2846 return ref
2848 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
2849 # Docstring inherited from the base class.
2850 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}