Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%
924 statements
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Generic file-based datastore code."""
30from __future__ import annotations
32__all__ = ("FileDatastore",)
34import contextlib
35import hashlib
36import logging
37from collections import defaultdict
38from collections.abc import Callable, Collection, Iterable, Mapping, Sequence
39from typing import TYPE_CHECKING, Any, ClassVar, cast
41from lsst.daf.butler import (
42 Config,
43 DatasetId,
44 DatasetRef,
45 DatasetType,
46 DatasetTypeNotSupportedError,
47 FileDataset,
48 FileDescriptor,
49 Formatter,
50 FormatterFactory,
51 Location,
52 LocationFactory,
53 Progress,
54 StorageClass,
55 ddl,
56)
57from lsst.daf.butler.datastore import (
58 DatasetRefURIs,
59 Datastore,
60 DatastoreConfig,
61 DatastoreOpaqueTable,
62 DatastoreValidationError,
63)
64from lsst.daf.butler.datastore.cache_manager import (
65 AbstractDatastoreCacheManager,
66 DatastoreCacheManager,
67 DatastoreDisabledCacheManager,
68)
69from lsst.daf.butler.datastore.composites import CompositesMap
70from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError
71from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore
72from lsst.daf.butler.datastore.record_data import DatastoreRecordData
73from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo
74from lsst.daf.butler.datastores.file_datastore.get import (
75 DatasetLocationInformation,
76 DatastoreFileGetInformation,
77 generate_datastore_get_information,
78 get_dataset_as_python_object_from_get_info,
79)
80from lsst.daf.butler.datastores.fileDatastoreClient import (
81 FileDatastoreGetPayload,
82 FileDatastoreGetPayloadFileInfo,
83)
84from lsst.daf.butler.registry.interfaces import (
85 DatabaseInsertMode,
86 DatastoreRegistryBridge,
87 FakeDatasetRef,
88 ReadOnlyDatabaseError,
89)
90from lsst.daf.butler.repo_relocation import replaceRoot
91from lsst.daf.butler.utils import transactional
92from lsst.resources import ResourcePath, ResourcePathExpression
93from lsst.utils.introspection import get_class_of
94from lsst.utils.iteration import chunk_iterable
96# For VERBOSE logging usage.
97from lsst.utils.logging import VERBOSE, getLogger
98from sqlalchemy import BigInteger, String
100if TYPE_CHECKING:
101 from lsst.daf.butler import LookupKey
102 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
104log = getLogger(__name__)
107class _IngestPrepData(Datastore.IngestPrepData):
108 """Helper class for FileDatastore ingest implementation.
110 Parameters
111 ----------
112 datasets : `~collections.abc.Iterable` of `FileDataset`
113 Files to be ingested by this datastore.
114 """
116 def __init__(self, datasets: Iterable[FileDataset]):
117 super().__init__(ref for dataset in datasets for ref in dataset.refs)
118 self.datasets = datasets
121class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
122 """Generic Datastore for file-based implementations.
124 Should always be sub-classed since key abstract methods are missing.
126 Parameters
127 ----------
128 config : `DatastoreConfig` or `str`
129 Configuration as either a `Config` object or URI to file.
130 bridgeManager : `DatastoreRegistryBridgeManager`
131 Object that manages the interface between `Registry` and datastores.
132 root : `ResourcePath`
133 Root directory URI of this `Datastore`.
134 formatterFactory : `FormatterFactory`
135 Factory for creating instances of formatters.
136 templates : `FileTemplates`
137 File templates that can be used by this `Datastore`.
138 composites : `CompositesMap`
139 Determines whether a dataset should be disassembled on put.
140 trustGetRequest : `bool`
141 Determine whether we can fall back to configuration if a requested
142 dataset is not known to registry.
144 Raises
145 ------
146 ValueError
147 If root location does not exist and ``create`` is `False` in the
148 configuration.
149 """
151 defaultConfigFile: ClassVar[str | None] = None
152 """Path to configuration defaults. Accessed within the ``config`` resource
153 or relative to a search path. Can be None if no defaults specified.
154 """
156 root: ResourcePath
157 """Root directory URI of this `Datastore`."""
159 locationFactory: LocationFactory
160 """Factory for creating locations relative to the datastore root."""
162 formatterFactory: FormatterFactory
163 """Factory for creating instances of formatters."""
165 templates: FileTemplates
166 """File templates that can be used by this `Datastore`."""
168 composites: CompositesMap
169 """Determines whether a dataset should be disassembled on put."""
171 defaultConfigFile = "datastores/fileDatastore.yaml"
172 """Path to configuration defaults. Accessed within the ``config`` resource
173 or relative to a search path. Can be None if no defaults specified.
174 """
176 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
177 """Callable that is used in trusted mode to retrieve registry definition
178 of a named dataset type.
179 """
181 @classmethod
182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
183 """Set any filesystem-dependent config options for this Datastore to
184 be appropriate for a new empty repository with the given root.
186 Parameters
187 ----------
188 root : `str`
189 URI to the root of the data repository.
190 config : `Config`
191 A `Config` to update. Only the subset understood by
192 this component will be updated. Will not expand
193 defaults.
194 full : `Config`
195 A complete config with all defaults expanded that can be
196 converted to a `DatastoreConfig`. Read-only and will not be
197 modified by this method.
198 Repository-specific options that should not be obtained
199 from defaults when Butler instances are constructed
200 should be copied from ``full`` to ``config``.
201 overwrite : `bool`, optional
202 If `False`, do not modify a value in ``config`` if the value
203 already exists. Default is always to overwrite with the provided
204 ``root``.
206 Notes
207 -----
208 If a keyword is explicitly defined in the supplied ``config`` it
209 will not be overridden by this method if ``overwrite`` is `False`.
210 This allows explicit values set in external configs to be retained.
211 """
212 Config.updateParameters(
213 DatastoreConfig,
214 config,
215 full,
216 toUpdate={"root": root},
217 toCopy=("cls", ("records", "table")),
218 overwrite=overwrite,
219 )
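# Editor's note: a hedged sketch of how ``setConfigRoot`` is typically used
# when creating a new repository (the variable names below are hypothetical;
# ``full_config`` stands for an already fully-expanded datastore config):
#
#   new_config = Config()   # subset that will be persisted with the repo
#   FileDatastore.setConfigRoot("file:///data/repo", new_config, full_config)
#   # new_config now holds the repo-specific "root" plus the copied
#   # "cls" and ("records", "table") values; other defaults are not expanded.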
221 @classmethod
222 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
223 return ddl.TableSpec(
224 fields=[
225 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
226 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
227 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
228 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
229 # Use empty string to indicate no component
230 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
231 # TODO: should checksum be Base64Bytes instead?
232 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
233 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
234 ],
235 unique=frozenset(),
236 indexes=[ddl.IndexSpec("path")],
237 )
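# Editor's note: a sketch of one record produced under this spec, written as
# the equivalent plain dict (all values are illustrative only):
#
#   {
#       "dataset_id": UUID("..."),   # primary key together with "component"
#       "path": "run/datasetType/file.fits",
#       "formatter": "fully.qualified.FormatterClass",
#       "storage_class": "ExposureF",
#       "component": "",             # empty string means "no component"
#       "checksum": None,            # populated only when checksums are enabled
#       "file_size": 123456,
#   }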
239 def __init__(
240 self,
241 config: DatastoreConfig,
242 bridgeManager: DatastoreRegistryBridgeManager,
243 root: ResourcePath,
244 formatterFactory: FormatterFactory,
245 templates: FileTemplates,
246 composites: CompositesMap,
247 trustGetRequest: bool,
248 ):
249 super().__init__(config, bridgeManager)
250 self.root = ResourcePath(root)
251 self.formatterFactory = formatterFactory
252 self.templates = templates
253 self.composites = composites
254 self.trustGetRequest = trustGetRequest
256 # Name ourselves either using an explicit name or a name
257 # derived from the (unexpanded) root
258 if "name" in self.config:
259 self.name = self.config["name"]
260 else:
261 # We use the unexpanded root in the name to indicate that this
262 # datastore can be moved without having to update registry.
263 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
265 self.locationFactory = LocationFactory(self.root)
267 self._opaque_table_name = self.config["records", "table"]
268 try:
269 # Storage of paths and formatters, keyed by dataset_id
270 self._table = bridgeManager.opaque.register(
271 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType)
272 )
273 # Interface to Registry.
274 self._bridge = bridgeManager.register(self.name)
275 except ReadOnlyDatabaseError:
276 # If the database is read only and we just tried and failed to
277 # create a table, it means someone is trying to create a read-only
278 # butler client for an empty repo. That should be okay, as long
 279 as they don't then try to get any datasets before some other client
280 # creates the table. Chances are they're just validating
281 # configuration.
282 pass
284 # Determine whether checksums should be used - default to False
285 self.useChecksum = self.config.get("checksum", False)
287 # Create a cache manager
288 self.cacheManager: AbstractDatastoreCacheManager
289 if "cached" in self.config:
290 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
291 else:
292 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
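# Editor's note: an illustrative (not authoritative) fragment of the kind of
# configuration consumed above; only keys referenced in this class are shown
# and all values are placeholders:
#
#   datastore:
#     root: <butlerRoot>
#     create: true
#     records:
#       table: <opaque table name>
#     checksum: false
#     trust_get_request: false
#     formatters: ...     # formatter selection by storage class / dataset type
#     templates: ...      # file naming templates
#     composites: ...     # disassembly rules
#     cached: ...         # presence of "cached" enables DatastoreCacheManager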
294 @classmethod
295 def _create_from_config(
296 cls,
297 config: DatastoreConfig,
298 bridgeManager: DatastoreRegistryBridgeManager,
299 butlerRoot: ResourcePathExpression | None,
300 ) -> FileDatastore:
301 if "root" not in config:
302 raise ValueError("No root directory specified in configuration")
304 # Support repository relocation in config
305 # Existence of self.root is checked in subclass
306 root = ResourcePath(replaceRoot(config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True)
308 # Now associate formatters with storage classes
309 formatterFactory = FormatterFactory()
310 formatterFactory.registerFormatters(config["formatters"], universe=bridgeManager.universe)
312 # Read the file naming templates
313 templates = FileTemplates(config["templates"], universe=bridgeManager.universe)
315 # See if composites should be disassembled
316 composites = CompositesMap(config["composites"], universe=bridgeManager.universe)
318 # Determine whether we can fall back to configuration if a
319 # requested dataset is not known to registry
320 trustGetRequest = config.get("trust_get_request", False)
322 self = FileDatastore(
323 config, bridgeManager, root, formatterFactory, templates, composites, trustGetRequest
324 )
326 # Check existence and create directory structure if necessary
327 if not self.root.exists():
328 if "create" not in self.config or not self.config["create"]:
329 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
330 try:
331 self.root.mkdir()
332 except Exception as e:
333 raise ValueError(
334 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
335 ) from e
337 return self
339 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore:
340 return FileDatastore(
341 self.config,
342 bridgeManager,
343 self.root,
344 self.formatterFactory,
345 self.templates,
346 self.composites,
347 self.trustGetRequest,
348 )
350 def __str__(self) -> str:
351 return str(self.root)
353 @property
354 def bridge(self) -> DatastoreRegistryBridge:
355 return self._bridge
357 @property
358 def roots(self) -> dict[str, ResourcePath | None]:
359 # Docstring inherited.
360 return {self.name: self.root}
362 def _set_trust_mode(self, mode: bool) -> None:
363 self.trustGetRequest = mode
365 def _artifact_exists(self, location: Location) -> bool:
366 """Check that an artifact exists in this datastore at the specified
367 location.
369 Parameters
370 ----------
371 location : `Location`
372 Expected location of the artifact associated with this datastore.
374 Returns
375 -------
376 exists : `bool`
 377 `True` if the location can be found, `False` otherwise.
378 """
379 log.debug("Checking if resource exists: %s", location.uri)
380 return location.uri.exists()
382 def _delete_artifact(self, location: Location) -> None:
383 """Delete the artifact from the datastore.
385 Parameters
386 ----------
387 location : `Location`
388 Location of the artifact associated with this datastore.
389 """
390 if location.pathInStore.isabs():
391 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
393 try:
394 location.uri.remove()
395 except FileNotFoundError:
396 log.debug("File %s did not exist and so could not be deleted.", location.uri)
397 raise
398 except Exception as e:
399 log.critical("Failed to delete file: %s (%s)", location.uri, e)
400 raise
401 log.debug("Successfully deleted file: %s", location.uri)
403 def addStoredItemInfo(
404 self,
405 refs: Iterable[DatasetRef],
406 infos: Iterable[StoredFileInfo],
407 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
408 ) -> None:
409 """Record internal storage information associated with one or more
410 datasets.
412 Parameters
413 ----------
414 refs : sequence of `DatasetRef`
415 The datasets that have been stored.
416 infos : sequence of `StoredDatastoreItemInfo`
417 Metadata associated with the stored datasets.
418 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`
419 Mode to use to insert the new records into the table. The
420 options are ``INSERT`` (error if pre-existing), ``REPLACE``
421 (replace content with new values), and ``ENSURE`` (skip if the row
422 already exists).
423 """
424 records = [
425 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True)
426 ]
427 match insert_mode:
428 case DatabaseInsertMode.INSERT:
429 self._table.insert(*records, transaction=self._transaction)
430 case DatabaseInsertMode.ENSURE:
431 self._table.ensure(*records, transaction=self._transaction)
432 case DatabaseInsertMode.REPLACE:
433 self._table.replace(*records, transaction=self._transaction)
434 case _:
435 raise ValueError(f"Unknown insert mode of '{insert_mode}'")
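# Editor's note: a hedged usage sketch of the insert modes dispatched above
# (``datastore``, ``refs`` and ``infos`` are assumed to already exist):
#
#   # Fail if a record for the same (dataset_id, component) already exists.
#   datastore.addStoredItemInfo(refs, infos, insert_mode=DatabaseInsertMode.INSERT)
#   # Skip rows that already exist.
#   datastore.addStoredItemInfo(refs, infos, insert_mode=DatabaseInsertMode.ENSURE)
#   # Overwrite existing rows with the new values.
#   datastore.addStoredItemInfo(refs, infos, insert_mode=DatabaseInsertMode.REPLACE)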
437 def getStoredItemsInfo(
438 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
439 ) -> list[StoredFileInfo]:
440 """Retrieve information associated with files stored in this
441 `Datastore` associated with this dataset ref.
443 Parameters
444 ----------
445 ref : `DatasetRef`
446 The dataset that is to be queried.
447 ignore_datastore_records : `bool`
448 If `True` then do not use datastore records stored in refs.
450 Returns
451 -------
452 items : `~collections.abc.Iterable` [`StoredDatastoreItemInfo`]
 453 Stored information about the files and their formatters
 454 for this dataset. Only one file will be returned
455 if the dataset has not been disassembled. Can return an empty
456 list if no matching datasets can be found.
457 """
458 # Try to get them from the ref first.
459 if ref._datastore_records is not None and not ignore_datastore_records:
460 if (ref_records := ref._datastore_records.get(self._table.name)) is not None:
461 # Need to make sure they have correct type.
462 for record in ref_records:
463 if not isinstance(record, StoredFileInfo):
464 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}")
465 return cast(list[StoredFileInfo], ref_records)
467 # Look for the dataset_id -- there might be multiple matches
468 # if we have disassembled the dataset.
469 records = self._table.fetch(dataset_id=ref.id)
470 return [StoredFileInfo.from_record(record) for record in records]
472 def _register_datasets(
473 self,
474 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]],
475 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
476 ) -> None:
477 """Update registry to indicate that one or more datasets have been
478 stored.
480 Parameters
481 ----------
 482 refsAndInfos : sequence of `tuple` [`DatasetRef`,
483 `StoredDatastoreItemInfo`]
484 Datasets to register and the internal datastore metadata associated
485 with them.
 486 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`, optional
 487 Indicate whether the new records should be new (``INSERT``, default),
 488 allowed to exist already (``ENSURE``), or replaced if already
 489 present (``REPLACE``).
490 """
491 expandedRefs: list[DatasetRef] = []
492 expandedItemInfos: list[StoredFileInfo] = []
494 for ref, itemInfo in refsAndInfos:
495 expandedRefs.append(ref)
496 expandedItemInfos.append(itemInfo)
498 # Dataset location only cares about registry ID so if we have
499 # disassembled in datastore we have to deduplicate. Since they
500 # will have different datasetTypes we can't use a set
501 registryRefs = {r.id: r for r in expandedRefs}
502 if insert_mode == DatabaseInsertMode.INSERT:
503 self.bridge.insert(registryRefs.values())
504 else:
505 # There are only two columns and all that matters is the
506 # dataset ID.
507 self.bridge.ensure(registryRefs.values())
508 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode)
510 def _get_stored_records_associated_with_refs(
511 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False
512 ) -> dict[DatasetId, list[StoredFileInfo]]:
513 """Retrieve all records associated with the provided refs.
515 Parameters
516 ----------
517 refs : iterable of `DatasetIdRef`
518 The refs for which records are to be retrieved.
519 ignore_datastore_records : `bool`
520 If `True` then do not use datastore records stored in refs.
522 Returns
523 -------
524 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
525 The matching records indexed by the ref ID. The number of entries
526 in the dict can be smaller than the number of requested refs.
527 """
528 # Check datastore records in refs first.
529 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list)
530 refs_with_no_records = []
531 for ref in refs:
532 if ignore_datastore_records or ref._datastore_records is None:
533 refs_with_no_records.append(ref)
534 else:
535 if (ref_records := ref._datastore_records.get(self._table.name)) is not None:
536 # Need to make sure they have correct type.
537 for ref_record in ref_records:
538 if not isinstance(ref_record, StoredFileInfo):
539 raise TypeError(
540 f"Datastore record has unexpected type {ref_record.__class__.__name__}"
541 )
542 records_by_ref[ref.id].append(ref_record)
544 # If there were any refs without datastore records, check opaque table.
545 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records])
547 # Uniqueness is dataset_id + component so can have multiple records
548 # per ref.
549 for record in records:
550 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
551 return records_by_ref
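# Editor's note: shape of the value returned above, sketched for a simple
# dataset and a disassembled composite (IDs and component names are
# illustrative only):
#
#   {
#       simple_ref.id: [StoredFileInfo(component=None, ...)],
#       composite_ref.id: [
#           StoredFileInfo(component="image", ...),
#           StoredFileInfo(component="mask", ...),
#       ],
#       # refs with no records are simply absent from the dict
#   }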
553 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]:
554 """Return paths and associated dataset refs.
556 Parameters
557 ----------
558 paths : `list` of `str` or `lsst.resources.ResourcePath`
559 All the paths to include in search.
561 Returns
562 -------
563 mapping : `dict` of [`str`, `set` [`DatasetId`]]
564 Mapping of each path to a set of associated database IDs.
565 """
566 records = self._table.fetch(path=[str(path) for path in paths])
567 result = defaultdict(set)
568 for row in records:
569 result[row["path"]].add(row["dataset_id"])
570 return result
572 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]:
573 """Return all dataset refs associated with the supplied path.
575 Parameters
576 ----------
577 pathInStore : `lsst.resources.ResourcePath`
578 Path of interest in the data store.
580 Returns
581 -------
 582 ids : `set` [`DatasetId`]
583 All `DatasetRef` IDs associated with this path.
584 """
585 records = list(self._table.fetch(path=str(pathInStore)))
586 ids = {r["dataset_id"] for r in records}
587 return ids
589 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
590 """Remove information about the file associated with this dataset.
592 Parameters
593 ----------
594 ref : `DatasetRef`
595 The dataset that has been removed.
596 """
597 # Note that this method is actually not used by this implementation,
598 # we depend on bridge to delete opaque records. But there are some
599 # tests that check that this method works, so we keep it for now.
600 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
602 def _get_dataset_locations_info(
603 self, ref: DatasetIdRef, ignore_datastore_records: bool = False
604 ) -> list[DatasetLocationInformation]:
605 r"""Find all the `Location`\ s of the requested dataset in the
606 `Datastore` and the associated stored file information.
608 Parameters
609 ----------
610 ref : `DatasetRef`
611 Reference to the required `Dataset`.
612 ignore_datastore_records : `bool`
613 If `True` then do not use datastore records stored in refs.
615 Returns
616 -------
617 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
618 Location of the dataset within the datastore and
619 stored information about each file and its formatter.
620 """
621 # Get the file information (this will fail if no file)
622 records = self.getStoredItemsInfo(ref, ignore_datastore_records)
624 # Use the path to determine the location -- we need to take
625 # into account absolute URIs in the datastore record
626 return [(r.file_location(self.locationFactory), r) for r in records]
628 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
629 """Check that there is only one dataset associated with the
630 specified artifact.
632 Parameters
633 ----------
634 ref : `DatasetRef` or `FakeDatasetRef`
635 Dataset to be removed.
636 location : `Location`
637 The location of the artifact to be removed.
639 Returns
640 -------
 641 can_remove : `bool`
 642 `True` if the artifact can be safely removed.
643 """
644 # Can't ever delete absolute URIs.
645 if location.pathInStore.isabs():
646 return False
648 # Get all entries associated with this path
649 allRefs = self._registered_refs_per_artifact(location.pathInStore)
650 if not allRefs:
651 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
653 # Remove these refs from all the refs and if there is nothing left
654 # then we can delete
655 remainingRefs = allRefs - {ref.id}
657 if remainingRefs:
658 return False
659 return True
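# Editor's note: the rule implemented above, restated (descriptive only):
#
#   absolute URI stored in the record      -> never removable (False)
#   path not present in the opaque table   -> RuntimeError (inconsistency)
#   other dataset IDs still share the path -> not removable (False)
#   only this dataset ID uses the path     -> removable (True)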
661 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]:
662 """Predict the location and related file information of the requested
663 dataset in this datastore.
665 Parameters
666 ----------
667 ref : `DatasetRef`
668 Reference to the required `Dataset`.
670 Returns
671 -------
672 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
673 Expected Location of the dataset within the datastore and
674 placeholder information about each file and its formatter.
676 Notes
677 -----
678 Uses the current configuration to determine how we would expect the
679 datastore files to have been written if we couldn't ask registry.
680 This is safe so long as there has been no change to datastore
681 configuration between writing the dataset and wanting to read it.
682 Will not work for files that have been ingested without using the
683 standard file template or default formatter.
684 """
685 # If we have a component ref we always need to ask the questions
686 # of the composite. If the composite is disassembled this routine
687 # should return all components. If the composite was not
688 # disassembled the composite is what is stored regardless of
689 # component request. Note that if the caller has disassembled
690 # a composite there is no way for this guess to know that
691 # without trying both the composite and component ref and seeing
692 # if there is something at the component Location even without
693 # disassembly being enabled.
694 if ref.datasetType.isComponent():
695 ref = ref.makeCompositeRef()
697 # See if the ref is a composite that should be disassembled
698 doDisassembly = self.composites.shouldBeDisassembled(ref)
700 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = []
702 if doDisassembly:
703 for component, componentStorage in ref.datasetType.storageClass.components.items():
704 compRef = ref.makeComponentRef(component)
705 location, formatter = self._determine_put_formatter_location(compRef)
706 all_info.append((location, formatter, componentStorage, component))
708 else:
709 # Always use the composite ref if no disassembly
710 location, formatter = self._determine_put_formatter_location(ref)
711 all_info.append((location, formatter, ref.datasetType.storageClass, None))
713 # Convert the list of tuples to have StoredFileInfo as second element
714 return [
715 (
716 location,
717 StoredFileInfo(
718 formatter=formatter,
719 path=location.pathInStore.path,
720 storageClass=storageClass,
721 component=component,
722 checksum=None,
723 file_size=-1,
724 ),
725 )
726 for location, formatter, storageClass, component in all_info
727 ]
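# Editor's note: a hedged sketch of the "trusted" prediction above. For a
# non-disassembled dataset the guess is driven entirely by the file template
# and formatter configuration; the template string mentioned below is
# hypothetical.
#
#   # template e.g. "{run}/{datasetType}/{datasetType}_{visit}_{detector}"
#   location, formatter = datastore._determine_put_formatter_location(ref)
#   info = StoredFileInfo(
#       formatter=formatter,
#       path=location.pathInStore.path,
#       storageClass=ref.datasetType.storageClass,
#       component=None,
#       checksum=None,
#       file_size=-1,   # placeholder: the real size is unknown without a stat
#   )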
729 def _prepare_for_direct_get(
730 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None
731 ) -> list[DatastoreFileGetInformation]:
732 """Check parameters for ``get`` and obtain formatter and
733 location.
735 Parameters
736 ----------
737 ref : `DatasetRef`
738 Reference to the required Dataset.
739 parameters : `dict`
740 `StorageClass`-specific parameters that specify, for example,
741 a slice of the dataset to be loaded.
743 Returns
744 -------
745 getInfo : `list` [`DatastoreFileGetInformation`]
746 Parameters needed to retrieve each file.
747 """
748 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
750 # The storage class we want to use eventually
751 refStorageClass = ref.datasetType.storageClass
753 # For trusted mode need to reset storage class.
754 ref = self._cast_storage_class(ref)
756 # Get file metadata and internal metadata
757 fileLocations = self._get_dataset_locations_info(ref)
758 if not fileLocations:
759 if not self.trustGetRequest:
760 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
761 # Assume the dataset is where we think it should be
762 fileLocations = self._get_expected_dataset_locations_info(ref)
764 if len(fileLocations) > 1:
765 # If trust is involved it is possible that there will be
766 # components listed here that do not exist in the datastore.
767 # Explicitly check for file artifact existence and filter out any
768 # that are missing.
769 if self.trustGetRequest:
770 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
772 # For now complain only if we have no components at all. One
773 # component is probably a problem but we can punt that to the
774 # assembler.
775 if not fileLocations:
776 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
778 return generate_datastore_get_information(
779 fileLocations,
780 readStorageClass=refStorageClass,
781 ref=ref,
782 parameters=parameters,
783 )
785 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]:
786 """Check the arguments for ``put`` and obtain formatter and
787 location.
789 Parameters
790 ----------
791 inMemoryDataset : `object`
792 The dataset to store.
793 ref : `DatasetRef`
794 Reference to the associated Dataset.
796 Returns
797 -------
798 location : `Location`
799 The location to write the dataset.
800 formatter : `Formatter`
801 The `Formatter` to use to write the dataset.
803 Raises
804 ------
805 TypeError
806 Supplied object and storage class are inconsistent.
807 DatasetTypeNotSupportedError
808 The associated `DatasetType` is not handled by this datastore.
809 """
810 self._validate_put_parameters(inMemoryDataset, ref)
811 return self._determine_put_formatter_location(ref)
813 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]:
814 """Calculate the formatter and output location to use for put.
816 Parameters
817 ----------
818 ref : `DatasetRef`
819 Reference to the associated Dataset.
821 Returns
822 -------
823 location : `Location`
824 The location to write the dataset.
825 formatter : `Formatter`
826 The `Formatter` to use to write the dataset.
827 """
828 # Work out output file name
829 try:
830 template = self.templates.getTemplate(ref)
831 except KeyError as e:
832 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
834 # Validate the template to protect against filenames from different
835 # dataIds returning the same and causing overwrite confusion.
836 template.validateTemplate(ref)
838 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True)
840 # Get the formatter based on the storage class
841 storageClass = ref.datasetType.storageClass
842 try:
843 formatter = self.formatterFactory.getFormatter(
844 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
845 )
846 except KeyError as e:
847 raise DatasetTypeNotSupportedError(
848 f"Unable to find formatter for {ref} in datastore {self.name}"
849 ) from e
851 # Now that we know the formatter, update the location
852 location = formatter.makeUpdatedLocation(location)
854 return location, formatter
856 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None:
857 # Docstring inherited from base class
858 if transfer != "auto":
859 return transfer
861 # See if the paths are within the datastore or not
862 inside = [self._pathInStore(d.path) is not None for d in datasets]
864 if all(inside):
865 transfer = None
866 elif not any(inside):
867 # Allow ResourcePath to use its own knowledge
868 transfer = "auto"
869 else:
 870 # This can happen when importing from a datastore that
 871 # has had some datasets ingested using "direct" mode.
 872 # Also allow ResourcePath to sort it out but warn about it.
875 log.warning(
876 "Some datasets are inside the datastore and some are outside. Using 'split' "
877 "transfer mode. This assumes that the files outside the datastore are "
878 "still accessible to the new butler since they will not be copied into "
879 "the target datastore."
880 )
881 transfer = "split"
883 return transfer
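# Editor's note: the "auto" resolution above, summarized (descriptive only):
#
#   all paths inside the datastore root   -> transfer=None  (no copy needed)
#   no paths inside the datastore root    -> transfer="auto" (ResourcePath decides)
#   a mixture of inside and outside paths -> transfer="split" (with a warning)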
885 def _pathInStore(self, path: ResourcePathExpression) -> str | None:
886 """Return path relative to datastore root.
888 Parameters
889 ----------
890 path : `lsst.resources.ResourcePathExpression`
 891 Path to dataset. Can be an absolute URI. If relative, it is
 892 assumed to be relative to the datastore root.
895 Returns
896 -------
 897 inStore : `str` or `None`
898 Path relative to datastore root. Returns `None` if the file is
899 outside the root.
900 """
901 # Relative path will always be relative to datastore
902 pathUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)
903 return pathUri.relative_to(self.root)
905 def _standardizeIngestPath(
906 self, path: str | ResourcePath, *, transfer: str | None = None
907 ) -> str | ResourcePath:
908 """Standardize the path of a to-be-ingested file.
910 Parameters
911 ----------
912 path : `str` or `lsst.resources.ResourcePath`
913 Path of a file to be ingested. This parameter is not expected
914 to be all the types that can be used to construct a
915 `~lsst.resources.ResourcePath`.
916 transfer : `str`, optional
917 How (and whether) the dataset should be added to the datastore.
918 See `ingest` for details of transfer modes.
919 This implementation is provided only so
920 `NotImplementedError` can be raised if the mode is not supported;
921 actual transfers are deferred to `_extractIngestInfo`.
923 Returns
924 -------
925 path : `str` or `lsst.resources.ResourcePath`
926 New path in what the datastore considers standard form. If an
927 absolute URI was given that will be returned unchanged.
929 Notes
930 -----
931 Subclasses of `FileDatastore` can implement this method instead
932 of `_prepIngest`. It should not modify the data repository or given
933 file in any way.
935 Raises
936 ------
937 NotImplementedError
938 Raised if the datastore does not support the given transfer mode
939 (including the case where ingest is not supported at all).
940 FileNotFoundError
941 Raised if one of the given files does not exist.
942 """
943 if transfer not in (None, "direct", "split") + self.root.transferModes:
944 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
946 # A relative URI indicates relative to datastore root
947 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)
948 if not srcUri.isabs():
949 srcUri = self.root.join(path)
951 if not srcUri.exists():
952 raise FileNotFoundError(
953 f"Resource at {srcUri} does not exist; note that paths to ingest "
954 f"are assumed to be relative to {self.root} unless they are absolute."
955 )
957 if transfer is None:
958 relpath = srcUri.relative_to(self.root)
959 if not relpath:
960 raise RuntimeError(
961 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
962 )
964 # Return the relative path within the datastore for internal
965 # transfer
966 path = relpath
968 return path
970 def _extractIngestInfo(
971 self,
972 path: ResourcePathExpression,
973 ref: DatasetRef,
974 *,
975 formatter: Formatter | type[Formatter],
976 transfer: str | None = None,
977 record_validation_info: bool = True,
978 ) -> StoredFileInfo:
979 """Relocate (if necessary) and extract `StoredFileInfo` from a
980 to-be-ingested file.
982 Parameters
983 ----------
984 path : `lsst.resources.ResourcePathExpression`
985 URI or path of a file to be ingested.
986 ref : `DatasetRef`
987 Reference for the dataset being ingested. Guaranteed to have
 988 ``dataset_id not None``.
989 formatter : `type` or `Formatter`
990 `Formatter` subclass to use for this dataset or an instance.
991 transfer : `str`, optional
992 How (and whether) the dataset should be added to the datastore.
993 See `ingest` for details of transfer modes.
994 record_validation_info : `bool`, optional
995 If `True`, the default, the datastore can record validation
996 information associated with the file. If `False` the datastore
997 will not attempt to track any information such as checksums
998 or file sizes. This can be useful if such information is tracked
999 in an external system or if the file is to be compressed in place.
1000 It is up to the datastore whether this parameter is relevant.
1002 Returns
1003 -------
1004 info : `StoredFileInfo`
1005 Internal datastore record for this file. This will be inserted by
 1006 the caller; `_extractIngestInfo` is only responsible for
1007 creating and populating the struct.
1009 Raises
1010 ------
1011 FileNotFoundError
1012 Raised if one of the given files does not exist.
1013 FileExistsError
1014 Raised if transfer is not `None` but the (internal) location the
1015 file would be moved to is already occupied.
1016 """
1017 if self._transaction is None:
1018 raise RuntimeError("Ingest called without transaction enabled")
1020 # Create URI of the source path, do not need to force a relative
1021 # path to absolute.
1022 srcUri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)
1024 # Track whether we have read the size of the source yet
1025 have_sized = False
1027 tgtLocation: Location | None
1028 if transfer is None or transfer == "split":
1029 # A relative path is assumed to be relative to the datastore
1030 # in this context
1031 if not srcUri.isabs():
1032 tgtLocation = self.locationFactory.fromPath(srcUri.ospath, trusted_path=False)
1033 else:
1034 # Work out the path in the datastore from an absolute URI
1035 # This is required to be within the datastore.
1036 pathInStore = srcUri.relative_to(self.root)
1037 if pathInStore is None and transfer is None:
1038 raise RuntimeError(
1039 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
1040 )
1041 if pathInStore:
1042 tgtLocation = self.locationFactory.fromPath(pathInStore, trusted_path=True)
1043 elif transfer == "split":
1044 # Outside the datastore but treat that as a direct ingest
1045 # instead.
1046 tgtLocation = None
1047 else:
1048 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
1049 elif transfer == "direct":
1050 # Want to store the full URI to the resource directly in
1051 # datastore. This is useful for referring to permanent archive
1052 # storage for raw data.
1053 # Trust that people know what they are doing.
1054 tgtLocation = None
1055 else:
1056 # Work out the name we want this ingested file to have
1057 # inside the datastore
1058 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
1059 if not tgtLocation.uri.dirname().exists():
1060 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
1061 tgtLocation.uri.dirname().mkdir()
1063 # if we are transferring from a local file to a remote location
1064 # it may be more efficient to get the size and checksum of the
1065 # local file rather than the transferred one
1066 if record_validation_info and srcUri.isLocal:
1067 size = srcUri.size()
1068 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
1069 have_sized = True
1071 # Transfer the resource to the destination.
1072 # Allow overwrite of an existing file. This matches the behavior
1073 # of datastore.put() in that it trusts that registry would not
1074 # be asking to overwrite unless registry thought that the
1075 # overwrite was allowed.
1076 tgtLocation.uri.transfer_from(
1077 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
1078 )
1080 if tgtLocation is None:
1081 # This means we are using direct mode
1082 targetUri = srcUri
1083 targetPath = str(srcUri)
1084 else:
1085 targetUri = tgtLocation.uri
1086 targetPath = tgtLocation.pathInStore.path
1088 # the file should exist in the datastore now
1089 if record_validation_info:
1090 if not have_sized:
1091 size = targetUri.size()
1092 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
1093 else:
1094 # Not recording any file information.
1095 size = -1
1096 checksum = None
1098 return StoredFileInfo(
1099 formatter=formatter,
1100 path=targetPath,
1101 storageClass=ref.datasetType.storageClass,
1102 component=ref.datasetType.component(),
1103 file_size=size,
1104 checksum=checksum,
1105 )
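# Editor's note: summary of how the transfer mode maps to the stored path in
# the code above (descriptive only):
#
#   None / "split", source inside the root -> path stored relative to the root
#   "split" with source outside the root   -> full source URI stored, no copy
#   "direct"                               -> full source URI stored, no copy
#   "copy", "move", "link", ...            -> file transferred to a location
#                                             derived from the file template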
1107 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
1108 # Docstring inherited from Datastore._prepIngest.
1109 filtered = []
1110 for dataset in datasets:
1111 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1112 if not acceptable:
1113 continue
1114 else:
1115 dataset.refs = acceptable
1116 if dataset.formatter is None:
1117 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1118 else:
1119 assert isinstance(dataset.formatter, type | str)
1120 formatter_class = get_class_of(dataset.formatter)
1121 if not issubclass(formatter_class, Formatter):
1122 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1123 dataset.formatter = formatter_class
1124 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1125 filtered.append(dataset)
1126 return _IngestPrepData(filtered)
1128 @transactional
1129 def _finishIngest(
1130 self,
1131 prepData: Datastore.IngestPrepData,
1132 *,
1133 transfer: str | None = None,
1134 record_validation_info: bool = True,
1135 ) -> None:
1136 # Docstring inherited from Datastore._finishIngest.
1137 refsAndInfos = []
1138 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1139 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1140 # Do ingest as if the first dataset ref is associated with the file
1141 info = self._extractIngestInfo(
1142 dataset.path,
1143 dataset.refs[0],
1144 formatter=dataset.formatter,
1145 transfer=transfer,
1146 record_validation_info=record_validation_info,
1147 )
1148 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1150 # In direct mode we can allow repeated ingests of the same thing
1151 # if we are sure that the external dataset is immutable. We use
1152 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are
1153 # separated.
1154 refs_and_infos_replace = []
1155 refs_and_infos_insert = []
1156 if transfer == "direct":
1157 for entry in refsAndInfos:
1158 if entry[0].id.version == 5:
1159 refs_and_infos_replace.append(entry)
1160 else:
1161 refs_and_infos_insert.append(entry)
1162 else:
1163 refs_and_infos_insert = refsAndInfos
1165 if refs_and_infos_insert:
1166 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT)
1167 if refs_and_infos_replace:
1168 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE)
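# Editor's note: a minimal sketch of the UUID-version split used above for
# "direct" ingests (``refs_and_infos`` is assumed to be a list of
# (DatasetRef, StoredFileInfo) tuples):
#
#   replace = [(r, i) for r, i in refs_and_infos if r.id.version == 5]
#   insert = [(r, i) for r, i in refs_and_infos if r.id.version != 5]
#   # UUIDv5 ids are deterministic, so re-ingesting the same immutable
#   # external file maps to the same dataset_id and can use REPLACE semantics.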
1170 def _calculate_ingested_datastore_name(
1171 self,
1172 srcUri: ResourcePath,
1173 ref: DatasetRef,
1174 formatter: Formatter | type[Formatter] | None = None,
1175 ) -> Location:
1176 """Given a source URI and a DatasetRef, determine the name the
1177 dataset will have inside datastore.
1179 Parameters
1180 ----------
1181 srcUri : `lsst.resources.ResourcePath`
1182 URI to the source dataset file.
1183 ref : `DatasetRef`
1184 Ref associated with the newly-ingested dataset artifact. This
1185 is used to determine the name within the datastore.
 1186 formatter : `Formatter` or `Formatter` class, optional.
1187 Formatter to use for validation. Can be a class or an instance.
1188 No validation of the file extension is performed if the
1189 ``formatter`` is `None`. This can be used if the caller knows
1190 that the source URI and target URI will use the same formatter.
1192 Returns
1193 -------
1194 location : `Location`
1195 Target location for the newly-ingested dataset.
1196 """
1197 # Ingesting a file from outside the datastore.
1198 # This involves a new name.
1199 template = self.templates.getTemplate(ref)
1200 location = self.locationFactory.fromPath(template.format(ref), trusted_path=True)
1202 # Get the extension
1203 ext = srcUri.getExtension()
1205 # Update the destination to include that extension
1206 location.updateExtension(ext)
1208 # Ask the formatter to validate this extension
1209 if formatter is not None:
1210 formatter.validateExtension(location)
1212 return location
1214 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1215 """Write out in memory dataset to datastore.
1217 Parameters
1218 ----------
1219 inMemoryDataset : `object`
1220 Dataset to write to datastore.
1221 ref : `DatasetRef`
1222 Registry information associated with this dataset.
1224 Returns
1225 -------
1226 info : `StoredFileInfo`
1227 Information describing the artifact written to the datastore.
1228 """
1229 # May need to coerce the in memory dataset to the correct
1230 # python type, but first we need to make sure the storage class
1231 # reflects the one defined in the data repository.
1232 ref = self._cast_storage_class(ref)
1233 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1235 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1236 uri = location.uri
1238 if not uri.dirname().exists():
1239 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1240 uri.dirname().mkdir()
1242 if self._transaction is None:
1243 raise RuntimeError("Attempting to write artifact without transaction enabled")
1245 def _removeFileExists(uri: ResourcePath) -> None:
1246 """Remove a file and do not complain if it is not there.
1248 This is important since a formatter might fail before the file
1249 is written and we should not confuse people by writing spurious
1250 error messages to the log.
1251 """
1252 with contextlib.suppress(FileNotFoundError):
1253 uri.remove()
1255 # Register a callback to try to delete the uploaded data if
1256 # something fails below
1257 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1259 data_written = False
1261 # For remote URIs some datasets can be serialized directly
1262 # to bytes and sent to the remote datastore without writing a
1263 # file. If the dataset is intended to be saved to the cache
1264 # a file is always written and direct write to the remote
1265 # datastore is bypassed.
1266 if not uri.isLocal and not self.cacheManager.should_be_cached(ref):
1267 # Remote URI that is not cached so can write directly.
1268 try:
1269 serializedDataset = formatter.toBytes(inMemoryDataset)
1270 except NotImplementedError:
1271 # Fallback to the file writing option.
1272 pass
1273 except Exception as e:
1274 raise RuntimeError(
1275 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1276 ) from e
1277 else:
1278 log.debug("Writing bytes directly to %s", uri)
1279 uri.write(serializedDataset, overwrite=True)
1280 log.debug("Successfully wrote bytes directly to %s", uri)
1281 data_written = True
1283 if not data_written:
1284 # Did not write the bytes directly to object store so instead
1285 # write to temporary file. Always write to a temporary even if
1286 # using a local file system -- that gives us atomic writes.
1287 # If a process is killed as the file is being written we do not
1288 # want it to remain in the correct place but in corrupt state.
1289 # For local files write to the output directory not temporary dir.
1290 prefix = uri.dirname() if uri.isLocal else None
1291 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1292 # Need to configure the formatter to write to a different
1293 # location and that needs us to overwrite internals
1294 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1295 with formatter._updateLocation(Location(None, temporary_uri)):
1296 try:
1297 formatter.write(inMemoryDataset)
1298 except Exception as e:
1299 raise RuntimeError(
1300 f"Failed to serialize dataset {ref} of type"
1301 f" {type(inMemoryDataset)} to "
1302 f"temporary location {temporary_uri}"
1303 ) from e
1305 # Use move for a local file since that becomes an efficient
1306 # os.rename. For remote resources we use copy to allow the
1307 # file to be cached afterwards.
1308 transfer = "move" if uri.isLocal else "copy"
1310 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1312 if transfer == "copy":
1313 # Cache if required
1314 self.cacheManager.move_to_cache(temporary_uri, ref)
1316 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
 1318 # URI is needed to resolve what ingest case we are dealing with
1319 return self._extractIngestInfo(uri, ref, formatter=formatter)
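# Editor's note: decision flow for the write above, summarized (descriptive
# only):
#
#   remote URI and dataset not destined for the cache
#       -> try formatter.toBytes() and upload the bytes directly
#   otherwise (local URI, cacheable dataset, or toBytes unsupported)
#       -> write to a temporary file, then "move" (local) or "copy" (remote)
#          into place; on "copy" the temporary is offered to the cache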
1321 def knows(self, ref: DatasetRef) -> bool:
1322 """Check if the dataset is known to the datastore.
1324 Does not check for existence of any artifact.
1326 Parameters
1327 ----------
1328 ref : `DatasetRef`
1329 Reference to the required dataset.
1331 Returns
1332 -------
1333 exists : `bool`
1334 `True` if the dataset is known to the datastore.
1335 """
1336 # We cannot trust datastore records from ref, as many unit tests delete
1337 # datasets and check their existence.
1338 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True)
1339 if fileLocations:
1340 return True
1341 return False
1343 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1344 # Docstring inherited from the base class.
1346 # The records themselves. Could be missing some entries.
1347 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
1349 return {ref: ref.id in records for ref in refs}
1351 def _process_mexists_records(
1352 self,
1353 id_to_ref: dict[DatasetId, DatasetRef],
1354 records: dict[DatasetId, list[StoredFileInfo]],
1355 all_required: bool,
1356 artifact_existence: dict[ResourcePath, bool] | None = None,
1357 ) -> dict[DatasetRef, bool]:
1358 """Check given records for existence.
1360 Helper function for `mexists()`.
1362 Parameters
1363 ----------
1364 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1365 Mapping of the dataset ID to the dataset ref itself.
1366 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1367 Records as generally returned by
1368 ``_get_stored_records_associated_with_refs``.
1369 all_required : `bool`
 1370 Flag to indicate whether all artifacts associated with a dataset
 1371 ID must exist for that dataset to be reported as existing.
1372 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1373 Optional mapping of datastore artifact to existence. Updated by
1374 this method with details of all artifacts tested. Can be `None`
1375 if the caller is not interested.
1377 Returns
1378 -------
1379 existence : `dict` of [`DatasetRef`, `bool`]
1380 Mapping from dataset to boolean indicating existence.
1381 """
1382 # The URIs to be checked and a mapping of those URIs to
1383 # the dataset ID.
1384 uris_to_check: list[ResourcePath] = []
1385 location_map: dict[ResourcePath, DatasetId] = {}
1387 location_factory = self.locationFactory
1389 uri_existence: dict[ResourcePath, bool] = {}
1390 for ref_id, infos in records.items():
1391 # Key is the dataset Id, value is list of StoredItemInfo
1392 uris = [info.file_location(location_factory).uri for info in infos]
1393 location_map.update({uri: ref_id for uri in uris})
1395 # Check the local cache directly for a dataset corresponding
1396 # to the remote URI.
1397 if self.cacheManager.file_count > 0:
1398 ref = id_to_ref[ref_id]
1399 for uri, storedFileInfo in zip(uris, infos, strict=True):
1400 check_ref = ref
1401 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1402 check_ref = ref.makeComponentRef(component)
1403 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1404 # Proxy for URI existence.
1405 uri_existence[uri] = True
1406 else:
1407 uris_to_check.append(uri)
1408 else:
1409 # Check all of them.
1410 uris_to_check.extend(uris)
1412 if artifact_existence is not None:
1413 # If a URI has already been checked remove it from the list
1414 # and immediately add the status to the output dict.
1415 filtered_uris_to_check = []
1416 for uri in uris_to_check:
1417 if uri in artifact_existence:
1418 uri_existence[uri] = artifact_existence[uri]
1419 else:
1420 filtered_uris_to_check.append(uri)
1421 uris_to_check = filtered_uris_to_check
1423 # Results.
1424 dataset_existence: dict[DatasetRef, bool] = {}
1426 uri_existence.update(ResourcePath.mexists(uris_to_check))
1427 for uri, exists in uri_existence.items():
1428 dataset_id = location_map[uri]
1429 ref = id_to_ref[dataset_id]
1431 # Disassembled composite needs to check all locations.
1432 # all_required indicates whether all need to exist or not.
1433 if ref in dataset_existence:
1434 if all_required:
1435 exists = dataset_existence[ref] and exists
1436 else:
1437 exists = dataset_existence[ref] or exists
1438 dataset_existence[ref] = exists
1440 if artifact_existence is not None:
1441 artifact_existence.update(uri_existence)
1443 return dataset_existence
1445 def mexists(
1446 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1447 ) -> dict[DatasetRef, bool]:
1448 """Check the existence of multiple datasets at once.
1450 Parameters
1451 ----------
1452 refs : iterable of `DatasetRef`
1453 The datasets to be checked.
1454 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1455 Optional mapping of datastore artifact to existence. Updated by
1456 this method with details of all artifacts tested. Can be `None`
1457 if the caller is not interested.
1459 Returns
1460 -------
1461 existence : `dict` of [`DatasetRef`, `bool`]
1462 Mapping from dataset to boolean indicating existence.
1464 Notes
1465 -----
1466 To minimize potentially costly remote existence checks, the local
 1467 cache is checked as a proxy for existence. If a cached file for this
 1468 `DatasetRef` exists, no check is made against the actual URI. This
 1469 could result in unexpected behavior if the dataset itself
1470 has been removed from the datastore by another process whilst it is
1471 still in the cache.
1472 """
1473 chunk_size = 10_000
1474 dataset_existence: dict[DatasetRef, bool] = {}
1475 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1476 n_found_total = 0
1477 n_checked = 0
1478 n_chunks = 0
1479 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1480 chunk_result = self._mexists(chunk, artifact_existence)
1482 # The log message level and content depend on how many
1483 # datasets we are processing.
1484 n_results = len(chunk_result)
1486 # Use verbose logging to ensure that messages can be seen
1487 # easily if many refs are being checked.
1488 log_threshold = VERBOSE
1489 n_checked += n_results
1491 # This sum can take some time so only do it if we know the
1492 # result is going to be used.
1493 n_found = 0
1494 if log.isEnabledFor(log_threshold):
1495 # Can treat the booleans as 0, 1 integers and sum them.
1496 n_found = sum(chunk_result.values())
1497 n_found_total += n_found
1499 # We are deliberately not trying to count the number of refs
1500 # provided in case it's in the millions. This means there is a
1501 # situation where the number of refs exactly matches the chunk
1502 # size and we will switch to the multi-chunk path even though
1503 # we only have a single chunk.
1504 if n_results < chunk_size and n_chunks == 0:
1505 # Single chunk will be processed so we can provide more detail.
1506 if n_results == 1:
1507 ref = list(chunk_result)[0]
1508 # Use debug logging to be consistent with `exists()`.
1509 log.debug(
1510 "Calling mexists() with single ref that does%s exist (%s).",
1511 "" if chunk_result[ref] else " not",
1512 ref,
1513 )
1514 else:
1515 # Single chunk but multiple files. Summarize.
1516 log.log(
1517 log_threshold,
1518 "Number of datasets found in datastore: %d out of %d datasets checked.",
1519 n_found,
1520 n_checked,
1521 )
1523 else:
1524 # Use incremental verbose logging when we have multiple chunks.
1525 log.log(
1526 log_threshold,
1527 "Number of datasets found in datastore for chunk %d: %d out of %d checked "
1528 "(running total from all chunks so far: %d found out of %d checked)",
1529 n_chunks,
1530 n_found,
1531 n_results,
1532 n_found_total,
1533 n_checked,
1534 )
1535 dataset_existence.update(chunk_result)
1536 n_chunks += 1
1538 return dataset_existence
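# Editor's note: a hedged usage sketch of the chunked existence check above
# (``butler_datastore`` and ``refs`` are assumed to already exist):
#
#   artifact_cache: dict[ResourcePath, bool] = {}
#   existence = butler_datastore.mexists(refs, artifact_existence=artifact_cache)
#   missing = [ref for ref, ok in existence.items() if not ok]
#   # artifact_cache can be passed to later calls to avoid re-checking URIs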
1540 def _mexists(
1541 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1542 ) -> dict[DatasetRef, bool]:
1543 """Check the existence of multiple datasets at once.
1545 Parameters
1546 ----------
1547 refs : iterable of `DatasetRef`
1548 The datasets to be checked.
1549 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1550 Optional mapping of datastore artifact to existence. Updated by
1551 this method with details of all artifacts tested. Can be `None`
1552 if the caller is not interested.
1554 Returns
1555 -------
1556 existence : `dict` of [`DatasetRef`, `bool`]
1557 Mapping from dataset to boolean indicating existence.
1558 """
1559 # Make a mapping from refs with the internal storage class to the given
1560 # refs that may have a different one. We'll use the internal refs
1561 # throughout this method and convert back at the very end.
1562 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1564 # Need a mapping of dataset_id to (internal) dataset ref since some
1565 # internal APIs work with dataset_id.
1566 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref}
1568 # Set of all IDs we are checking for.
1569 requested_ids = set(id_to_ref.keys())
1571 # The records themselves. Could be missing some entries.
1572 records = self._get_stored_records_associated_with_refs(
1573 id_to_ref.values(), ignore_datastore_records=True
1574 )
1576 dataset_existence = self._process_mexists_records(
1577 id_to_ref, records, True, artifact_existence=artifact_existence
1578 )
1580 # Set of IDs that have been handled.
1581 handled_ids = {ref.id for ref in dataset_existence}
1583 missing_ids = requested_ids - handled_ids
1584 if missing_ids:
1585 dataset_existence.update(
1586 self._mexists_check_expected(
1587 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1588 )
1589 )
1591 return {
1592 internal_ref_to_input_ref[internal_ref]: existence
1593 for internal_ref, existence in dataset_existence.items()
1594 }
1596 def _mexists_check_expected(
1597 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
1598 ) -> dict[DatasetRef, bool]:
1599 """Check existence of refs that are not known to datastore.
1601 Parameters
1602 ----------
1603 refs : iterable of `DatasetRef`
1604 The datasets to be checked. These are assumed not to be known
1605 to datastore.
1606 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1607 Optional mapping of datastore artifact to existence. Updated by
1608 this method with details of all artifacts tested. Can be `None`
1609 if the caller is not interested.
1611 Returns
1612 -------
1613 existence : `dict` of [`DatasetRef`, `bool`]
1614 Mapping from dataset to boolean indicating existence.
1615 """
1616 dataset_existence: dict[DatasetRef, bool] = {}
1617 if not self.trustGetRequest:
1618 # Must assume these do not exist
1619 for ref in refs:
1620 dataset_existence[ref] = False
1621 else:
1622 log.debug(
1623 "%d datasets were not known to datastore during initial existence check.",
1624 len(refs),
1625 )
1627 # Construct data structure identical to that returned
1628 # by _get_stored_records_associated_with_refs() but using
1629 # guessed names.
1630 records = {}
1631 id_to_ref = {}
1632 for missing_ref in refs:
1633 expected = self._get_expected_dataset_locations_info(missing_ref)
1634 dataset_id = missing_ref.id
1635 records[dataset_id] = [info for _, info in expected]
1636 id_to_ref[dataset_id] = missing_ref
1638 dataset_existence.update(
1639 self._process_mexists_records(
1640 id_to_ref,
1641 records,
1642 False,
1643 artifact_existence=artifact_existence,
1644 )
1645 )
1647 return dataset_existence
1649 def exists(self, ref: DatasetRef) -> bool:
1650 """Check if the dataset exists in the datastore.
1652 Parameters
1653 ----------
1654 ref : `DatasetRef`
1655 Reference to the required dataset.
1657 Returns
1658 -------
1659 exists : `bool`
1660 `True` if the entity exists in the `Datastore`.
1662 Notes
1663 -----
1664 The local cache is checked as a proxy for existence in the remote
1665 object store. It is possible that another process on a different
1666 compute node could remove the file from the object store even
1667 though it is present in the local cache.
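Examples
--------
A minimal sketch; ``datastore`` and ``ref`` are illustrative names for a
configured `FileDatastore` and a resolved `DatasetRef`::

    if datastore.exists(ref):
        uri = datastore.getURI(ref)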
1668 """
1669 ref = self._cast_storage_class(ref)
1670 # We cannot trust datastore records from ref, as many unit tests delete
1671 # datasets and check their existence.
1672 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True)
1674 # If we are being asked to trust that registry might not be correct,
1675 # we ask for the expected locations and check them explicitly.
1676 if not fileLocations:
1677 if not self.trustGetRequest:
1678 return False
1680 # First check the cache. If it is not found we must check
1681 # the datastore itself. Assume that any component in the cache
1682 # means that the dataset does exist somewhere.
1683 if self.cacheManager.known_to_cache(ref):
1684 return True
1686 # When we are guessing a dataset location we can not check
1687 # for the existence of every component since we can not
1688 # know if every component was written. Instead we check
1689 # for the existence of any of the expected locations.
1690 for location, _ in self._get_expected_dataset_locations_info(ref):
1691 if self._artifact_exists(location):
1692 return True
1693 return False
1695 # All listed artifacts must exist.
1696 for location, storedFileInfo in fileLocations:
1697 # Checking in cache needs the component ref.
1698 check_ref = ref
1699 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1700 check_ref = ref.makeComponentRef(component)
1701 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1702 continue
1704 if not self._artifact_exists(location):
1705 return False
1707 return True
1709 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1710 """Return URIs associated with dataset.
1712 Parameters
1713 ----------
1714 ref : `DatasetRef`
1715 Reference to the required dataset.
1716 predict : `bool`, optional
1717 If the datastore does not know about the dataset, controls whether
1718 it should return a predicted URI or not.
1720 Returns
1721 -------
1722 uris : `DatasetRefURIs`
1723 The URI to the primary artifact associated with this dataset (if
1724 the dataset was disassembled within the datastore this may be
1725 `None`), and the URIs to any components associated with the dataset
1726 artifact (can be empty if there are no components).
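Examples
--------
A hedged sketch; ``datastore`` and ``ref`` are assumptions for this
illustration::

    primary, components = datastore.getURIs(ref, predict=True)
    if primary is None:
        # Dataset was disassembled; inspect the per-component URIs.
        for component, uri in components.items():
            print(component, uri)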
1727 """
1728 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1729 return many[ref]
1731 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1732 """URI to the Dataset.
1734 Parameters
1735 ----------
1736 ref : `DatasetRef`
1737 Reference to the required Dataset.
1738 predict : `bool`
1739 If `True`, allow URIs to be returned of datasets that have not
1740 been written.
1742 Returns
1743 -------
1744 uri : `lsst.resources.ResourcePath`
1745 URI pointing to the dataset within the datastore. If the
1746 dataset does not exist in the datastore, and if ``predict`` is
1747 `True`, the URI will be a prediction and will include a URI
1748 fragment "#predicted".
1749 If the datastore does not have entities that relate well
1750 to the concept of a URI, the returned URI will be
1751 descriptive. The returned URI is not guaranteed to be obtainable.
1753 Raises
1754 ------
1755 FileNotFoundError
1756 Raised if a URI has been requested for a dataset that does not
1757 exist and guessing is not allowed.
1758 RuntimeError
1759 Raised if a request is made for a single URI but multiple URIs
1760 are associated with this dataset.
1762 Notes
1763 -----
1764 When a predicted URI is requested an attempt will be made to form
1765 a reasonable URI based on file templates and the expected formatter.
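Examples
--------
A minimal sketch (``datastore`` and ``ref`` are assumed, not defined
here); a predicted URI carries a "#predicted" fragment::

    uri = datastore.getURI(ref, predict=True)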
1766 """
1767 primary, components = self.getURIs(ref, predict)
1768 if primary is None or components:
1769 raise RuntimeError(
1770 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1771 )
1772 return primary
1774 def _predict_URIs(
1775 self,
1776 ref: DatasetRef,
1777 ) -> DatasetRefURIs:
1778 """Predict the URIs of a dataset ref.
1780 Parameters
1781 ----------
1782 ref : `DatasetRef`
1783 Reference to the required Dataset.
1785 Returns
1786 -------
1787 uris : `DatasetRefURIs`
1788 Primary and component URIs. URIs will contain a URI fragment
1789 "#predicted".
1790 """
1791 uris = DatasetRefURIs()
1793 if self.composites.shouldBeDisassembled(ref):
1794 for component, _ in ref.datasetType.storageClass.components.items():
1795 comp_ref = ref.makeComponentRef(component)
1796 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1798 # Add the "#predicted" URI fragment to indicate this is a
1799 # guess
1800 uris.componentURIs[component] = ResourcePath(
1801 comp_location.uri.geturl() + "#predicted", forceDirectory=comp_location.uri.dirLike
1802 )
1804 else:
1805 location, _ = self._determine_put_formatter_location(ref)
1807 # Add the "#predicted" URI fragment to indicate this is a guess
1808 uris.primaryURI = ResourcePath(
1809 location.uri.geturl() + "#predicted", forceDirectory=location.uri.dirLike
1810 )
1812 return uris
1814 def getManyURIs(
1815 self,
1816 refs: Iterable[DatasetRef],
1817 predict: bool = False,
1818 allow_missing: bool = False,
1819 ) -> dict[DatasetRef, DatasetRefURIs]:
1820 # Docstring inherited
1822 uris: dict[DatasetRef, DatasetRefURIs] = {}
1824 records = self._get_stored_records_associated_with_refs(refs)
1825 records_keys = records.keys()
1827 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1828 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1830 # Have to handle trustGetRequest mode by checking for the existence
1831 # of the missing refs on disk.
1832 if missing_refs:
1833 dataset_existence = self._mexists_check_expected(missing_refs, None)
1834 really_missing = set()
1835 not_missing = set()
1836 for ref, exists in dataset_existence.items():
1837 if exists:
1838 not_missing.add(ref)
1839 else:
1840 really_missing.add(ref)
1842 if not_missing:
1843 # Need to recalculate the missing/existing split.
1844 existing_refs = existing_refs + tuple(not_missing)
1845 missing_refs = tuple(really_missing)
1847 for ref in missing_refs:
1848 # if this has never been written then we have to guess
1849 if not predict:
1850 if not allow_missing:
1851 raise FileNotFoundError(f"Dataset {ref} not in this datastore.")
1852 else:
1853 uris[ref] = self._predict_URIs(ref)
1855 for ref in existing_refs:
1856 file_infos = records[ref.id]
1857 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1858 uris[ref] = self._locations_to_URI(ref, file_locations)
1860 return uris
1862 def _locations_to_URI(
1863 self,
1864 ref: DatasetRef,
1865 file_locations: Sequence[tuple[Location, StoredFileInfo]],
1866 ) -> DatasetRefURIs:
1867 """Convert one or more file locations associated with a DatasetRef
1868 to a DatasetRefURIs.
1870 Parameters
1871 ----------
1872 ref : `DatasetRef`
1873 Reference to the dataset.
1874 file_locations : Sequence[tuple[Location, StoredFileInfo]]
1875 Each item in the sequence is the location of the dataset within the
1876 datastore and stored information about the file and its formatter.
1877 If there is only one item in the sequence then it is treated as the
1878 primary URI. If there is more than one item then they are treated
1879 as component URIs. If there are no items then an error is raised
1880 unless ``self.trustGetRequest`` is `True`.
1882 Returns
1883 -------
1884 uris : `DatasetRefURIs`
1885 Represents the primary URI or component URIs described by the
1886 inputs.
1888 Raises
1889 ------
1890 RuntimeError
1891 If no file locations are passed in and ``self.trustGetRequest`` is
1892 `False`.
1893 FileNotFoundError
1894 If a passed-in URI does not exist, and ``self.trustGetRequest``
1895 is `False`.
1896 RuntimeError
1897 If a passed-in `StoredFileInfo`'s ``component`` is `None` (this is
1898 unexpected).
1899 """
1900 guessing = False
1901 uris = DatasetRefURIs()
1903 if not file_locations:
1904 if not self.trustGetRequest:
1905 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1906 file_locations = self._get_expected_dataset_locations_info(ref)
1907 guessing = True
1909 if len(file_locations) == 1:
1910 # No disassembly so this is the primary URI
1911 uris.primaryURI = file_locations[0][0].uri
1912 if guessing and not uris.primaryURI.exists():
1913 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1914 else:
1915 for location, file_info in file_locations:
1916 if file_info.component is None:
1917 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1918 if guessing and not location.uri.exists():
1919 # If we are trusting then it is entirely possible for
1920 # some components to be missing. In that case we skip
1921 # to the next component.
1922 if self.trustGetRequest:
1923 continue
1924 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1925 uris.componentURIs[file_info.component] = location.uri
1927 return uris
1929 def retrieveArtifacts(
1930 self,
1931 refs: Iterable[DatasetRef],
1932 destination: ResourcePath,
1933 transfer: str = "auto",
1934 preserve_path: bool = True,
1935 overwrite: bool = False,
1936 ) -> list[ResourcePath]:
1937 """Retrieve the file artifacts associated with the supplied refs.
1939 Parameters
1940 ----------
1941 refs : iterable of `DatasetRef`
1942 The datasets for which file artifacts are to be retrieved.
1943 A single ref can result in multiple files. The refs must
1944 be resolved.
1945 destination : `lsst.resources.ResourcePath`
1946 Location to write the file artifacts.
1947 transfer : `str`, optional
1948 Method to use to transfer the artifacts. Must be one of the options
1949 supported by `lsst.resources.ResourcePath.transfer_from()`.
1950 "move" is not allowed.
1951 preserve_path : `bool`, optional
1952 If `True` the full path of the file artifact within the datastore
1953 is preserved. If `False` the final file component of the path
1954 is used.
1955 overwrite : `bool`, optional
1956 If `True` allow transfers to overwrite existing files at the
1957 destination.
1959 Returns
1960 -------
1961 targets : `list` of `lsst.resources.ResourcePath`
1962 URIs of file artifacts in destination location. Order is not
1963 preserved.
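Examples
--------
A sketch under assumptions: ``datastore`` and ``refs`` are illustrative,
and the destination path is hypothetical::

    from lsst.resources import ResourcePath

    destination = ResourcePath("/tmp/artifacts/", forceDirectory=True)
    copied = datastore.retrieveArtifacts(refs, destination, transfer="copy")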
1964 """
1965 if not destination.isdir():
1966 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1968 if transfer == "move":
1969 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1971 # Source -> Destination
1972 # This also helps filter out duplicate DatasetRef in the request
1973 # that will map to the same underlying file transfer.
1974 to_transfer: dict[ResourcePath, ResourcePath] = {}
1976 for ref in refs:
1977 locations = self._get_dataset_locations_info(ref)
1978 for location, _ in locations:
1979 source_uri = location.uri
1980 target_path: ResourcePathExpression
1981 if preserve_path:
1982 target_path = location.pathInStore
1983 if target_path.isabs():
1984 # This is an absolute path to an external file.
1985 # Use the full path.
1986 target_path = target_path.relativeToPathRoot
1987 else:
1988 target_path = source_uri.basename()
1989 target_uri = destination.join(target_path)
1990 to_transfer[source_uri] = target_uri
1992 # In theory can now parallelize the transfer
1993 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1994 for source_uri, target_uri in to_transfer.items():
1995 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1997 return list(to_transfer.values())
1999 def get(
2000 self,
2001 ref: DatasetRef,
2002 parameters: Mapping[str, Any] | None = None,
2003 storageClass: StorageClass | str | None = None,
2004 ) -> Any:
2005 """Load an InMemoryDataset from the store.
2007 Parameters
2008 ----------
2009 ref : `DatasetRef`
2010 Reference to the required Dataset.
2011 parameters : `dict`
2012 `StorageClass`-specific parameters that specify, for example,
2013 a slice of the dataset to be loaded.
2014 storageClass : `StorageClass` or `str`, optional
2015 The storage class to be used to override the Python type
2016 returned by this method. By default the returned type matches
2017 the dataset type definition for this dataset. Specifying a
2018 read `StorageClass` can force a different type to be returned.
2019 This type must be compatible with the original type.
2021 Returns
2022 -------
2023 inMemoryDataset : `object`
2024 Requested dataset or slice thereof as an InMemoryDataset.
2026 Raises
2027 ------
2028 FileNotFoundError
2029 Requested dataset can not be retrieved.
2030 TypeError
2031 Return value from formatter has unexpected type.
2032 ValueError
2033 Formatter failed to process the dataset.
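Examples
--------
A minimal sketch; ``datastore`` and ``ref`` are assumed, and the storage
class name shown is purely illustrative::

    python_object = datastore.get(ref)
    # Optionally force a compatible read storage class:
    # converted = datastore.get(ref, storageClass="StructuredDataDict")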
2034 """
2035 # Supplied storage class for the component being read is either
2036 # from the ref itself or an override if we want to force
2037 # type conversion.
2038 if storageClass is not None:
2039 ref = ref.overrideStorageClass(storageClass)
2041 allGetInfo = self._prepare_for_direct_get(ref, parameters)
2042 return get_dataset_as_python_object_from_get_info(
2043 allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager
2044 )
2046 def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload:
2047 # Docstring inherited
2049 # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the
2050 # client should have time to download a large file with retries if
2051 # needed, but short enough that it will become obvious quickly that
2052 # these URLs expire.
2053 # From a strictly technical standpoint there is no reason this
2054 # shouldn't be a day or more, but there seems to be a political issue
2055 # where people think there is a risk of end users posting presigned
2056 # URLs for people without access rights to download.
2057 url_expiration_time_seconds = 1 * 60 * 60
2059 def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo:
2060 location, file_info = info
2061 return FileDatastoreGetPayloadFileInfo(
2062 url=location.uri.generate_presigned_get_url(
2063 expiration_time_seconds=url_expiration_time_seconds
2064 ),
2065 datastoreRecords=file_info.to_simple(),
2066 )
2068 return FileDatastoreGetPayload(
2069 datastore_type="file",
2070 dataset_ref=ref.to_simple(),
2071 file_info=[to_file_info_payload(info) for info in self._get_dataset_locations_info(ref)],
2072 )
2074 @transactional
2075 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2076 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2078 Parameters
2079 ----------
2080 inMemoryDataset : `object`
2081 The dataset to store.
2082 ref : `DatasetRef`
2083 Reference to the associated Dataset.
2085 Raises
2086 ------
2087 TypeError
2088 Supplied object and storage class are inconsistent.
2089 DatasetTypeNotSupportedError
2090 The associated `DatasetType` is not handled by this datastore.
2092 Notes
2093 -----
2094 If the datastore is configured to reject certain dataset types it
2095 is possible that the put will fail and raise a
2096 `DatasetTypeNotSupportedError`. The main use case for this is to
2097 allow `ChainedDatastore` to put to multiple datastores without
2098 requiring that every datastore accepts the dataset.
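Examples
--------
A hedged sketch; ``datastore``, ``in_memory_dataset`` and ``ref`` are
assumed to exist and to be mutually consistent::

    datastore.put(in_memory_dataset, ref)
    assert datastore.exists(ref)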
2099 """
2100 doDisassembly = self.composites.shouldBeDisassembled(ref)
2101 # doDisassembly = True
2103 artifacts = []
2104 if doDisassembly:
2105 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2106 if components is None:
2107 raise RuntimeError(
2108 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2109 f"with storage class {ref.datasetType.storageClass.name} "
2110 "is configured to be disassembled, but cannot be."
2111 )
2112 for component, componentInfo in components.items():
2113 # Don't recurse because we want to take advantage of
2114 # bulk insert -- need a new DatasetRef that refers to the
2115 # same dataset_id but has the component DatasetType.
2116 # DatasetType does not refer to the types of its components,
2117 # so we construct one ourselves.
2118 compRef = ref.makeComponentRef(component)
2119 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2120 artifacts.append((compRef, storedInfo))
2121 else:
2122 # Write the entire thing out
2123 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2124 artifacts.append((ref, storedInfo))
2126 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT)
2128 @transactional
2129 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
2130 doDisassembly = self.composites.shouldBeDisassembled(ref)
2131 # doDisassembly = True
2133 artifacts = []
2134 if doDisassembly:
2135 components = ref.datasetType.storageClass.delegate().disassemble(in_memory_dataset)
2136 if components is None:
2137 raise RuntimeError(
2138 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2139 f"with storage class {ref.datasetType.storageClass.name} "
2140 "is configured to be disassembled, but cannot be."
2141 )
2142 for component, componentInfo in components.items():
2143 # Don't recurse because we want to take advantage of
2144 # bulk insert -- need a new DatasetRef that refers to the
2145 # same dataset_id but has the component DatasetType.
2146 # DatasetType does not refer to the types of its components,
2147 # so we construct one ourselves.
2148 compRef = ref.makeComponentRef(component)
2149 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2150 artifacts.append((compRef, storedInfo))
2151 else:
2152 # Write the entire thing out
2153 storedInfo = self._write_in_memory_to_artifact(in_memory_dataset, ref)
2154 artifacts.append((ref, storedInfo))
2156 ref_records = {self._opaque_table_name: [info for _, info in artifacts]}
2157 ref = ref.replace(datastore_records=ref_records)
2158 return {self.name: ref}
2160 @transactional
2161 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
2162 # At this point can safely remove these datasets from the cache
2163 # to avoid confusion later on. If they are not trashed later
2164 # the cache will simply be refilled.
2165 self.cacheManager.remove_from_cache(ref)
2167 # If we are in trust mode there will be nothing to move to
2168 # the trash table and we will have to try to delete the file
2169 # immediately.
2170 if self.trustGetRequest:
2171 # Try to keep the logic below for a single file trash.
2172 if isinstance(ref, DatasetRef):
2173 refs = {ref}
2174 else:
2175 # Will recreate ref at the end of this branch.
2176 refs = set(ref)
2178 # Determine which datasets are known to datastore directly.
2179 id_to_ref = {ref.id: ref for ref in refs}
2180 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2181 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2183 missing = refs - existing_refs
2184 if missing:
2185 # Do an explicit existence check on these refs.
2186 # We only care about the artifacts at this point and not
2187 # the dataset existence.
2188 artifact_existence: dict[ResourcePath, bool] = {}
2189 _ = self.mexists(missing, artifact_existence)
2190 uris = [uri for uri, exists in artifact_existence.items() if exists]
2192 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2193 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2194 for uri in uris:
2195 try:
2196 uri.remove()
2197 except Exception as e:
2198 if ignore_errors:
2199 log.debug("Artifact %s could not be removed: %s", uri, e)
2200 continue
2201 raise
2203 # There is no point asking the code below to remove refs we
2204 # know are missing so update it with the list of existing
2205 # records. Try to retain one vs many logic.
2206 if not existing_refs:
2207 # Nothing more to do since none of the datasets were
2208 # known to the datastore record table.
2209 return
2210 ref = list(existing_refs)
2211 if len(ref) == 1:
2212 ref = ref[0]
2214 # Get file metadata and internal metadata
2215 if not isinstance(ref, DatasetRef):
2216 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2217 # Assumed to be an iterable of refs so bulk mode enabled.
2218 try:
2219 self.bridge.moveToTrash(ref, transaction=self._transaction)
2220 except Exception as e:
2221 if ignore_errors:
2222 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2223 else:
2224 raise
2225 return
2227 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2229 fileLocations = self._get_dataset_locations_info(ref)
2231 if not fileLocations:
2232 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2233 if ignore_errors:
2234 log.warning(err_msg)
2235 return
2236 else:
2237 raise FileNotFoundError(err_msg)
2239 for location, _ in fileLocations:
2240 if not self._artifact_exists(location):
2241 err_msg = (
2242 f"Dataset is known to datastore {self.name} but "
2243 f"associated artifact ({location.uri}) is missing"
2244 )
2245 if ignore_errors:
2246 log.warning(err_msg)
2247 return
2248 else:
2249 raise FileNotFoundError(err_msg)
2251 # Mark dataset as trashed
2252 try:
2253 self.bridge.moveToTrash([ref], transaction=self._transaction)
2254 except Exception as e:
2255 if ignore_errors:
2256 log.warning(
2257 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2258 "but encountered an error: %s",
2259 ref,
2260 self.name,
2261 e,
2262 )
2263 pass
2264 else:
2265 raise
2267 @transactional
2268 def emptyTrash(self, ignore_errors: bool = True) -> None:
2269 """Remove all datasets from the trash.
2271 Parameters
2272 ----------
2273 ignore_errors : `bool`
2274 If `True` return without error even if something went wrong.
2275 Problems could occur if another process is simultaneously trying
2276 to delete.
2277 """
2278 log.debug("Emptying trash in datastore %s", self.name)
2280 # Context manager will empty trash iff we finish it without raising.
2281 # It will also automatically delete the relevant rows from the
2282 # trash table and the records table.
2283 with self.bridge.emptyTrash(
2284 self._table, record_class=StoredFileInfo, record_column="path"
2285 ) as trash_data:
2286 # Removing the artifacts themselves requires that the files are
2287 # not also associated with refs that are not to be trashed.
2288 # Therefore need to do a query with the file paths themselves
2289 # and return all the refs associated with them. Can only delete
2290 # a file if the refs to be trashed are the only refs associated
2291 # with the file.
2292 # This requires multiple copies of the trashed items
2293 trashed, artifacts_to_keep = trash_data
2295 if artifacts_to_keep is None:
2296 # The bridge is not helping us so have to work it out
2297 # ourselves. This is not going to be as efficient.
2298 trashed = list(trashed)
2300 # The instance check is for mypy since up to this point it
2301 # does not know the type of info.
2302 path_map = self._refs_associated_with_artifacts(
2303 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2304 )
2306 for ref, info in trashed:
2307 # Mypy needs to know this is not the base class
2308 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2310 path_map[info.path].remove(ref.id)
2311 if not path_map[info.path]:
2312 del path_map[info.path]
2314 artifacts_to_keep = set(path_map)
2316 for ref, info in trashed:
2317 # Should not happen for this implementation but need
2318 # to keep mypy happy.
2319 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2321 # Mypy needs to know this is not the base class
2322 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2324 if info.path in artifacts_to_keep:
2325 # This is a multi-dataset artifact and we are not
2326 # removing all associated refs.
2327 continue
2329 # Only trashed refs still known to datastore will be returned.
2330 location = info.file_location(self.locationFactory)
2332 # Point of no return for this artifact
2333 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2334 try:
2335 self._delete_artifact(location)
2336 except FileNotFoundError:
2337 # If the file itself has been deleted there is nothing
2338 # we can do about it. It is possible that trash has
2339 # been run in parallel in another process or someone
2340 # decided to delete the file. It is unlikely to come
2341 # back and so we should still continue with the removal
2342 # of the entry from the trash table. It is also possible
2343 # we removed it in a previous iteration if it was
2344 # a multi-dataset artifact. The delete artifact method
2345 # will log a debug message in this scenario.
2346 # Distinguishing a file that was missing before trash started
2347 # from a file already removed earlier in this trash operation
2348 # is not worth the potential memory cost of tracking the
2349 # difference.
2350 pass
2351 except Exception as e:
2352 if ignore_errors:
2353 # Use a debug message here even though it's not
2354 # a good situation. In some cases this can be
2355 # caused by a race between user A and user B
2356 # and neither of them has permissions for the
2357 # other's files. Butler does not know about users
2358 # and trash has no idea what collections these
2359 # files were in (without guessing from a path).
2360 log.debug(
2361 "Encountered error removing artifact %s from datastore %s: %s",
2362 location.uri,
2363 self.name,
2364 e,
2365 )
2366 else:
2367 raise
2369 @transactional
2370 def transfer_from(
2371 self,
2372 source_datastore: Datastore,
2373 refs: Collection[DatasetRef],
2374 transfer: str = "auto",
2375 artifact_existence: dict[ResourcePath, bool] | None = None,
2376 dry_run: bool = False,
2377 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2378 # Docstring inherited
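# A hedged usage sketch (``target_datastore``, ``source_datastore`` and
# ``refs`` are illustrative names, not defined here); "copy" is one of
# the transfer modes supported by lsst.resources:
#
#     accepted, rejected = target_datastore.transfer_from(
#         source_datastore, refs, transfer="copy"
#     )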
2379 if type(self) is not type(source_datastore):
2380 raise TypeError(
2381 f"Datastore mismatch between this datastore ({type(self)}) and the "
2382 f"source datastore ({type(source_datastore)})."
2383 )
2385 # Be explicit for mypy
2386 if not isinstance(source_datastore, FileDatastore):
2387 raise TypeError(
2388 "Can only transfer to a FileDatastore from another FileDatastore, not"
2389 f" {type(source_datastore)}"
2390 )
2392 # Stop early if "direct" transfer mode is requested. That would
2393 # require that the URI inside the source datastore should be stored
2394 # directly in the target datastore, which seems unlikely to be useful
2395 # since at any moment the source datastore could delete the file.
2396 if transfer in ("direct", "split"):
2397 raise ValueError(
2398 f"Can not transfer from a source datastore using {transfer} mode since"
2399 " those files are controlled by the other datastore."
2400 )
2402 # Empty existence lookup if none given.
2403 if artifact_existence is None:
2404 artifact_existence = {}
2406 # In order to handle disassembled composites the code works
2407 # at the records level since it can assume that internal APIs
2408 # can be used.
2409 # - If the record already exists in the destination this is assumed
2410 # to be okay.
2411 # - If there is no record but the source and destination URIs are
2412 # identical no transfer is done but the record is added.
2413 # - If the source record refers to an absolute URI currently assume
2414 # that that URI should remain absolute and will be visible to the
2415 # destination butler. May need to have a flag to indicate whether
2416 # the dataset should be transferred. This will only happen if
2417 # the detached Butler has had a local ingest.
2419 # What we really want is all the records in the source datastore
2420 # associated with these refs. Or derived ones if they don't exist
2421 # in the source.
2422 source_records = source_datastore._get_stored_records_associated_with_refs(
2423 refs, ignore_datastore_records=True
2424 )
2426 # The source dataset_ids are the keys in these records
2427 source_ids = set(source_records)
2428 log.debug("Number of datastore records found in source: %d", len(source_ids))
2430 requested_ids = {ref.id for ref in refs}
2431 missing_ids = requested_ids - source_ids
2433 # Missing IDs can be okay if that datastore has allowed
2434 # gets based on file existence. Should we transfer what we can
2435 # or complain about it and warn?
2436 if missing_ids and not source_datastore.trustGetRequest:
2437 raise ValueError(
2438 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2439 )
2441 # Need to map these missing IDs to a DatasetRef so we can guess
2442 # the details.
2443 if missing_ids:
2444 log.info(
2445 "Number of expected datasets missing from source datastore records: %d out of %d",
2446 len(missing_ids),
2447 len(requested_ids),
2448 )
2449 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2451 # This should be chunked in case we end up having to check
2452 # the file store since we need some log output to show
2453 # progress.
2454 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2455 records = {}
2456 for missing in missing_ids_chunk:
2457 # Ask the source datastore where the missing artifacts
2458 # should be. An execution butler might not know about the
2459 # artifacts even if they are there.
2460 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2461 records[missing] = [info for _, info in expected]
2463 # Call the mexists helper method in case we have not already
2464 # checked these artifacts such that artifact_existence is
2465 # empty. This allows us to benefit from parallelism.
2466 # datastore.mexists() itself does not give us access to the
2467 # derived datastore record.
2468 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2469 ref_exists = source_datastore._process_mexists_records(
2470 id_to_ref, records, False, artifact_existence=artifact_existence
2471 )
2473 # Now go through the records and propagate the ones that exist.
2474 location_factory = source_datastore.locationFactory
2475 for missing, record_list in records.items():
2476 # Skip completely if the ref does not exist.
2477 ref = id_to_ref[missing]
2478 if not ref_exists[ref]:
2479 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2480 continue
2481 # Check for file artifact existence to decide which parts of a
2482 # disassembled composite do exist. If there is only a
2483 # single record we don't even need to look because it can't
2484 # be a composite and must exist.
2485 if len(record_list) == 1:
2486 dataset_records = record_list
2487 else:
2488 dataset_records = [
2489 record
2490 for record in record_list
2491 if artifact_existence[record.file_location(location_factory).uri]
2492 ]
2493 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2495 # Rely on source_records being a defaultdict.
2496 source_records[missing].extend(dataset_records)
2497 log.verbose("Completed scan for missing data files")
2499 # See if we already have these records
2500 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
2502 # The artifacts to register
2503 artifacts = []
2505 # Refs that already exist
2506 already_present = []
2508 # Refs that were rejected by this datastore.
2509 rejected = set()
2511 # Refs that were transferred successfully.
2512 accepted = set()
2514 # Record each time we have done a "direct" transfer.
2515 direct_transfers = []
2517 # Now can transfer the artifacts
2518 for ref in refs:
2519 if not self.constraints.isAcceptable(ref):
2520 # This datastore should not be accepting this dataset.
2521 rejected.add(ref)
2522 continue
2524 accepted.add(ref)
2526 if ref.id in target_records:
2527 # Already have an artifact for this.
2528 already_present.append(ref)
2529 continue
2531 # mypy needs to know these are always resolved refs
2532 for info in source_records[ref.id]:
2533 source_location = info.file_location(source_datastore.locationFactory)
2534 target_location = info.file_location(self.locationFactory)
2535 if source_location == target_location and not source_location.pathInStore.isabs():
2536 # Artifact is already in the target location.
2537 # (which is how execution butler currently runs)
2538 pass
2539 else:
2540 if target_location.pathInStore.isabs():
2541 # Just because we can see the artifact when running
2542 # the transfer doesn't mean it will be generally
2543 # accessible to a user of this butler. Need to decide
2544 # what to do about an absolute path.
2545 if transfer == "auto":
2546 # For "auto" transfers we allow the absolute URI
2547 # to be recorded in the target datastore.
2548 direct_transfers.append(source_location)
2549 else:
2550 # The user is explicitly requesting a transfer
2551 # even for an absolute URI. This requires us to
2552 # calculate the target path.
2553 template_ref = ref
2554 if info.component:
2555 template_ref = ref.makeComponentRef(info.component)
2556 target_location = self._calculate_ingested_datastore_name(
2557 source_location.uri,
2558 template_ref,
2559 )
2561 info = info.update(path=target_location.pathInStore.path)
2563 # Need to transfer it to the new location.
2564 # Assume we should always overwrite. If the artifact
2565 # is there this might indicate that a previous transfer
2566 # was interrupted but was not able to be rolled back
2567 # completely (e.g. pre-emption) so follow the Datastore default
2568 # and overwrite. Do not copy if we are in dry-run mode.
2569 if not dry_run:
2570 target_location.uri.transfer_from(
2571 source_location.uri,
2572 transfer=transfer,
2573 overwrite=True,
2574 transaction=self._transaction,
2575 )
2577 artifacts.append((ref, info))
2579 if direct_transfers:
2580 log.info(
2581 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2582 len(direct_transfers),
2583 "" if len(direct_transfers) == 1 else "s",
2584 )
2586 # We are overwriting previous datasets that may have already
2587 # existed. We therefore should ensure that we force the
2588 # datastore records to agree. Note that this can potentially lead
2589 # to difficulties if the dataset has previously been ingested
2590 # disassembled and is somehow now assembled, or vice versa.
2591 if not dry_run:
2592 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE)
2594 if already_present:
2595 n_skipped = len(already_present)
2596 log.info(
2597 "Skipped transfer of %d dataset%s already present in datastore",
2598 n_skipped,
2599 "" if n_skipped == 1 else "s",
2600 )
2602 return accepted, rejected
2604 @transactional
2605 def forget(self, refs: Iterable[DatasetRef]) -> None:
2606 # Docstring inherited.
2607 refs = list(refs)
2608 self.bridge.forget(refs)
2609 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs])
2611 def validateConfiguration(
2612 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
2613 ) -> None:
2614 """Validate some of the configuration for this datastore.
2616 Parameters
2617 ----------
2618 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2619 Entities to test against this configuration. Can be differing
2620 types.
2621 logFailures : `bool`, optional
2622 If `True`, output a log message for every validation error
2623 detected.
2625 Raises
2626 ------
2627 DatastoreValidationError
2628 Raised if there is a validation problem with a configuration.
2629 All the problems are reported in a single exception.
2631 Notes
2632 -----
2633 This method checks that all the supplied entities have valid file
2634 templates and also have formatters defined.
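Examples
--------
A minimal sketch; ``datastore`` and ``dataset_type`` are assumptions for
this illustration::

    try:
        datastore.validateConfiguration([dataset_type], logFailures=True)
    except DatastoreValidationError as err:
        print(f"Configuration problems: {err}")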
2635 """
2636 templateFailed = None
2637 try:
2638 self.templates.validateTemplates(entities, logFailures=logFailures)
2639 except FileTemplateValidationError as e:
2640 templateFailed = str(e)
2642 formatterFailed = []
2643 for entity in entities:
2644 try:
2645 self.formatterFactory.getFormatterClass(entity)
2646 except KeyError as e:
2647 formatterFailed.append(str(e))
2648 if logFailures:
2649 log.critical("Formatter failure: %s", e)
2651 if templateFailed or formatterFailed:
2652 messages = []
2653 if templateFailed:
2654 messages.append(templateFailed)
2655 if formatterFailed:
2656 messages.append(",".join(formatterFailed))
2657 msg = ";\n".join(messages)
2658 raise DatastoreValidationError(msg)
2660 def getLookupKeys(self) -> set[LookupKey]:
2661 # Docstring is inherited from base class
2662 return (
2663 self.templates.getLookupKeys()
2664 | self.formatterFactory.getLookupKeys()
2665 | self.constraints.getLookupKeys()
2666 )
2668 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
2669 # Docstring is inherited from base class
2670 # The key can be valid in either formatters or templates so we can
2671 # only check the template if it exists
2672 if lookupKey in self.templates:
2673 try:
2674 self.templates[lookupKey].validateTemplate(entity)
2675 except FileTemplateValidationError as e:
2676 raise DatastoreValidationError(e) from e
2678 def export(
2679 self,
2680 refs: Iterable[DatasetRef],
2681 *,
2682 directory: ResourcePathExpression | None = None,
2683 transfer: str | None = "auto",
2684 ) -> Iterable[FileDataset]:
2685 # Docstring inherited from Datastore.export.
2686 if transfer == "auto" and directory is None:
2687 transfer = None
2689 if transfer is not None and directory is None:
2690 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2692 if transfer == "move":
2693 raise TypeError("Can not export by moving files out of datastore.")
2694 elif transfer == "direct":
2695 # For an export, treat this as equivalent to None. We do not
2696 # want an import to risk using absolute URIs to datasets owned
2697 # by another datastore.
2698 log.info("Treating 'direct' transfer mode as in-place export.")
2699 transfer = None
2701 # Force the directory to be a URI object
2702 directoryUri: ResourcePath | None = None
2703 if directory is not None:
2704 directoryUri = ResourcePath(directory, forceDirectory=True)
2706 if transfer is not None and directoryUri is not None and not directoryUri.exists():
2707 # mypy needs the second test
2708 raise FileNotFoundError(f"Export location {directory} does not exist")
2710 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2711 for ref in progress.wrap(refs, "Exporting dataset files"):
2712 fileLocations = self._get_dataset_locations_info(ref)
2713 if not fileLocations:
2714 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2715 # For now we can not export disassembled datasets
2716 if len(fileLocations) > 1:
2717 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2718 location, storedFileInfo = fileLocations[0]
2720 pathInStore = location.pathInStore.path
2721 if transfer is None:
2722 # TODO: do we also need to return the readStorageClass somehow?
2723 # We will use the path in store directly. If this is an
2724 # absolute URI, preserve it.
2725 if location.pathInStore.isabs():
2726 pathInStore = str(location.uri)
2727 elif transfer == "direct":
2728 # Use full URIs to the remote store in the export
2729 pathInStore = str(location.uri)
2730 else:
2731 # mypy needs help
2732 assert directoryUri is not None, "directoryUri must be defined to get here"
2733 storeUri = ResourcePath(location.uri, forceDirectory=False)
2735 # if the datastore has an absolute URI to a resource, we
2736 # have two options:
2737 # 1. Keep the absolute URI in the exported YAML
2738 # 2. Allocate a new name in the local datastore and transfer
2739 # it.
2740 # For now go with option 2
2741 if location.pathInStore.isabs():
2742 template = self.templates.getTemplate(ref)
2743 newURI = ResourcePath(template.format(ref), forceAbsolute=False, forceDirectory=False)
2744 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2746 exportUri = directoryUri.join(pathInStore)
2747 exportUri.transfer_from(storeUri, transfer=transfer)
2749 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2751 @staticmethod
2752 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None:
2753 """Compute the checksum of the supplied file.
2755 Parameters
2756 ----------
2757 uri : `lsst.resources.ResourcePath`
2758 Name of resource to calculate checksum from.
2759 algorithm : `str`, optional
2760 Name of algorithm to use. Must be one of the algorithms supported
2761 by :py:mod:`hashlib`.
2762 block_size : `int`
2763 Number of bytes to read from file at one time.
2765 Returns
2766 -------
2767 hexdigest : `str`
2768 Hex digest of the file.
2770 Notes
2771 -----
2772 Currently returns `None` if the URI is for a remote resource.
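Examples
--------
A sketch assuming a local file exists at the (hypothetical) path shown::

    from lsst.resources import ResourcePath

    digest = FileDatastore.computeChecksum(
        ResourcePath("/tmp/example.fits"), algorithm="md5"
    )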
2773 """
2774 if algorithm not in hashlib.algorithms_guaranteed:
2775 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2777 if not uri.isLocal:
2778 return None
2780 hasher = hashlib.new(algorithm)
2782 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f:
2783 for chunk in iter(lambda: f.read(block_size), b""):
2784 hasher.update(chunk)
2786 return hasher.hexdigest()
2788 def needs_expanded_data_ids(
2789 self,
2790 transfer: str | None,
2791 entity: DatasetRef | DatasetType | StorageClass | None = None,
2792 ) -> bool:
2793 # Docstring inherited.
2794 # This _could_ also use entity to inspect whether the filename template
2795 # involves placeholders other than the required dimensions for its
2796 # dataset type, but that's not necessary for correctness; it just
2797 # enables more optimizations (perhaps only in theory).
2798 return transfer not in ("direct", None)
2800 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2801 # Docstring inherited from the base class.
2802 record_data = data.get(self.name)
2803 if not record_data:
2804 return
2806 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records)
2808 # TODO: Verify that there are no unexpected table names in the dict?
2809 unpacked_records = []
2810 for dataset_id, dataset_data in record_data.records.items():
2811 records = dataset_data.get(self._table.name)
2812 if records:
2813 for info in records:
2814 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2815 unpacked_records.append(info.to_record(dataset_id=dataset_id))
2816 if unpacked_records:
2817 self._table.insert(*unpacked_records, transaction=self._transaction)
2819 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2820 # Docstring inherited from the base class.
2821 exported_refs = list(self._bridge.check(refs))
2822 ids = {ref.id for ref in exported_refs}
2823 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
2824 for row in self._table.fetch(dataset_id=ids):
2825 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2826 dataset_records = records.setdefault(row["dataset_id"], {})
2827 dataset_records.setdefault(self._table.name, []).append(info)
2829 record_data = DatastoreRecordData(records=records)
2830 return {self.name: record_data}
2832 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
2833 # Docstring inherited from the base class.
2834 self._retrieve_dataset_method = method
2836 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
2837 """Update dataset reference to use the storage class from registry."""
2838 if self._retrieve_dataset_method is None:
2839 # We could raise an exception here but unit tests do not define
2840 # this method.
2841 return ref
2842 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
2843 if dataset_type is not None:
2844 ref = ref.overrideStorageClass(dataset_type.storageClass)
2845 return ref
2847 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
2848 # Docstring inherited from the base class.
2849 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}