Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 85%
928 statements
coverage.py v6.5.0, created at 2022-11-29 02:00 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 Any,
34 ClassVar,
35 Dict,
36 Iterable,
37 List,
38 Mapping,
39 Optional,
40 Sequence,
41 Set,
42 Tuple,
43 Type,
44 Union,
45)
47from lsst.daf.butler import (
48 CompositesMap,
49 Config,
50 DatasetId,
51 DatasetRef,
52 DatasetRefURIs,
53 DatasetType,
54 DatasetTypeNotSupportedError,
55 Datastore,
56 DatastoreCacheManager,
57 DatastoreConfig,
58 DatastoreDisabledCacheManager,
59 DatastoreRecordData,
60 DatastoreValidationError,
61 FileDataset,
62 FileDescriptor,
63 FileTemplates,
64 FileTemplateValidationError,
65 Formatter,
66 FormatterFactory,
67 Location,
68 LocationFactory,
69 Progress,
70 StorageClass,
71 StoredDatastoreItemInfo,
72 StoredFileInfo,
73 ddl,
74)
75from lsst.daf.butler.core.repoRelocation import replaceRoot
76from lsst.daf.butler.core.utils import transactional
77from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
78from lsst.resources import ResourcePath, ResourcePathExpression
79from lsst.utils.introspection import get_class_of, get_instance_of
80from lsst.utils.iteration import chunk_iterable
82# For VERBOSE logging usage.
83from lsst.utils.logging import VERBOSE, getLogger
84from lsst.utils.timer import time_this
85from sqlalchemy import BigInteger, String
87from ..registry.interfaces import FakeDatasetRef
88from .genericDatastore import GenericBaseDatastore
90if TYPE_CHECKING: 90 ↛ 91line 90 didn't jump to line 91, because the condition on line 90 was never true
91 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
92 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
94log = getLogger(__name__)
97class _IngestPrepData(Datastore.IngestPrepData):
98 """Helper class for FileDatastore ingest implementation.
100 Parameters
101 ----------
102 datasets : `list` of `FileDataset`
103 Files to be ingested by this datastore.
104 """
106 def __init__(self, datasets: List[FileDataset]):
107 super().__init__(ref for dataset in datasets for ref in dataset.refs)
108 self.datasets = datasets
111@dataclass(frozen=True)
112class DatastoreFileGetInformation:
113 """Collection of useful parameters needed to retrieve a file from
114 a Datastore.
115 """
117 location: Location
118 """The location from which to read the dataset."""
120 formatter: Formatter
121 """The `Formatter` to use to deserialize the dataset."""
123 info: StoredFileInfo
124 """Stored information about this file and its formatter."""
126 assemblerParams: Mapping[str, Any]
127 """Parameters to use for post-processing the retrieved dataset."""
129 formatterParams: Mapping[str, Any]
130 """Parameters that were understood by the associated formatter."""
132 component: Optional[str]
133 """The component to be retrieved (can be `None`)."""
135 readStorageClass: StorageClass
136 """The `StorageClass` of the dataset being read."""
139class FileDatastore(GenericBaseDatastore):
140 """Generic Datastore for file-based implementations.
142 Should always be sub-classed since key abstract methods are missing.
144 Parameters
145 ----------
146 config : `DatastoreConfig` or `str`
147 Configuration as either a `Config` object or URI to file.
148 bridgeManager : `DatastoreRegistryBridgeManager`
149 Object that manages the interface between `Registry` and datastores.
150 butlerRoot : `str`, optional
151 New datastore root to use to override the configuration value.
153 Raises
154 ------
155 ValueError
156 If root location does not exist and ``create`` is `False` in the
157 configuration.
158 """
160 defaultConfigFile: ClassVar[Optional[str]] = None
161 """Path to configuration defaults. Accessed within the ``config`` resource
162 or relative to a search path. Can be None if no defaults specified.
163 """
165 root: ResourcePath
166 """Root directory URI of this `Datastore`."""
168 locationFactory: LocationFactory
169 """Factory for creating locations relative to the datastore root."""
171 formatterFactory: FormatterFactory
172 """Factory for creating instances of formatters."""
174 templates: FileTemplates
175 """File templates that can be used by this `Datastore`."""
177 composites: CompositesMap
178 """Determines whether a dataset should be disassembled on put."""
180 defaultConfigFile = "datastores/fileDatastore.yaml"
181 """Path to configuration defaults. Accessed within the ``config`` resource
182 or relative to a search path. Can be None if no defaults specified.
183 """
185 @classmethod
186 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
187 """Set any filesystem-dependent config options for this Datastore to
188 be appropriate for a new empty repository with the given root.
190 Parameters
191 ----------
192 root : `str`
193 URI to the root of the data repository.
194 config : `Config`
195 A `Config` to update. Only the subset understood by
196 this component will be updated. Will not expand
197 defaults.
198 full : `Config`
199 A complete config with all defaults expanded that can be
200 converted to a `DatastoreConfig`. Read-only and will not be
201 modified by this method.
202 Repository-specific options that should not be obtained
203 from defaults when Butler instances are constructed
204 should be copied from ``full`` to ``config``.
205 overwrite : `bool`, optional
206 If `False`, do not modify a value in ``config`` if the value
207 already exists. Default is always to overwrite with the provided
208 ``root``.
210 Notes
211 -----
212 If a keyword is explicitly defined in the supplied ``config`` it
213 will not be overridden by this method if ``overwrite`` is `False`.
214 This allows explicit values set in external configs to be retained.
215 """
216 Config.updateParameters(
217 DatastoreConfig,
218 config,
219 full,
220 toUpdate={"root": root},
221 toCopy=("cls", ("records", "table")),
222 overwrite=overwrite,
223 )
225 @classmethod
226 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
227 return ddl.TableSpec(
228 fields=[
229 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
230 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
231 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
232 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
233 # Use empty string to indicate no component
234 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
235 # TODO: should checksum be Base64Bytes instead?
236 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
237 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
238 ],
239 unique=frozenset(),
240 indexes=[ddl.IndexSpec("path")],
241 )
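# Editor's sketch (not part of the original module): one record row shaped by
# the table spec above. Only the column names come from makeTableSpec(); every
# value, including the formatter and storage class names, is hypothetical.
_example_record = {
    "dataset_id": 42,  # primary key; int or UUID depending on datasetIdColumnType
    "path": "run/datasetType/datasetType_v1.fits",  # relative to the datastore root
    "formatter": "some.package.SomeFormatter",
    "storage_class": "SomeStorageClass",
    "component": "",  # empty string means "no component" (see comment in the spec)
    "checksum": None,  # only populated when the "checksum" config option is enabled
    "file_size": 4096,
}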
243 def __init__(
244 self,
245 config: Union[DatastoreConfig, str],
246 bridgeManager: DatastoreRegistryBridgeManager,
247 butlerRoot: str | None = None,
248 ):
249 super().__init__(config, bridgeManager)
250 if "root" not in self.config: 250 ↛ 251line 250 didn't jump to line 251, because the condition on line 250 was never true
251 raise ValueError("No root directory specified in configuration")
253 self._bridgeManager = bridgeManager
255 # Name ourselves either using an explicit name or a name
256 # derived from the (unexpanded) root
257 if "name" in self.config:
258 self.name = self.config["name"]
259 else:
260 # We use the unexpanded root in the name to indicate that this
261 # datastore can be moved without having to update registry.
262 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
264 # Support repository relocation in config
265 # Existence of self.root is checked in subclass
266 self.root = ResourcePath(
267 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
268 )
270 self.locationFactory = LocationFactory(self.root)
271 self.formatterFactory = FormatterFactory()
273 # Now associate formatters with storage classes
274 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
276 # Read the file naming templates
277 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
279 # See if composites should be disassembled
280 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
282 tableName = self.config["records", "table"]
283 try:
284 # Storage of paths and formatters, keyed by dataset_id
285 self._table = bridgeManager.opaque.register(
286 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
287 )
288 # Interface to Registry.
289 self._bridge = bridgeManager.register(self.name)
290 except ReadOnlyDatabaseError:
291 # If the database is read only and we just tried and failed to
292 # create a table, it means someone is trying to create a read-only
293 # butler client for an empty repo. That should be okay, as long
294 # as they then try to get any datasets before some other client
295 # creates the table. Chances are they're just validating
296 # configuration.
297 pass
299 # Determine whether checksums should be used - default to False
300 self.useChecksum = self.config.get("checksum", False)
302 # Determine whether we can fall back to configuration if a
303 # requested dataset is not known to registry
304 self.trustGetRequest = self.config.get("trust_get_request", False)
306 # Create a cache manager
307 self.cacheManager: AbstractDatastoreCacheManager
308 if "cached" in self.config: 308 ↛ 311line 308 didn't jump to line 311, because the condition on line 308 was never false
309 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
310 else:
311 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
313 # Check existence and create directory structure if necessary
314 if not self.root.exists():
315 if "create" not in self.config or not self.config["create"]: 315 ↛ 316line 315 didn't jump to line 316, because the condition on line 315 was never true
316 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
317 try:
318 self.root.mkdir()
319 except Exception as e:
320 raise ValueError(
321 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
322 ) from e
324 def __str__(self) -> str:
325 return str(self.root)
327 @property
328 def bridge(self) -> DatastoreRegistryBridge:
329 return self._bridge
331 def _artifact_exists(self, location: Location) -> bool:
332 """Check that an artifact exists in this datastore at the specified
333 location.
335 Parameters
336 ----------
337 location : `Location`
338 Expected location of the artifact associated with this datastore.
340 Returns
341 -------
342 exists : `bool`
343 `True` if the location can be found, `False` otherwise.
344 """
345 log.debug("Checking if resource exists: %s", location.uri)
346 return location.uri.exists()
348 def _delete_artifact(self, location: Location) -> None:
349 """Delete the artifact from the datastore.
351 Parameters
352 ----------
353 location : `Location`
354 Location of the artifact associated with this datastore.
355 """
356 if location.pathInStore.isabs(): 356 ↛ 357line 356 didn't jump to line 357, because the condition on line 356 was never true
357 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
359 try:
360 location.uri.remove()
361 except FileNotFoundError:
362 log.debug("File %s did not exist and so could not be deleted.", location.uri)
363 raise
364 except Exception as e:
365 log.critical("Failed to delete file: %s (%s)", location.uri, e)
366 raise
367 log.debug("Successfully deleted file: %s", location.uri)
369 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
370 # Docstring inherited from GenericBaseDatastore
371 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)]
372 self._table.insert(*records)
374 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
375 # Docstring inherited from GenericBaseDatastore
377 # Look for the dataset_id -- there might be multiple matches
378 # if we have disassembled the dataset.
379 records = self._table.fetch(dataset_id=ref.id)
380 return [StoredFileInfo.from_record(record) for record in records]
382 def _get_stored_records_associated_with_refs(
383 self, refs: Iterable[DatasetIdRef]
384 ) -> Dict[DatasetId, List[StoredFileInfo]]:
385 """Retrieve all records associated with the provided refs.
387 Parameters
388 ----------
389 refs : iterable of `DatasetIdRef`
390 The refs for which records are to be retrieved.
392 Returns
393 -------
394 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
395 The matching records indexed by the ref ID. The number of entries
396 in the dict can be smaller than the number of requested refs.
397 """
398 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
400 # Uniqueness is dataset_id + component so can have multiple records
401 # per ref.
402 records_by_ref = defaultdict(list)
403 for record in records:
404 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
405 return records_by_ref
407 def _refs_associated_with_artifacts(
408 self, paths: List[Union[str, ResourcePath]]
409 ) -> Dict[str, Set[DatasetId]]:
410 """Return paths and associated dataset refs.
412 Parameters
413 ----------
414 paths : `list` of `str` or `lsst.resources.ResourcePath`
415 All the paths to include in search.
417 Returns
418 -------
419 mapping : `dict` of [`str`, `set` [`DatasetId`]]
420 Mapping of each path to a set of associated database IDs.
421 """
422 records = self._table.fetch(path=[str(path) for path in paths])
423 result = defaultdict(set)
424 for row in records:
425 result[row["path"]].add(row["dataset_id"])
426 return result
428 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]:
429 """Return all dataset refs associated with the supplied path.
431 Parameters
432 ----------
433 pathInStore : `lsst.resources.ResourcePath`
434 Path of interest in the data store.
436 Returns
437 -------
438 ids : `set` of `DatasetId`
439 All `DatasetRef` IDs associated with this path.
440 """
441 records = list(self._table.fetch(path=str(pathInStore)))
442 ids = {r["dataset_id"] for r in records}
443 return ids
445 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
446 # Docstring inherited from GenericBaseDatastore
447 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
449 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
450 r"""Find all the `Location`\ s of the requested dataset in the
451 `Datastore` and the associated stored file information.
453 Parameters
454 ----------
455 ref : `DatasetRef`
456 Reference to the required `Dataset`.
458 Returns
459 -------
460 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
461 Location of the dataset within the datastore and
462 stored information about each file and its formatter.
463 """
464 # Get the file information (this will fail if no file)
465 records = self.getStoredItemsInfo(ref)
467 # Use the path to determine the location -- we need to take
468 # into account absolute URIs in the datastore record
469 return [(r.file_location(self.locationFactory), r) for r in records]
471 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
472 """Check that there is only one dataset associated with the
473 specified artifact.
475 Parameters
476 ----------
477 ref : `DatasetRef` or `FakeDatasetRef`
478 Dataset to be removed.
479 location : `Location`
480 The location of the artifact to be removed.
482 Returns
483 -------
484 can_remove : `bool`
485 `True` if the artifact can be safely removed.
486 """
487 # Can't ever delete absolute URIs.
488 if location.pathInStore.isabs():
489 return False
491 # Get all entries associated with this path
492 allRefs = self._registered_refs_per_artifact(location.pathInStore)
493 if not allRefs:
494 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
496 # Remove these refs from all the refs and if there is nothing left
497 # then we can delete
498 remainingRefs = allRefs - {ref.id}
500 if remainingRefs:
501 return False
502 return True
504 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]:
505 """Predict the location and related file information of the requested
506 dataset in this datastore.
508 Parameters
509 ----------
510 ref : `DatasetRef`
511 Reference to the required `Dataset`.
513 Returns
514 -------
515 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
516 Expected Location of the dataset within the datastore and
517 placeholder information about each file and its formatter.
519 Notes
520 -----
521 Uses the current configuration to determine how we would expect the
522 datastore files to have been written if we couldn't ask registry.
523 This is safe so long as there has been no change to datastore
524 configuration between writing the dataset and wanting to read it.
525 Will not work for files that have been ingested without using the
526 standard file template or default formatter.
527 """
529 # If we have a component ref we always need to ask the questions
530 # of the composite. If the composite is disassembled this routine
531 # should return all components. If the composite was not
532 # disassembled the composite is what is stored regardless of
533 # component request. Note that if the caller has disassembled
534 # a composite there is no way for this guess to know that
535 # without trying both the composite and component ref and seeing
536 # if there is something at the component Location even without
537 # disassembly being enabled.
538 if ref.datasetType.isComponent():
539 ref = ref.makeCompositeRef()
541 # See if the ref is a composite that should be disassembled
542 doDisassembly = self.composites.shouldBeDisassembled(ref)
544 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
546 if doDisassembly:
547 for component, componentStorage in ref.datasetType.storageClass.components.items():
548 compRef = ref.makeComponentRef(component)
549 location, formatter = self._determine_put_formatter_location(compRef)
550 all_info.append((location, formatter, componentStorage, component))
552 else:
553 # Always use the composite ref if no disassembly
554 location, formatter = self._determine_put_formatter_location(ref)
555 all_info.append((location, formatter, ref.datasetType.storageClass, None))
557 # Convert the list of tuples to have StoredFileInfo as second element
558 return [
559 (
560 location,
561 StoredFileInfo(
562 formatter=formatter,
563 path=location.pathInStore.path,
564 storageClass=storageClass,
565 component=component,
566 checksum=None,
567 file_size=-1,
568 dataset_id=ref.getCheckedId(),
569 ),
570 )
571 for location, formatter, storageClass, component in all_info
572 ]
574 def _prepare_for_get(
575 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
576 ) -> List[DatastoreFileGetInformation]:
577 """Check parameters for ``get`` and obtain formatter and
578 location.
580 Parameters
581 ----------
582 ref : `DatasetRef`
583 Reference to the required Dataset.
584 parameters : `dict`
585 `StorageClass`-specific parameters that specify, for example,
586 a slice of the dataset to be loaded.
588 Returns
589 -------
590 getInfo : `list` [`DatastoreFileGetInformation`]
591 Parameters needed to retrieve each file.
592 """
593 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
595 # Get file metadata and internal metadata
596 fileLocations = self._get_dataset_locations_info(ref)
597 if not fileLocations:
598 if not self.trustGetRequest:
599 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
600 # Assume the dataset is where we think it should be
601 fileLocations = self._get_expected_dataset_locations_info(ref)
603 # The storage class we want to use eventually
604 refStorageClass = ref.datasetType.storageClass
606 if len(fileLocations) > 1:
607 disassembled = True
609 # If trust is involved it is possible that there will be
610 # components listed here that do not exist in the datastore.
611 # Explicitly check for file artifact existence and filter out any
612 # that are missing.
613 if self.trustGetRequest:
614 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
616 # For now complain only if we have no components at all. One
617 # component is probably a problem but we can punt that to the
618 # assembler.
619 if not fileLocations: 619 ↛ 620line 619 didn't jump to line 620, because the condition on line 619 was never true
620 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
622 else:
623 disassembled = False
625 # Is this a component request?
626 refComponent = ref.datasetType.component()
628 fileGetInfo = []
629 for location, storedFileInfo in fileLocations:
631 # The storage class used to write the file
632 writeStorageClass = storedFileInfo.storageClass
634 # If this has been disassembled we need read to match the write
635 if disassembled:
636 readStorageClass = writeStorageClass
637 else:
638 readStorageClass = refStorageClass
640 formatter = get_instance_of(
641 storedFileInfo.formatter,
642 FileDescriptor(
643 location,
644 readStorageClass=readStorageClass,
645 storageClass=writeStorageClass,
646 parameters=parameters,
647 ),
648 ref.dataId,
649 )
651 formatterParams, notFormatterParams = formatter.segregateParameters()
653 # Of the remaining parameters, extract the ones supported by
654 # this StorageClass (for components not all will be handled)
655 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
657 # The ref itself could be a component if the dataset was
658 # disassembled by butler, or we disassembled in datastore and
659 # components came from the datastore records
660 component = storedFileInfo.component if storedFileInfo.component else refComponent
662 fileGetInfo.append(
663 DatastoreFileGetInformation(
664 location,
665 formatter,
666 storedFileInfo,
667 assemblerParams,
668 formatterParams,
669 component,
670 readStorageClass,
671 )
672 )
674 return fileGetInfo
676 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
677 """Check the arguments for ``put`` and obtain formatter and
678 location.
680 Parameters
681 ----------
682 inMemoryDataset : `object`
683 The dataset to store.
684 ref : `DatasetRef`
685 Reference to the associated Dataset.
687 Returns
688 -------
689 location : `Location`
690 The location to write the dataset.
691 formatter : `Formatter`
692 The `Formatter` to use to write the dataset.
694 Raises
695 ------
696 TypeError
697 Supplied object and storage class are inconsistent.
698 DatasetTypeNotSupportedError
699 The associated `DatasetType` is not handled by this datastore.
700 """
701 self._validate_put_parameters(inMemoryDataset, ref)
702 return self._determine_put_formatter_location(ref)
704 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
705 """Calculate the formatter and output location to use for put.
707 Parameters
708 ----------
709 ref : `DatasetRef`
710 Reference to the associated Dataset.
712 Returns
713 -------
714 location : `Location`
715 The location to write the dataset.
716 formatter : `Formatter`
717 The `Formatter` to use to write the dataset.
718 """
719 # Work out output file name
720 try:
721 template = self.templates.getTemplate(ref)
722 except KeyError as e:
723 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
725 # Validate the template to protect against filenames from different
726 # dataIds returning the same and causing overwrite confusion.
727 template.validateTemplate(ref)
729 location = self.locationFactory.fromPath(template.format(ref))
731 # Get the formatter based on the storage class
732 storageClass = ref.datasetType.storageClass
733 try:
734 formatter = self.formatterFactory.getFormatter(
735 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
736 )
737 except KeyError as e:
738 raise DatasetTypeNotSupportedError(
739 f"Unable to find formatter for {ref} in datastore {self.name}"
740 ) from e
742 # Now that we know the formatter, update the location
743 location = formatter.makeUpdatedLocation(location)
745 return location, formatter
747 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
748 # Docstring inherited from base class
749 if transfer != "auto":
750 return transfer
752 # See if the paths are within the datastore or not
753 inside = [self._pathInStore(d.path) is not None for d in datasets]
755 if all(inside):
756 transfer = None
757 elif not any(inside): 757 ↛ 766line 757 didn't jump to line 766, because the condition on line 757 was never false
758 # Allow ResourcePath to use its own knowledge
759 transfer = "auto"
760 else:
761 # This can happen when importing from a datastore that
762 # has had some datasets ingested using "direct" mode.
763 # Also allow ResourcePath to sort it out but warn about it.
766 log.warning(
767 "Some datasets are inside the datastore and some are outside. Using 'split' "
768 "transfer mode. This assumes that the files outside the datastore are "
769 "still accessible to the new butler since they will not be copied into "
770 "the target datastore."
771 )
772 transfer = "split"
774 return transfer
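# Editor's note (illustrative, not part of the original module): the "auto"
# resolution above collapses to three cases, depending on whether each
# FileDataset path already resolves inside the datastore root:
#   every path inside  -> transfer = None    (no copy; files are already in place)
#   no path inside     -> transfer = "auto"  (ResourcePath picks a concrete mode)
#   a mixture of both  -> transfer = "split" (in-store files kept as-is, external
#                                             files referenced by absolute URI)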
776 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]:
777 """Return path relative to datastore root
779 Parameters
780 ----------
781 path : `lsst.resources.ResourcePathExpression`
782 Path to dataset. Can be an absolute URI. If relative, it is
783 assumed to be relative to the datastore root. The path is
784 returned relative to the datastore, or `None` if it is outside.
786 Returns
787 -------
788 inStore : `str` or `None`
789 Path relative to datastore root. Returns `None` if the file is
790 outside the root.
791 """
792 # Relative path will always be relative to datastore
793 pathUri = ResourcePath(path, forceAbsolute=False)
794 return pathUri.relative_to(self.root)
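# Editor's sketch (not part of the original module), relying on the behaviour
# documented for lsst.resources.ResourcePath.relative_to(): a path under the
# root yields a relative string, anything else yields None, which is exactly
# what _pathInStore() forwards to its callers. The paths below are hypothetical.
#
#     root = ResourcePath("file:///repo/butler/", forceDirectory=True)
#     ResourcePath("file:///repo/butler/a/b.fits").relative_to(root)  # "a/b.fits"
#     ResourcePath("file:///elsewhere/b.fits").relative_to(root)      # None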
796 def _standardizeIngestPath(
797 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None
798 ) -> Union[str, ResourcePath]:
799 """Standardize the path of a to-be-ingested file.
801 Parameters
802 ----------
803 path : `str` or `lsst.resources.ResourcePath`
804 Path of a file to be ingested. This parameter is not expected
805 to be all the types that can be used to construct a
806 `~lsst.resources.ResourcePath`.
807 transfer : `str`, optional
808 How (and whether) the dataset should be added to the datastore.
809 See `ingest` for details of transfer modes.
810 This implementation is provided only so
811 `NotImplementedError` can be raised if the mode is not supported;
812 actual transfers are deferred to `_extractIngestInfo`.
814 Returns
815 -------
816 path : `str` or `lsst.resources.ResourcePath`
817 New path in what the datastore considers standard form. If an
818 absolute URI was given that will be returned unchanged.
820 Notes
821 -----
822 Subclasses of `FileDatastore` can implement this method instead
823 of `_prepIngest`. It should not modify the data repository or given
824 file in any way.
826 Raises
827 ------
828 NotImplementedError
829 Raised if the datastore does not support the given transfer mode
830 (including the case where ingest is not supported at all).
831 FileNotFoundError
832 Raised if one of the given files does not exist.
833 """
834 if transfer not in (None, "direct", "split") + self.root.transferModes: 834 ↛ 835line 834 didn't jump to line 835, because the condition on line 834 was never true
835 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
837 # A relative URI indicates relative to datastore root
838 srcUri = ResourcePath(path, forceAbsolute=False)
839 if not srcUri.isabs():
840 srcUri = self.root.join(path)
842 if not srcUri.exists():
843 raise FileNotFoundError(
844 f"Resource at {srcUri} does not exist; note that paths to ingest "
845 f"are assumed to be relative to {self.root} unless they are absolute."
846 )
848 if transfer is None:
849 relpath = srcUri.relative_to(self.root)
850 if not relpath:
851 raise RuntimeError(
852 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
853 )
855 # Return the relative path within the datastore for internal
856 # transfer
857 path = relpath
859 return path
861 def _extractIngestInfo(
862 self,
863 path: ResourcePathExpression,
864 ref: DatasetRef,
865 *,
866 formatter: Union[Formatter, Type[Formatter]],
867 transfer: Optional[str] = None,
868 record_validation_info: bool = True,
869 ) -> StoredFileInfo:
870 """Relocate (if necessary) and extract `StoredFileInfo` from a
871 to-be-ingested file.
873 Parameters
874 ----------
875 path : `lsst.resources.ResourcePathExpression`
876 URI or path of a file to be ingested.
877 ref : `DatasetRef`
878 Reference for the dataset being ingested. Guaranteed to have
879 ``dataset_id is not None``.
880 formatter : `type` or `Formatter`
881 `Formatter` subclass to use for this dataset or an instance.
882 transfer : `str`, optional
883 How (and whether) the dataset should be added to the datastore.
884 See `ingest` for details of transfer modes.
885 record_validation_info : `bool`, optional
886 If `True`, the default, the datastore can record validation
887 information associated with the file. If `False` the datastore
888 will not attempt to track any information such as checksums
889 or file sizes. This can be useful if such information is tracked
890 in an external system or if the file is to be compressed in place.
891 It is up to the datastore whether this parameter is relevant.
893 Returns
894 -------
895 info : `StoredFileInfo`
896 Internal datastore record for this file. This will be inserted by
897 the caller; the `_extractIngestInfo` is only responsible for
898 creating and populating the struct.
900 Raises
901 ------
902 FileNotFoundError
903 Raised if one of the given files does not exist.
904 FileExistsError
905 Raised if transfer is not `None` but the (internal) location the
906 file would be moved to is already occupied.
907 """
908 if self._transaction is None: 908 ↛ 909line 908 didn't jump to line 909, because the condition on line 908 was never true
909 raise RuntimeError("Ingest called without transaction enabled")
911 # Create URI of the source path, do not need to force a relative
912 # path to absolute.
913 srcUri = ResourcePath(path, forceAbsolute=False)
915 # Track whether we have read the size of the source yet
916 have_sized = False
918 tgtLocation: Optional[Location]
919 if transfer is None or transfer == "split":
920 # A relative path is assumed to be relative to the datastore
921 # in this context
922 if not srcUri.isabs():
923 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
924 else:
925 # Work out the path in the datastore from an absolute URI
926 # This is required to be within the datastore.
927 pathInStore = srcUri.relative_to(self.root)
928 if pathInStore is None and transfer is None: 928 ↛ 929line 928 didn't jump to line 929, because the condition on line 928 was never true
929 raise RuntimeError(
930 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
931 )
932 if pathInStore: 932 ↛ 934line 932 didn't jump to line 934, because the condition on line 932 was never false
933 tgtLocation = self.locationFactory.fromPath(pathInStore)
934 elif transfer == "split":
935 # Outside the datastore but treat that as a direct ingest
936 # instead.
937 tgtLocation = None
938 else:
939 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
940 elif transfer == "direct": 940 ↛ 945line 940 didn't jump to line 945, because the condition on line 940 was never true
941 # Want to store the full URI to the resource directly in
942 # datastore. This is useful for referring to permanent archive
943 # storage for raw data.
944 # Trust that people know what they are doing.
945 tgtLocation = None
946 else:
947 # Work out the name we want this ingested file to have
948 # inside the datastore
949 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
950 if not tgtLocation.uri.dirname().exists():
951 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
952 tgtLocation.uri.dirname().mkdir()
954 # if we are transferring from a local file to a remote location
955 # it may be more efficient to get the size and checksum of the
956 # local file rather than the transferred one
957 if record_validation_info and srcUri.isLocal:
958 size = srcUri.size()
959 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
960 have_sized = True
962 # Transfer the resource to the destination.
963 # Allow overwrite of an existing file. This matches the behavior
964 # of datastore.put() in that it trusts that registry would not
965 # be asking to overwrite unless registry thought that the
966 # overwrite was allowed.
967 tgtLocation.uri.transfer_from(
968 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
969 )
971 if tgtLocation is None: 971 ↛ 973line 971 didn't jump to line 973, because the condition on line 971 was never true
972 # This means we are using direct mode
973 targetUri = srcUri
974 targetPath = str(srcUri)
975 else:
976 targetUri = tgtLocation.uri
977 targetPath = tgtLocation.pathInStore.path
979 # the file should exist in the datastore now
980 if record_validation_info:
981 if not have_sized:
982 size = targetUri.size()
983 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
984 else:
985 # Not recording any file information.
986 size = -1
987 checksum = None
989 return StoredFileInfo(
990 formatter=formatter,
991 path=targetPath,
992 storageClass=ref.datasetType.storageClass,
993 component=ref.datasetType.component(),
994 file_size=size,
995 checksum=checksum,
996 dataset_id=ref.getCheckedId(),
997 )
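# Editor's summary (not part of the original module) of how the transfer mode
# maps onto the stored path in the method above:
#   transfer None / "split", path inside root -> path stored relative to the root
#   transfer None, path outside root          -> RuntimeError
#   transfer "split", path outside root       -> absolute URI stored (direct-like)
#   transfer "direct"                         -> absolute URI stored, file untouched
#   any other mode                            -> file copied/moved to a name built
#                                                from the file template, then stored
#                                                relative to the root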
999 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
1000 # Docstring inherited from Datastore._prepIngest.
1001 filtered = []
1002 for dataset in datasets:
1003 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1004 if not acceptable:
1005 continue
1006 else:
1007 dataset.refs = acceptable
1008 if dataset.formatter is None:
1009 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1010 else:
1011 assert isinstance(dataset.formatter, (type, str))
1012 formatter_class = get_class_of(dataset.formatter)
1013 if not issubclass(formatter_class, Formatter): 1013 ↛ 1014line 1013 didn't jump to line 1014, because the condition on line 1013 was never true
1014 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1015 dataset.formatter = formatter_class
1016 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1017 filtered.append(dataset)
1018 return _IngestPrepData(filtered)
1020 @transactional
1021 def _finishIngest(
1022 self,
1023 prepData: Datastore.IngestPrepData,
1024 *,
1025 transfer: Optional[str] = None,
1026 record_validation_info: bool = True,
1027 ) -> None:
1028 # Docstring inherited from Datastore._finishIngest.
1029 refsAndInfos = []
1030 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1031 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1032 # Do ingest as if the first dataset ref is associated with the file
1033 info = self._extractIngestInfo(
1034 dataset.path,
1035 dataset.refs[0],
1036 formatter=dataset.formatter,
1037 transfer=transfer,
1038 record_validation_info=record_validation_info,
1039 )
1040 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1041 self._register_datasets(refsAndInfos)
1043 def _calculate_ingested_datastore_name(
1044 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]]
1045 ) -> Location:
1046 """Given a source URI and a DatasetRef, determine the name the
1047 dataset will have inside datastore.
1049 Parameters
1050 ----------
1051 srcUri : `lsst.resources.ResourcePath`
1052 URI to the source dataset file.
1053 ref : `DatasetRef`
1054 Ref associated with the newly-ingested dataset artifact. This
1055 is used to determine the name within the datastore.
1056 formatter : `Formatter` or `Formatter` class.
1057 Formatter to use for validation. Can be a class or an instance.
1059 Returns
1060 -------
1061 location : `Location`
1062 Target location for the newly-ingested dataset.
1063 """
1064 # Ingesting a file from outside the datastore.
1065 # This involves a new name.
1066 template = self.templates.getTemplate(ref)
1067 location = self.locationFactory.fromPath(template.format(ref))
1069 # Get the extension
1070 ext = srcUri.getExtension()
1072 # Update the destination to include that extension
1073 location.updateExtension(ext)
1075 # Ask the formatter to validate this extension
1076 formatter.validateExtension(location)
1078 return location
1080 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1081 """Write out in memory dataset to datastore.
1083 Parameters
1084 ----------
1085 inMemoryDataset : `object`
1086 Dataset to write to datastore.
1087 ref : `DatasetRef`
1088 Registry information associated with this dataset.
1090 Returns
1091 -------
1092 info : `StoredFileInfo`
1093 Information describing the artifact written to the datastore.
1094 """
1095 # May need to coerce the in memory dataset to the correct
1096 # python type.
1097 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1099 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1100 uri = location.uri
1102 if not uri.dirname().exists():
1103 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1104 uri.dirname().mkdir()
1106 if self._transaction is None: 1106 ↛ 1107line 1106 didn't jump to line 1107, because the condition on line 1106 was never true
1107 raise RuntimeError("Attempting to write artifact without transaction enabled")
1109 def _removeFileExists(uri: ResourcePath) -> None:
1110 """Remove a file and do not complain if it is not there.
1112 This is important since a formatter might fail before the file
1113 is written and we should not confuse people by writing spurious
1114 error messages to the log.
1115 """
1116 try:
1117 uri.remove()
1118 except FileNotFoundError:
1119 pass
1121 # Register a callback to try to delete the uploaded data if
1122 # something fails below
1123 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1125 data_written = False
1126 if not uri.isLocal:
1127 # This is a remote URI. Some datasets can be serialized directly
1128 # to bytes and sent to the remote datastore without writing a
1129 # file. If the dataset is intended to be saved to the cache
1130 # a file is always written and direct write to the remote
1131 # datastore is bypassed.
1132 if not self.cacheManager.should_be_cached(ref):
1133 try:
1134 serializedDataset = formatter.toBytes(inMemoryDataset)
1135 except NotImplementedError:
1136 # Fallback to the file writing option.
1137 pass
1138 except Exception as e:
1139 raise RuntimeError(
1140 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1141 ) from e
1142 else:
1143 log.debug("Writing bytes directly to %s", uri)
1144 uri.write(serializedDataset, overwrite=True)
1145 log.debug("Successfully wrote bytes directly to %s", uri)
1146 data_written = True
1148 if not data_written:
1149 # Did not write the bytes directly to object store so instead
1150 # write to temporary file. Always write to a temporary even if
1151 # using a local file system -- that gives us atomic writes.
1152 # If a process is killed as the file is being written we do not
1153 # want it to remain in the correct place but in corrupt state.
1154 # For local files write to the output directory not temporary dir.
1155 prefix = uri.dirname() if uri.isLocal else None
1156 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1157 # Need to configure the formatter to write to a different
1158 # location and that needs us to overwrite internals
1159 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1160 with formatter._updateLocation(Location(None, temporary_uri)):
1161 try:
1162 formatter.write(inMemoryDataset)
1163 except Exception as e:
1164 raise RuntimeError(
1165 f"Failed to serialize dataset {ref} of type"
1166 f" {type(inMemoryDataset)} to "
1167 f"temporary location {temporary_uri}"
1168 ) from e
1170 # Use move for a local file since that becomes an efficient
1171 # os.rename. For remote resources we use copy to allow the
1172 # file to be cached afterwards.
1173 transfer = "move" if uri.isLocal else "copy"
1175 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1177 if transfer == "copy":
1178 # Cache if required
1179 self.cacheManager.move_to_cache(temporary_uri, ref)
1181 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1183 # URI is needed to resolve which ingest case we are dealing with
1184 return self._extractIngestInfo(uri, ref, formatter=formatter)
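# Editor's summary (not part of the original module) of the write path above:
#   remote URI, dataset not destined for the cache, formatter supports toBytes()
#       -> serialize in memory and upload the bytes directly;
#   otherwise
#       -> write to a temporary file first (atomic even for local files), then
#          "move" it into place locally or "copy" it to a remote store, caching
#          the temporary copy afterwards when a copy was used.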
1186 def _read_artifact_into_memory(
1187 self,
1188 getInfo: DatastoreFileGetInformation,
1189 ref: DatasetRef,
1190 isComponent: bool = False,
1191 cache_ref: Optional[DatasetRef] = None,
1192 ) -> Any:
1193 """Read the artifact from datastore into in memory object.
1195 Parameters
1196 ----------
1197 getInfo : `DatastoreFileGetInformation`
1198 Information about the artifact within the datastore.
1199 ref : `DatasetRef`
1200 The registry information associated with this artifact.
1201 isComponent : `bool`
1202 Flag to indicate if a component is being read from this artifact.
1203 cache_ref : `DatasetRef`, optional
1204 The DatasetRef to use when looking up the file in the cache.
1205 This ref must have the same ID as the supplied ref but can
1206 be a parent ref or component ref to indicate to the cache whether
1207 a composite file is being requested from the cache or a component
1208 file. Without this the cache will default to the supplied ref but
1209 it can get confused with read-only derived components for
1210 disassembled composites.
1212 Returns
1213 -------
1214 inMemoryDataset : `object`
1215 The artifact as a python object.
1216 """
1217 location = getInfo.location
1218 uri = location.uri
1219 log.debug("Accessing data from %s", uri)
1221 if cache_ref is None:
1222 cache_ref = ref
1223 if cache_ref.id != ref.id: 1223 ↛ 1224line 1223 didn't jump to line 1224, because the condition on line 1223 was never true
1224 raise ValueError(
1225 "The supplied cache dataset ref refers to a different dataset than expected:"
1226 f" {ref.id} != {cache_ref.id}"
1227 )
1229 # Cannot recalculate checksum but can compare size as a quick check
1230 # Do not do this if the size is negative since that indicates
1231 # we do not know.
1232 recorded_size = getInfo.info.file_size
1233 resource_size = uri.size()
1234 if recorded_size >= 0 and resource_size != recorded_size: 1234 ↛ 1235line 1234 didn't jump to line 1235, because the condition on line 1234 was never true
1235 raise RuntimeError(
1236 "Integrity failure in Datastore. "
1237 f"Size of file {uri} ({resource_size}) "
1238 f"does not match size recorded in registry of {recorded_size}"
1239 )
1241 # For the general case we have choices for how to proceed.
1242 # 1. Always use a local file (downloading the remote resource to a
1243 # temporary file if needed).
1244 # 2. Use a threshold size and read into memory and use bytes.
1245 # Use both for now with an arbitrary hand off size.
1246 # This allows small datasets to be downloaded from remote object
1247 # stores without requiring a temporary file.
1249 formatter = getInfo.formatter
1250 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1251 if resource_size <= nbytes_max and formatter.can_read_bytes():
1252 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1253 if cached_file is not None:
1254 desired_uri = cached_file
1255 msg = f" (cached version of {uri})"
1256 else:
1257 desired_uri = uri
1258 msg = ""
1259 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1260 serializedDataset = desired_uri.read()
1261 log.debug(
1262 "Deserializing %s from %d bytes from location %s with formatter %s",
1263 f"component {getInfo.component}" if isComponent else "",
1264 len(serializedDataset),
1265 uri,
1266 formatter.name(),
1267 )
1268 try:
1269 result = formatter.fromBytes(
1270 serializedDataset, component=getInfo.component if isComponent else None
1271 )
1272 except Exception as e:
1273 raise ValueError(
1274 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1275 f" ({ref.datasetType.name} from {uri}): {e}"
1276 ) from e
1277 else:
1278 # Read from file.
1280 # Have to update the Location associated with the formatter
1281 # because formatter.read does not allow an override.
1282 # This could be improved.
1283 location_updated = False
1284 msg = ""
1286 # First check in cache for local version.
1287 # The cache will only be relevant for remote resources but
1288 # no harm in always asking. Context manager ensures that cache
1289 # file is not deleted during cache expiration.
1290 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1291 if cached_file is not None:
1292 msg = f"(via cache read of remote file {uri})"
1293 uri = cached_file
1294 location_updated = True
1296 with uri.as_local() as local_uri:
1298 can_be_cached = False
1299 if uri != local_uri: 1299 ↛ 1301line 1299 didn't jump to line 1301, because the condition on line 1299 was never true
1300 # URI was remote and file was downloaded
1301 cache_msg = ""
1302 location_updated = True
1304 if self.cacheManager.should_be_cached(cache_ref):
1305 # In this scenario we want to ask if the downloaded
1306 # file should be cached but we should not cache
1307 # it until after we've used it (to ensure it can't
1308 # be expired whilst we are using it).
1309 can_be_cached = True
1311 # Say that it is "likely" to be cached because
1312 # if the formatter read fails we will not be
1313 # caching this file.
1314 cache_msg = " and likely cached"
1316 msg = f"(via download to local file{cache_msg})"
1318 # Calculate the (possibly) new location for the formatter
1319 # to use.
1320 newLocation = Location(*local_uri.split()) if location_updated else None
1322 log.debug(
1323 "Reading%s from location %s %s with formatter %s",
1324 f" component {getInfo.component}" if isComponent else "",
1325 uri,
1326 msg,
1327 formatter.name(),
1328 )
1329 try:
1330 with formatter._updateLocation(newLocation):
1331 with time_this(
1332 log,
1333 msg="Reading%s from location %s %s with formatter %s",
1334 args=(
1335 f" component {getInfo.component}" if isComponent else "",
1336 uri,
1337 msg,
1338 formatter.name(),
1339 ),
1340 ):
1341 result = formatter.read(component=getInfo.component if isComponent else None)
1342 except Exception as e:
1343 raise ValueError(
1344 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1345 f" ({ref.datasetType.name} from {uri}): {e}"
1346 ) from e
1348 # File was read successfully so can move to cache
1349 if can_be_cached: 1349 ↛ 1350line 1349 didn't jump to line 1350, because the condition on line 1349 was never true
1350 self.cacheManager.move_to_cache(local_uri, cache_ref)
1352 return self._post_process_get(
1353 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent
1354 )
1356 def knows(self, ref: DatasetRef) -> bool:
1357 """Check if the dataset is known to the datastore.
1359 Does not check for existence of any artifact.
1361 Parameters
1362 ----------
1363 ref : `DatasetRef`
1364 Reference to the required dataset.
1366 Returns
1367 -------
1368 exists : `bool`
1369 `True` if the dataset is known to the datastore.
1370 """
1371 fileLocations = self._get_dataset_locations_info(ref)
1372 if fileLocations:
1373 return True
1374 return False
1376 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1377 # Docstring inherited from the base class.
1379 # The records themselves. Could be missing some entries.
1380 records = self._get_stored_records_associated_with_refs(refs)
1382 return {ref: ref.id in records for ref in refs}
1384 def _process_mexists_records(
1385 self,
1386 id_to_ref: Dict[DatasetId, DatasetRef],
1387 records: Dict[DatasetId, List[StoredFileInfo]],
1388 all_required: bool,
1389 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
1390 ) -> Dict[DatasetRef, bool]:
1391 """Helper function for mexists that checks the given records.
1393 Parameters
1394 ----------
1395 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1396 Mapping of the dataset ID to the dataset ref itself.
1397 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1398 Records as generally returned by
1399 ``_get_stored_records_associated_with_refs``.
1400 all_required : `bool`
1401 If `True`, a dataset is reported as existing only when all artifacts
1402 associated with its dataset ID exist; otherwise any one is sufficient.
1403 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1404 Optional mapping of datastore artifact to existence. Updated by
1405 this method with details of all artifacts tested. Can be `None`
1406 if the caller is not interested.
1408 Returns
1409 -------
1410 existence : `dict` of [`DatasetRef`, `bool`]
1411 Mapping from dataset to boolean indicating existence.
1412 """
1413 # The URIs to be checked and a mapping of those URIs to
1414 # the dataset ID.
1415 uris_to_check: List[ResourcePath] = []
1416 location_map: Dict[ResourcePath, DatasetId] = {}
1418 location_factory = self.locationFactory
1420 uri_existence: Dict[ResourcePath, bool] = {}
1421 for ref_id, infos in records.items():
1422 # Key is the dataset ID, value is a list of StoredFileInfo
1423 uris = [info.file_location(location_factory).uri for info in infos]
1424 location_map.update({uri: ref_id for uri in uris})
1426 # Check the local cache directly for a dataset corresponding
1427 # to the remote URI.
1428 if self.cacheManager.file_count > 0: 1428 ↛ 1429line 1428 didn't jump to line 1429, because the condition on line 1428 was never true
1429 ref = id_to_ref[ref_id]
1430 for uri, storedFileInfo in zip(uris, infos):
1431 check_ref = ref
1432 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1433 check_ref = ref.makeComponentRef(component)
1434 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1435 # Proxy for URI existence.
1436 uri_existence[uri] = True
1437 else:
1438 uris_to_check.append(uri)
1439 else:
1440 # Check all of them.
1441 uris_to_check.extend(uris)
1443 if artifact_existence is not None:
1444 # If a URI has already been checked remove it from the list
1445 # and immediately add the status to the output dict.
1446 filtered_uris_to_check = []
1447 for uri in uris_to_check:
1448 if uri in artifact_existence:
1449 uri_existence[uri] = artifact_existence[uri]
1450 else:
1451 filtered_uris_to_check.append(uri)
1452 uris_to_check = filtered_uris_to_check
1454 # Results.
1455 dataset_existence: Dict[DatasetRef, bool] = {}
1457 uri_existence.update(ResourcePath.mexists(uris_to_check))
1458 for uri, exists in uri_existence.items():
1459 dataset_id = location_map[uri]
1460 ref = id_to_ref[dataset_id]
1462 # Disassembled composite needs to check all locations.
1463 # all_required indicates whether all need to exist or not.
1464 if ref in dataset_existence:
1465 if all_required:
1466 exists = dataset_existence[ref] and exists
1467 else:
1468 exists = dataset_existence[ref] or exists
1469 dataset_existence[ref] = exists
1471 if artifact_existence is not None:
1472 artifact_existence.update(uri_existence)
1474 return dataset_existence
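# Editor's sketch (not part of the original module): how per-artifact existence
# is folded into per-dataset existence above. With all_required=True every
# artifact of a disassembled composite must exist; otherwise one is enough.
def _combine_existence(per_artifact: list[bool], all_required: bool) -> bool:
    return all(per_artifact) if all_required else any(per_artifact)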
1476 def mexists(
1477 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1478 ) -> Dict[DatasetRef, bool]:
1479 """Check the existence of multiple datasets at once.
1481 Parameters
1482 ----------
1483 refs : iterable of `DatasetRef`
1484 The datasets to be checked.
1485 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1486 Optional mapping of datastore artifact to existence. Updated by
1487 this method with details of all artifacts tested. Can be `None`
1488 if the caller is not interested.
1490 Returns
1491 -------
1492 existence : `dict` of [`DatasetRef`, `bool`]
1493 Mapping from dataset to boolean indicating existence.
1495 Notes
1496 -----
1497 To minimize potentially costly remote existence checks, the local
1498 cache is checked as a proxy for existence. If a file for this
1499 `DatasetRef` does exist, no check is done for the actual URI. This
1500 could result in unexpected behavior if the dataset itself
1501 has been removed from the datastore by another process whilst it is
1502 still in the cache.
1503 """
1504 chunk_size = 10_000
1505 dataset_existence: Dict[DatasetRef, bool] = {}
1506 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1507 n_found_total = 0
1508 n_checked = 0
1509 n_chunks = 0
1510 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1511 chunk_result = self._mexists(chunk, artifact_existence)
1512 if log.isEnabledFor(VERBOSE):
1513 n_results = len(chunk_result)
1514 n_checked += n_results
1515 # Can treat the booleans as 0, 1 integers and sum them.
1516 n_found = sum(chunk_result.values())
1517 n_found_total += n_found
1518 log.verbose(
1519 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)",
1520 n_chunks,
1521 n_found,
1522 n_results,
1523 n_found_total,
1524 n_checked,
1525 )
1526 dataset_existence.update(chunk_result)
1527 n_chunks += 1
1529 return dataset_existence
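# Editor's usage sketch (not part of the original module); butler and refs are
# assumed to exist already and the datastore is assumed to be a FileDatastore.
#
#     artifact_cache: dict[ResourcePath, bool] = {}
#     existence = butler.datastore.mexists(refs, artifact_existence=artifact_cache)
#     missing = [ref for ref, ok in existence.items() if not ok]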
1531 def _mexists(
1532 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1533 ) -> Dict[DatasetRef, bool]:
1534 """Check the existence of multiple datasets at once.
1536 Parameters
1537 ----------
1538 refs : iterable of `DatasetRef`
1539 The datasets to be checked.
1541 Returns
1542 -------
1543 existence : `dict` of [`DatasetRef`, `bool`]
1544 Mapping from dataset to boolean indicating existence.
1545 """
1546 # Need a mapping of dataset_id to dataset ref since the API
1547 # works with dataset_id
1548 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1550 # Set of all IDs we are checking for.
1551 requested_ids = set(id_to_ref.keys())
1553 # The records themselves. Could be missing some entries.
1554 records = self._get_stored_records_associated_with_refs(refs)
1556 dataset_existence = self._process_mexists_records(
1557 id_to_ref, records, True, artifact_existence=artifact_existence
1558 )
1560 # Set of IDs that have been handled.
1561 handled_ids = {ref.id for ref in dataset_existence.keys()}
1563 missing_ids = requested_ids - handled_ids
1564 if missing_ids:
1565 if not self.trustGetRequest:
1566 # Must assume these do not exist
1567 for missing in missing_ids:
1568 dataset_existence[id_to_ref[missing]] = False
1569 else:
1570 log.debug(
1571 "%d out of %d datasets were not known to datastore during initial existence check.",
1572 len(missing_ids),
1573 len(requested_ids),
1574 )
1576 # Construct data structure identical to that returned
1577 # by _get_stored_records_associated_with_refs() but using
1578 # guessed names.
1579 records = {}
1580 for missing in missing_ids:
1581 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1582 records[missing] = [info for _, info in expected]
1584 dataset_existence.update(
1585 self._process_mexists_records(
1586 id_to_ref, records, False, artifact_existence=artifact_existence
1587 )
1588 )
1590 return dataset_existence
1592 def exists(self, ref: DatasetRef) -> bool:
1593 """Check if the dataset exists in the datastore.
1595 Parameters
1596 ----------
1597 ref : `DatasetRef`
1598 Reference to the required dataset.
1600 Returns
1601 -------
1602 exists : `bool`
1603 `True` if the entity exists in the `Datastore`.
1605 Notes
1606 -----
1607 The local cache is checked as a proxy for existence in the remote
1608 object store. It is possible that another process on a different
1609 compute node could remove the file from the object store even
1610 though it is present in the local cache.
1611 """
1612 fileLocations = self._get_dataset_locations_info(ref)
1614 # If we are being asked to trust that the registry might not be
1615 # correct, we ask for the expected locations and check them explicitly.
1616 if not fileLocations:
1617 if not self.trustGetRequest:
1618 return False
1620 # First check the cache. If it is not found we must check
1621 # the datastore itself. Assume that any component in the cache
1622 # means that the dataset does exist somewhere.
1623 if self.cacheManager.known_to_cache(ref): 1623 ↛ 1624line 1623 didn't jump to line 1624, because the condition on line 1623 was never true
1624 return True
1626 # When we are guessing a dataset location we can not check
1627 # for the existence of every component since we can not
1628 # know if every component was written. Instead we check
1629 # for the existence of any of the expected locations.
1630 for location, _ in self._get_expected_dataset_locations_info(ref):
1631 if self._artifact_exists(location):
1632 return True
1633 return False
1635 # All listed artifacts must exist.
1636 for location, storedFileInfo in fileLocations:
1637 # Checking in cache needs the component ref.
1638 check_ref = ref
1639 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1640 check_ref = ref.makeComponentRef(component)
1641 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1642 continue
1644 if not self._artifact_exists(location):
1645 return False
1647 return True
1649 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1650 """Return URIs associated with dataset.
1652 Parameters
1653 ----------
1654 ref : `DatasetRef`
1655 Reference to the required dataset.
1656 predict : `bool`, optional
1657 If the datastore does not know about the dataset, should it
1658 return a predicted URI or not?
1660 Returns
1661 -------
1662 uris : `DatasetRefURIs`
1663 The URI to the primary artifact associated with this dataset (if
1664 the dataset was disassembled within the datastore this may be
1665 `None`), and the URIs to any components associated with the dataset
1666 artifact (this can be empty if there are no components).
1667 """
1668 # if this has never been written then we have to guess
1669 if not self.exists(ref):
1670 if not predict:
1671 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1673 return self._predict_URIs(ref)
1675 # If this is a ref that we have written we can get the path.
1676 # Get file metadata and internal metadata
1677 fileLocations = self._get_dataset_locations_info(ref)
1679 return self._locations_to_URI(ref, fileLocations)
1681 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1682 """URI to the Dataset.
1684 Parameters
1685 ----------
1686 ref : `DatasetRef`
1687 Reference to the required Dataset.
1688 predict : `bool`
1689 If `True`, allow URIs to be returned for datasets that have not
1690 been written.
1692 Returns
1693 -------
1694 uri : `lsst.resources.ResourcePath`
1695 URI pointing to the dataset within the datastore. If the
1696 dataset does not exist in the datastore, and if ``predict`` is
1697 `True`, the URI will be a prediction and will include a URI
1698 fragment "#predicted".
1699 If the datastore does not have entities that relate well
1700 to the concept of a URI the returned URI will be
1701 descriptive. The returned URI is not guaranteed to be obtainable.
1703 Raises
1704 ------
1705 FileNotFoundError
1706 Raised if a URI has been requested for a dataset that does not
1707 exist and guessing is not allowed.
1708 RuntimeError
1709 Raised if a request is made for a single URI but multiple URIs
1710 are associated with this dataset.
1712 Notes
1713 -----
1714 When a predicted URI is requested an attempt will be made to form
1715 a reasonable URI based on file templates and the expected formatter.
1716 """
1717 primary, components = self.getURIs(ref, predict)
1718 if primary is None or components: 1718 ↛ 1719line 1718 didn't jump to line 1719, because the condition on line 1718 was never true
1719 raise RuntimeError(
1720 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1721 )
1722 return primary
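# Illustrative usage sketch: predicted URIs carry a "#predicted" fragment so
# callers can recognise them. The ``datastore`` and ``ref`` names are assumed.
#
#     uri = datastore.getURI(ref, predict=True)
#     if uri.geturl().endswith("#predicted"):
#         log.info("Dataset %s has not been written yet", ref)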
1724 def _predict_URIs(
1725 self,
1726 ref: DatasetRef,
1727 ) -> DatasetRefURIs:
1728 """Predict the URIs of a dataset ref.
1730 Parameters
1731 ----------
1732 ref : `DatasetRef`
1733 Reference to the required Dataset.
1735 Returns
1736 -------
1737 uris : `DatasetRefURIs`
1738 Primary and component URIs. URIs will contain a URI fragment
1739 "#predicted".
1740 """
1741 uris = DatasetRefURIs()
1743 if self.composites.shouldBeDisassembled(ref):
1745 for component, _ in ref.datasetType.storageClass.components.items():
1746 comp_ref = ref.makeComponentRef(component)
1747 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1749 # Add the "#predicted" URI fragment to indicate this is a
1750 # guess
1751 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1753 else:
1755 location, _ = self._determine_put_formatter_location(ref)
1757 # Add the "#predicted" URI fragment to indicate this is a guess
1758 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1760 return uris
1762 def getManyURIs(
1763 self,
1764 refs: Iterable[DatasetRef],
1765 predict: bool = False,
1766 allow_missing: bool = False,
1767 ) -> Dict[DatasetRef, DatasetRefURIs]:
1768 # Docstring inherited
1770 uris: Dict[DatasetRef, DatasetRefURIs] = {}
1772 records = self._get_stored_records_associated_with_refs(refs)
1773 records_keys = records.keys()
1775 existing_refs = (ref for ref in refs if ref.id in records_keys)
1776 missing_refs = (ref for ref in refs if ref.id not in records_keys)
1778 for ref in missing_refs:
1780 # if this has never been written then we have to guess
1781 if not predict:
1782 if not allow_missing:
1783 raise FileNotFoundError("Dataset {} not in this datastore.".format(ref))
1784 else:
1785 uris[ref] = self._predict_URIs(ref)
1787 for ref in existing_refs:
1788 file_infos = records[ref.getCheckedId()]
1789 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1790 uris[ref] = self._locations_to_URI(ref, file_locations)
1792 return uris
1794 def _locations_to_URI(
1795 self,
1796 ref: DatasetRef,
1797 file_locations: Sequence[Tuple[Location, StoredFileInfo]],
1798 ) -> DatasetRefURIs:
1799 """Convert one or more file locations associated with a DatasetRef
1800 to a DatasetRefURIs.
1802 Parameters
1803 ----------
1804 ref : `DatasetRef`
1805 Reference to the dataset.
1806 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
1807 Each item in the sequence is the location of the dataset within the
1808 datastore and stored information about the file and its formatter.
1809 If there is only one item in the sequence then it is treated as the
1810 primary URI. If there is more than one item then they are treated
1811 as component URIs. If there are no items then an error is raised
1812 unless ``self.trustGetRequest`` is `True`.
1814 Returns
1815 -------
1816 uris : `DatasetRefURIs`
1817 Represents the primary URI or component URIs described by the
1818 inputs.
1820 Raises
1821 ------
1822 RuntimeError
1823 If no file locations are passed in and ``self.trustGetRequest`` is
1824 `False`.
1825 FileNotFoundError
1826 If a passed-in URI does not exist, and ``self.trustGetRequest``
1827 is `False`.
1828 RuntimeError
1829 If a passed-in `StoredFileInfo`'s ``component`` is `None` (this is
1830 unexpected).
1831 """
1833 guessing = False
1834 uris = DatasetRefURIs()
1836 if not file_locations:
1837 if not self.trustGetRequest: 1837 ↛ 1838line 1837 didn't jump to line 1838, because the condition on line 1837 was never true
1838 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1839 file_locations = self._get_expected_dataset_locations_info(ref)
1840 guessing = True
1842 if len(file_locations) == 1:
1843 # No disassembly so this is the primary URI
1844 uris.primaryURI = file_locations[0][0].uri
1845 if guessing and not uris.primaryURI.exists(): 1845 ↛ 1846line 1845 didn't jump to line 1846, because the condition on line 1845 was never true
1846 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1847 else:
1848 for location, file_info in file_locations:
1849 if file_info.component is None: 1849 ↛ 1850line 1849 didn't jump to line 1850, because the condition on line 1849 was never true
1850 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1851 if guessing and not location.uri.exists(): 1851 ↛ 1855line 1851 didn't jump to line 1855, because the condition on line 1851 was never true
1852 # If we are trusting then it is entirely possible for
1853 # some components to be missing. In that case we skip
1854 # to the next component.
1855 if self.trustGetRequest:
1856 continue
1857 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1858 uris.componentURIs[file_info.component] = location.uri
1860 return uris
1862 def retrieveArtifacts(
1863 self,
1864 refs: Iterable[DatasetRef],
1865 destination: ResourcePath,
1866 transfer: str = "auto",
1867 preserve_path: bool = True,
1868 overwrite: bool = False,
1869 ) -> List[ResourcePath]:
1870 """Retrieve the file artifacts associated with the supplied refs.
1872 Parameters
1873 ----------
1874 refs : iterable of `DatasetRef`
1875 The datasets for which file artifacts are to be retrieved.
1876 A single ref can result in multiple files. The refs must
1877 be resolved.
1878 destination : `lsst.resources.ResourcePath`
1879 Location to write the file artifacts.
1880 transfer : `str`, optional
1881 Method to use to transfer the artifacts. Must be one of the options
1882 supported by `lsst.resources.ResourcePath.transfer_from()`.
1883 "move" is not allowed.
1884 preserve_path : `bool`, optional
1885 If `True` the full path of the file artifact within the datastore
1886 is preserved. If `False` the final file component of the path
1887 is used.
1888 overwrite : `bool`, optional
1889 If `True` allow transfers to overwrite existing files at the
1890 destination.
1892 Returns
1893 -------
1894 targets : `list` of `lsst.resources.ResourcePath`
1895 URIs of file artifacts in destination location. Order is not
1896 preserved.
1897 """
1898 if not destination.isdir(): 1898 ↛ 1899line 1898 didn't jump to line 1899, because the condition on line 1898 was never true
1899 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1901 if transfer == "move":
1902 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1904 # Source -> Destination
1905 # This also helps filter out duplicate DatasetRef in the request
1906 # that will map to the same underlying file transfer.
1907 to_transfer: Dict[ResourcePath, ResourcePath] = {}
1909 for ref in refs:
1910 locations = self._get_dataset_locations_info(ref)
1911 for location, _ in locations:
1912 source_uri = location.uri
1913 target_path: ResourcePathExpression
1914 if preserve_path:
1915 target_path = location.pathInStore
1916 if target_path.isabs(): 1916 ↛ 1919line 1916 didn't jump to line 1919, because the condition on line 1916 was never true
1917 # This is an absolute path to an external file.
1918 # Use the full path.
1919 target_path = target_path.relativeToPathRoot
1920 else:
1921 target_path = source_uri.basename()
1922 target_uri = destination.join(target_path)
1923 to_transfer[source_uri] = target_uri
1925 # In theory can now parallelize the transfer
1926 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1927 for source_uri, target_uri in to_transfer.items():
1928 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1930 return list(to_transfer.values())
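# Illustrative usage sketch: copy the file artifacts for some refs into a
# local directory. The ``datastore`` and ``refs`` names and the destination
# path are assumed.
#
#     destination = ResourcePath("/tmp/artifact_export/", forceDirectory=True)
#     targets = datastore.retrieveArtifacts(refs, destination, transfer="copy")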
1932 def get(
1933 self,
1934 ref: DatasetRef,
1935 parameters: Optional[Mapping[str, Any]] = None,
1936 storageClass: Optional[Union[StorageClass, str]] = None,
1937 ) -> Any:
1938 """Load an InMemoryDataset from the store.
1940 Parameters
1941 ----------
1942 ref : `DatasetRef`
1943 Reference to the required Dataset.
1944 parameters : `dict`
1945 `StorageClass`-specific parameters that specify, for example,
1946 a slice of the dataset to be loaded.
1947 storageClass : `StorageClass` or `str`, optional
1948 The storage class to be used to override the Python type
1949 returned by this method. By default the returned type matches
1950 the dataset type definition for this dataset. Specifying a
1951 read `StorageClass` can force a different type to be returned.
1952 This type must be compatible with the original type.
1954 Returns
1955 -------
1956 inMemoryDataset : `object`
1957 Requested dataset or slice thereof as an InMemoryDataset.
1959 Raises
1960 ------
1961 FileNotFoundError
1962 Requested dataset can not be retrieved.
1963 TypeError
1964 Return value from formatter has unexpected type.
1965 ValueError
1966 Formatter failed to process the dataset.
1967 """
1968 # Supplied storage class for the component being read is either
1969 # from the ref itself or an override if we want to force
1970 # type conversion.
1971 if storageClass is not None:
1972 ref = ref.overrideStorageClass(storageClass)
1973 refStorageClass = ref.datasetType.storageClass
1975 allGetInfo = self._prepare_for_get(ref, parameters)
1976 refComponent = ref.datasetType.component()
1978 # Create mapping from component name to related info
1979 allComponents = {i.component: i for i in allGetInfo}
1981 # By definition the dataset is disassembled if we have more
1982 # than one record for it.
1983 isDisassembled = len(allGetInfo) > 1
1985 # Look for the special case where we are disassembled but the
1986 # component is a derived component that was not written during
1987 # disassembly. For this scenario we need to check that the
1988 # component requested is listed as a derived component for the
1989 # composite storage class
1990 isDisassembledReadOnlyComponent = False
1991 if isDisassembled and refComponent:
1992 # The composite storage class should be accessible through
1993 # the component dataset type
1994 compositeStorageClass = ref.datasetType.parentStorageClass
1996 # In the unlikely scenario where the composite storage
1997 # class is not known, we can only assume that this is a
1998 # normal component. If that assumption is wrong then the
1999 # branch below that reads a persisted component will fail
2000 # so there is no need to complain here.
2001 if compositeStorageClass is not None: 2001 ↛ 2004line 2001 didn't jump to line 2004, because the condition on line 2001 was never false
2002 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2004 if isDisassembled and not refComponent:
2005 # This was a disassembled dataset spread over multiple files
2006 # and we need to put them all back together again.
2007 # Read into memory and then assemble
2009 # Check that the supplied parameters are suitable for the type read
2010 refStorageClass.validateParameters(parameters)
2012 # We want to keep track of all the parameters that were not used
2013 # by formatters. We assume that if any of the component formatters
2014 # uses a parameter, we do not need to apply it again in the
2015 # assembler.
2016 usedParams = set()
2018 components: Dict[str, Any] = {}
2019 for getInfo in allGetInfo:
2020 # assemblerParams are parameters not understood by the
2021 # associated formatter.
2022 usedParams.update(set(getInfo.formatterParams))
2024 component = getInfo.component
2026 if component is None: 2026 ↛ 2027line 2026 didn't jump to line 2027, because the condition on line 2026 was never true
2027 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2029 # We do not want the formatter to think it's reading
2030 # a component though because it is really reading a
2031 # standalone dataset -- always tell reader it is not a
2032 # component.
2033 components[component] = self._read_artifact_into_memory(
2034 getInfo, ref.makeComponentRef(component), isComponent=False
2035 )
2037 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2039 # Any unused parameters will have to be passed to the assembler
2040 if parameters:
2041 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2042 else:
2043 unusedParams = {}
2045 # Process parameters
2046 return ref.datasetType.storageClass.delegate().handleParameters(
2047 inMemoryDataset, parameters=unusedParams
2048 )
2050 elif isDisassembledReadOnlyComponent:
2052 compositeStorageClass = ref.datasetType.parentStorageClass
2053 if compositeStorageClass is None: 2053 ↛ 2054line 2053 didn't jump to line 2054, because the condition on line 2053 was never true
2054 raise RuntimeError(
2055 f"Unable to retrieve derived component '{refComponent}' since"
2056 "no composite storage class is available."
2057 )
2059 if refComponent is None: 2059 ↛ 2061line 2059 didn't jump to line 2061, because the condition on line 2059 was never true
2060 # Mainly for mypy
2061 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2063 # Assume that every derived component can be calculated by
2064 # forwarding the request to a single read/write component.
2065 # Rather than guessing which rw component is the right one by
2066 # scanning each for a derived component of the same name,
2067 # we ask the storage class delegate directly which one is best to
2068 # use.
2069 compositeDelegate = compositeStorageClass.delegate()
2070 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2071 refComponent, set(allComponents)
2072 )
2074 # Select the relevant component
2075 rwInfo = allComponents[forwardedComponent]
2077 # For now assume that read parameters are validated against
2078 # the real component and not the requested component
2079 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2080 forwardedStorageClass.validateParameters(parameters)
2082 # The reference to use for the caching must refer to the forwarded
2083 # component and not the derived component.
2084 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2086 # Unfortunately the FileDescriptor inside the formatter will have
2087 # the wrong write storage class so we need to create a new one
2088 # given the immutability constraint.
2089 writeStorageClass = rwInfo.info.storageClass
2091 # We may need to put some thought into parameters for read
2092 # components but for now forward them on as is
2093 readFormatter = type(rwInfo.formatter)(
2094 FileDescriptor(
2095 rwInfo.location,
2096 readStorageClass=refStorageClass,
2097 storageClass=writeStorageClass,
2098 parameters=parameters,
2099 ),
2100 ref.dataId,
2101 )
2103 # The assembler can not receive any parameter requests for a
2104 # derived component at this time since the assembler will
2105 # see the storage class of the derived component and those
2106 # parameters will have to be handled by the formatter on the
2107 # forwarded storage class.
2108 assemblerParams: Dict[str, Any] = {}
2110 # Need to create a new info that specifies the derived
2111 # component and associated storage class
2112 readInfo = DatastoreFileGetInformation(
2113 rwInfo.location,
2114 readFormatter,
2115 rwInfo.info,
2116 assemblerParams,
2117 {},
2118 refComponent,
2119 refStorageClass,
2120 )
2122 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2124 else:
2125 # Single file request or component from that composite file
2126 for lookup in (refComponent, None): 2126 ↛ 2131line 2126 didn't jump to line 2131, because the loop on line 2126 didn't complete
2127 if lookup in allComponents: 2127 ↛ 2126line 2127 didn't jump to line 2126, because the condition on line 2127 was never false
2128 getInfo = allComponents[lookup]
2129 break
2130 else:
2131 raise FileNotFoundError(
2132 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2133 )
2135 # Do not need the component itself if already disassembled
2136 if isDisassembled:
2137 isComponent = False
2138 else:
2139 isComponent = getInfo.component is not None
2141 # For a component read of a composite we want the cache to
2142 # be looking at the composite ref itself.
2143 cache_ref = ref.makeCompositeRef() if isComponent else ref
2145 # For a disassembled component we can validate parameters against
2146 # the component storage class directly
2147 if isDisassembled:
2148 refStorageClass.validateParameters(parameters)
2149 else:
2150 # For an assembled composite this could be a derived
2151 # component derived from a real component. The validity
2152 # of the parameters is not clear. For now validate against
2153 # the composite storage class
2154 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2156 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
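# Illustrative usage sketch: a plain read and a read with a storage class
# override. The ``datastore`` and ``ref`` names and the "AstropyTable"
# storage class are assumed.
#
#     in_memory = datastore.get(ref)
#     converted = datastore.get(ref, storageClass="AstropyTable")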
2158 @transactional
2159 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2160 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2162 Parameters
2163 ----------
2164 inMemoryDataset : `object`
2165 The dataset to store.
2166 ref : `DatasetRef`
2167 Reference to the associated Dataset.
2169 Raises
2170 ------
2171 TypeError
2172 Supplied object and storage class are inconsistent.
2173 DatasetTypeNotSupportedError
2174 The associated `DatasetType` is not handled by this datastore.
2176 Notes
2177 -----
2178 If the datastore is configured to reject certain dataset types it
2179 is possible that the put will fail and raise a
2180 `DatasetTypeNotSupportedError`. The main use case for this is to
2181 allow `ChainedDatastore` to put to multiple datastores without
2182 requiring that every datastore accepts the dataset.
2183 """
2185 doDisassembly = self.composites.shouldBeDisassembled(ref)
2186 # doDisassembly = True
2188 artifacts = []
2189 if doDisassembly:
2190 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2191 if components is None: 2191 ↛ 2192line 2191 didn't jump to line 2192, because the condition on line 2191 was never true
2192 raise RuntimeError(
2193 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2194 f"with storage class {ref.datasetType.storageClass.name} "
2195 "is configured to be disassembled, but cannot be."
2196 )
2197 for component, componentInfo in components.items():
2198 # Don't recurse because we want to take advantage of
2199 # bulk insert -- need a new DatasetRef that refers to the
2200 # same dataset_id but has the component DatasetType.
2201 # DatasetType does not refer to the types of components,
2202 # so we construct one ourselves.
2203 compRef = ref.makeComponentRef(component)
2204 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2205 artifacts.append((compRef, storedInfo))
2206 else:
2207 # Write the entire thing out
2208 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2209 artifacts.append((ref, storedInfo))
2211 self._register_datasets(artifacts)
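# Illustrative usage sketch: a put/get round trip. The ``datastore``,
# ``in_memory_dataset`` and ``ref`` names are assumed.
#
#     datastore.put(in_memory_dataset, ref)
#     assert datastore.exists(ref)
#     retrieved = datastore.get(ref)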
2213 @transactional
2214 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
2215 # At this point can safely remove these datasets from the cache
2216 # to avoid confusion later on. If they are not trashed later
2217 # the cache will simply be refilled.
2218 self.cacheManager.remove_from_cache(ref)
2220 # If we are in trust mode there will be nothing to move to
2221 # the trash table and we will have to try to delete the file
2222 # immediately.
2223 if self.trustGetRequest:
2224 # Try to keep the logic below for a single file trash.
2225 if isinstance(ref, DatasetRef):
2226 refs = {ref}
2227 else:
2228 # Will recreate ref at the end of this branch.
2229 refs = set(ref)
2231 # Determine which datasets are known to datastore directly.
2232 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
2233 existing_ids = self._get_stored_records_associated_with_refs(refs)
2234 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2236 missing = refs - existing_refs
2237 if missing:
2238 # Do an explicit existence check on these refs.
2239 # We only care about the artifacts at this point and not
2240 # the dataset existence.
2241 artifact_existence: Dict[ResourcePath, bool] = {}
2242 _ = self.mexists(missing, artifact_existence)
2243 uris = [uri for uri, exists in artifact_existence.items() if exists]
2245 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2246 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2247 for uri in uris:
2248 try:
2249 uri.remove()
2250 except Exception as e:
2251 if ignore_errors:
2252 log.debug("Artifact %s could not be removed: %s", uri, e)
2253 continue
2254 raise
2256 # There is no point asking the code below to remove refs we
2257 # know are missing so update it with the list of existing
2258 # records. Try to retain one vs many logic.
2259 if not existing_refs:
2260 # Nothing more to do since none of the datasets were
2261 # known to the datastore record table.
2262 return
2263 ref = list(existing_refs)
2264 if len(ref) == 1:
2265 ref = ref[0]
2267 # Get file metadata and internal metadata
2268 if not isinstance(ref, DatasetRef):
2269 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2270 # Assumed to be an iterable of refs so bulk mode enabled.
2271 try:
2272 self.bridge.moveToTrash(ref, transaction=self._transaction)
2273 except Exception as e:
2274 if ignore_errors:
2275 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2276 else:
2277 raise
2278 return
2280 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2282 fileLocations = self._get_dataset_locations_info(ref)
2284 if not fileLocations:
2285 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2286 if ignore_errors:
2287 log.warning(err_msg)
2288 return
2289 else:
2290 raise FileNotFoundError(err_msg)
2292 for location, storedFileInfo in fileLocations:
2293 if not self._artifact_exists(location): 2293 ↛ 2294line 2293 didn't jump to line 2294
2294 err_msg = (
2295 f"Dataset is known to datastore {self.name} but "
2296 f"associated artifact ({location.uri}) is missing"
2297 )
2298 if ignore_errors:
2299 log.warning(err_msg)
2300 return
2301 else:
2302 raise FileNotFoundError(err_msg)
2304 # Mark dataset as trashed
2305 try:
2306 self.bridge.moveToTrash([ref], transaction=self._transaction)
2307 except Exception as e:
2308 if ignore_errors:
2309 log.warning(
2310 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2311 "but encountered an error: %s",
2312 ref,
2313 self.name,
2314 e,
2315 )
2316 pass
2317 else:
2318 raise
2320 @transactional
2321 def emptyTrash(self, ignore_errors: bool = True) -> None:
2322 """Remove all datasets from the trash.
2324 Parameters
2325 ----------
2326 ignore_errors : `bool`
2327 If `True` return without error even if something went wrong.
2328 Problems could occur if another process is simultaneously trying
2329 to delete.
2330 """
2331 log.debug("Emptying trash in datastore %s", self.name)
2333 # Context manager will empty trash iff we finish it without raising.
2334 # It will also automatically delete the relevant rows from the
2335 # trash table and the records table.
2336 with self.bridge.emptyTrash(
2337 self._table, record_class=StoredFileInfo, record_column="path"
2338 ) as trash_data:
2339 # Removing the artifacts themselves requires that the files are
2340 # not also associated with refs that are not to be trashed.
2341 # Therefore need to do a query with the file paths themselves
2342 # and return all the refs associated with them. Can only delete
2343 # a file if the refs to be trashed are the only refs associated
2344 # with the file.
2345 # This requires multiple copies of the trashed items
2346 trashed, artifacts_to_keep = trash_data
2348 if artifacts_to_keep is None:
2349 # The bridge is not helping us so have to work it out
2350 # ourselves. This is not going to be as efficient.
2351 trashed = list(trashed)
2353 # The instance check is for mypy since up to this point it
2354 # does not know the type of info.
2355 path_map = self._refs_associated_with_artifacts(
2356 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2357 )
2359 for ref, info in trashed:
2361 # Mypy needs to know this is not the base class
2362 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2364 # Check for mypy
2365 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2367 path_map[info.path].remove(ref.id)
2368 if not path_map[info.path]: 2368 ↛ 2359line 2368 didn't jump to line 2359, because the condition on line 2368 was never false
2369 del path_map[info.path]
2371 artifacts_to_keep = set(path_map)
2373 for ref, info in trashed:
2375 # Should not happen for this implementation but need
2376 # to keep mypy happy.
2377 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2379 # Mypy needs to know this is not the base class
2380 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2382 # Check for mypy
2383 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2385 if info.path in artifacts_to_keep:
2386 # This is a multi-dataset artifact and we are not
2387 # removing all associated refs.
2388 continue
2390 # Only trashed refs still known to datastore will be returned.
2391 location = info.file_location(self.locationFactory)
2393 # Point of no return for this artifact
2394 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2395 try:
2396 self._delete_artifact(location)
2397 except FileNotFoundError:
2398 # If the file itself has been deleted there is nothing
2399 # we can do about it. It is possible that trash has
2400 # been run in parallel in another process or someone
2401 # decided to delete the file. It is unlikely to come
2402 # back and so we should still continue with the removal
2403 # of the entry from the trash table. It is also possible
2404 # we removed it in a previous iteration if it was
2405 # a multi-dataset artifact. The delete artifact method
2406 # will log a debug message in this scenario.
2407 # Distinguishing a file that was missing before the trash
2408 # started from one already removed earlier in this trash
2409 # operation is not worth the potential memory cost of
2410 # tracking it.
2411 pass
2412 except Exception as e:
2413 if ignore_errors:
2414 # Use a debug message here even though it's not
2415 # a good situation. In some cases this can be
2416 # caused by a race between user A and user B
2417 # and neither of them has permissions for the
2418 # other's files. Butler does not know about users
2419 # and trash has no idea what collections these
2420 # files were in (without guessing from a path).
2421 log.debug(
2422 "Encountered error removing artifact %s from datastore %s: %s",
2423 location.uri,
2424 self.name,
2425 e,
2426 )
2427 else:
2428 raise
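# Illustrative usage sketch: deletion is two-phase; ``trash`` only marks the
# datasets and ``emptyTrash`` removes the artifacts. The ``datastore`` and
# ``refs`` names are assumed.
#
#     datastore.trash(refs)
#     datastore.emptyTrash()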
2430 @transactional
2431 def transfer_from(
2432 self,
2433 source_datastore: Datastore,
2434 refs: Iterable[DatasetRef],
2435 local_refs: Optional[Iterable[DatasetRef]] = None,
2436 transfer: str = "auto",
2437 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
2438 ) -> None:
2439 # Docstring inherited
2440 if type(self) is not type(source_datastore):
2441 raise TypeError(
2442 f"Datastore mismatch between this datastore ({type(self)}) and the "
2443 f"source datastore ({type(source_datastore)})."
2444 )
2446 # Be explicit for mypy
2447 if not isinstance(source_datastore, FileDatastore): 2447 ↛ 2448line 2447 didn't jump to line 2448, because the condition on line 2447 was never true
2448 raise TypeError(
2449 "Can only transfer to a FileDatastore from another FileDatastore, not"
2450 f" {type(source_datastore)}"
2451 )
2453 # Stop early if "direct" transfer mode is requested. That would
2454 # require that the URI inside the source datastore should be stored
2455 # directly in the target datastore, which seems unlikely to be useful
2456 # since at any moment the source datastore could delete the file.
2457 if transfer in ("direct", "split"):
2458 raise ValueError(
2459 f"Can not transfer from a source datastore using {transfer} mode since"
2460 " those files are controlled by the other datastore."
2461 )
2463 # Empty existence lookup if none given.
2464 if artifact_existence is None:
2465 artifact_existence = {}
2467 # We will go through the list multiple times so must convert
2468 # generators to lists.
2469 refs = list(refs)
2471 if local_refs is None:
2472 local_refs = refs
2473 else:
2474 local_refs = list(local_refs)
2476 # In order to handle disassembled composites the code works
2477 # at the records level since it can assume that internal APIs
2478 # can be used.
2479 # - If the record already exists in the destination this is assumed
2480 # to be okay.
2481 # - If there is no record but the source and destination URIs are
2482 # identical no transfer is done but the record is added.
2483 # - If the source record refers to an absolute URI currently assume
2484 # that that URI should remain absolute and will be visible to the
2485 # destination butler. May need to have a flag to indicate whether
2486 # the dataset should be transferred. This will only happen if
2487 # the detached Butler has had a local ingest.
2489 # What we really want is all the records in the source datastore
2490 # associated with these refs. Or derived ones if they don't exist
2491 # in the source.
2492 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2494 # The source dataset_ids are the keys in these records
2495 source_ids = set(source_records)
2496 log.debug("Number of datastore records found in source: %d", len(source_ids))
2498 # The not None check is to appease mypy
2499 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2500 missing_ids = requested_ids - source_ids
2502 # Missing IDs can be okay if that datastore has allowed
2503 # gets based on file existence. Should we transfer what we can
2504 # or complain about it and warn?
2505 if missing_ids and not source_datastore.trustGetRequest: 2505 ↛ 2506line 2505 didn't jump to line 2506, because the condition on line 2505 was never true
2506 raise ValueError(
2507 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2508 )
2510 # Need to map these missing IDs to a DatasetRef so we can guess
2511 # the details.
2512 if missing_ids:
2513 log.info(
2514 "Number of expected datasets missing from source datastore records: %d out of %d",
2515 len(missing_ids),
2516 len(requested_ids),
2517 )
2518 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2520 # This should be chunked in case we end up having to check
2521 # the file store since we need some log output to show
2522 # progress.
2523 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2524 records = {}
2525 for missing in missing_ids_chunk:
2526 # Ask the source datastore where the missing artifacts
2527 # should be. An execution butler might not know about the
2528 # artifacts even if they are there.
2529 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2530 records[missing] = [info for _, info in expected]
2532 # Call the mexists helper method in case we have not already
2533 # checked these artifacts such that artifact_existence is
2534 # empty. This allows us to benefit from parallelism.
2535 # datastore.mexists() itself does not give us access to the
2536 # derived datastore record.
2537 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2538 ref_exists = source_datastore._process_mexists_records(
2539 id_to_ref, records, False, artifact_existence=artifact_existence
2540 )
2542 # Now go through the records and propagate the ones that exist.
2543 location_factory = source_datastore.locationFactory
2544 for missing, record_list in records.items():
2545 # Skip completely if the ref does not exist.
2546 ref = id_to_ref[missing]
2547 if not ref_exists[ref]:
2548 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2549 continue
2550 # Check for file artifact to decide which parts of a
2551 # disassembled composite do exist. If there is only a
2552 # single record we don't even need to look because it can't
2553 # be a composite and must exist.
2554 if len(record_list) == 1:
2555 dataset_records = record_list
2556 else:
2557 dataset_records = [
2558 record
2559 for record in record_list
2560 if artifact_existence[record.file_location(location_factory).uri]
2561 ]
2562 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2564 # Rely on source_records being a defaultdict.
2565 source_records[missing].extend(dataset_records)
2567 # See if we already have these records
2568 target_records = self._get_stored_records_associated_with_refs(local_refs)
2570 # The artifacts to register
2571 artifacts = []
2573 # Refs that already exist
2574 already_present = []
2576 # Now can transfer the artifacts
2577 for source_ref, target_ref in zip(refs, local_refs):
2578 if target_ref.id in target_records:
2579 # Already have an artifact for this.
2580 already_present.append(target_ref)
2581 continue
2583 # mypy needs to know these are always resolved refs
2584 for info in source_records[source_ref.getCheckedId()]:
2585 source_location = info.file_location(source_datastore.locationFactory)
2586 target_location = info.file_location(self.locationFactory)
2587 if source_location == target_location: 2587 ↛ 2591line 2587 didn't jump to line 2591, because the condition on line 2587 was never true
2588 # Either the dataset is already in the target datastore
2589 # (which is how execution butler currently runs) or
2590 # it is an absolute URI.
2591 if source_location.pathInStore.isabs():
2592 # Just because we can see the artifact when running
2593 # the transfer doesn't mean it will be generally
2594 # accessible to a user of this butler. For now warn
2595 # but assume it will be accessible.
2596 log.warning(
2597 "Transfer request for an outside-datastore artifact has been found at %s",
2598 source_location,
2599 )
2600 else:
2601 # Need to transfer it to the new location.
2602 # Assume we should always overwrite. If the artifact
2603 # is there this might indicate that a previous transfer
2604 # was interrupted but was not able to be rolled back
2605 # completely (e.g. pre-emption) so follow Datastore default
2606 # and overwrite.
2607 target_location.uri.transfer_from(
2608 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2609 )
2611 artifacts.append((target_ref, info))
2613 self._register_datasets(artifacts)
2615 if already_present:
2616 n_skipped = len(already_present)
2617 log.info(
2618 "Skipped transfer of %d dataset%s already present in datastore",
2619 n_skipped,
2620 "" if n_skipped == 1 else "s",
2621 )
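# Illustrative usage sketch: transferring artifacts and records from another
# FileDatastore of the same type. The ``source_datastore``, ``target`` and
# ``refs`` names are assumed.
#
#     target.transfer_from(source_datastore, refs, transfer="copy")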
2623 @transactional
2624 def forget(self, refs: Iterable[DatasetRef]) -> None:
2625 # Docstring inherited.
2626 refs = list(refs)
2627 self.bridge.forget(refs)
2628 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2630 def validateConfiguration(
2631 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
2632 ) -> None:
2633 """Validate some of the configuration for this datastore.
2635 Parameters
2636 ----------
2637 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2638 Entities to test against this configuration. Can be differing
2639 types.
2640 logFailures : `bool`, optional
2641 If `True`, output a log message for every validation error
2642 detected.
2644 Raises
2645 ------
2646 DatastoreValidationError
2647 Raised if there is a validation problem with a configuration.
2648 All the problems are reported in a single exception.
2650 Notes
2651 -----
2652 This method checks that all the supplied entities have valid file
2653 templates and also have formatters defined.
2654 """
2656 templateFailed = None
2657 try:
2658 self.templates.validateTemplates(entities, logFailures=logFailures)
2659 except FileTemplateValidationError as e:
2660 templateFailed = str(e)
2662 formatterFailed = []
2663 for entity in entities:
2664 try:
2665 self.formatterFactory.getFormatterClass(entity)
2666 except KeyError as e:
2667 formatterFailed.append(str(e))
2668 if logFailures: 2668 ↛ 2663line 2668 didn't jump to line 2663, because the condition on line 2668 was never false
2669 log.critical("Formatter failure: %s", e)
2671 if templateFailed or formatterFailed:
2672 messages = []
2673 if templateFailed: 2673 ↛ 2674line 2673 didn't jump to line 2674, because the condition on line 2673 was never true
2674 messages.append(templateFailed)
2675 if formatterFailed: 2675 ↛ 2677line 2675 didn't jump to line 2677, because the condition on line 2675 was never false
2676 messages.append(",".join(formatterFailed))
2677 msg = ";\n".join(messages)
2678 raise DatastoreValidationError(msg)
2680 def getLookupKeys(self) -> Set[LookupKey]:
2681 # Docstring is inherited from base class
2682 return (
2683 self.templates.getLookupKeys()
2684 | self.formatterFactory.getLookupKeys()
2685 | self.constraints.getLookupKeys()
2686 )
2688 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2689 # Docstring is inherited from base class
2690 # The key can be valid in either formatters or templates so we can
2691 # only check the template if it exists
2692 if lookupKey in self.templates:
2693 try:
2694 self.templates[lookupKey].validateTemplate(entity)
2695 except FileTemplateValidationError as e:
2696 raise DatastoreValidationError(e) from e
2698 def export(
2699 self,
2700 refs: Iterable[DatasetRef],
2701 *,
2702 directory: Optional[ResourcePathExpression] = None,
2703 transfer: Optional[str] = "auto",
2704 ) -> Iterable[FileDataset]:
2705 # Docstring inherited from Datastore.export.
2706 if transfer == "auto" and directory is None:
2707 transfer = None
2709 if transfer is not None and directory is None:
2710 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2712 if transfer == "move":
2713 raise TypeError("Can not export by moving files out of datastore.")
2714 elif transfer == "direct": 2714 ↛ 2718line 2714 didn't jump to line 2718, because the condition on line 2714 was never true
2715 # For an export, treat this as equivalent to None. We do not
2716 # want an import to risk using absolute URIs to datasets owned
2717 # by another datastore.
2718 log.info("Treating 'direct' transfer mode as in-place export.")
2719 transfer = None
2721 # Force the directory to be a URI object
2722 directoryUri: Optional[ResourcePath] = None
2723 if directory is not None:
2724 directoryUri = ResourcePath(directory, forceDirectory=True)
2726 if transfer is not None and directoryUri is not None:
2727 # mypy needs the second test
2728 if not directoryUri.exists(): 2728 ↛ 2729line 2728 didn't jump to line 2729, because the condition on line 2728 was never true
2729 raise FileNotFoundError(f"Export location {directory} does not exist")
2731 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2732 for ref in progress.wrap(refs, "Exporting dataset files"):
2733 fileLocations = self._get_dataset_locations_info(ref)
2734 if not fileLocations:
2735 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2736 # For now we can not export disassembled datasets
2737 if len(fileLocations) > 1:
2738 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2739 location, storedFileInfo = fileLocations[0]
2741 pathInStore = location.pathInStore.path
2742 if transfer is None:
2743 # TODO: do we also need to return the readStorageClass somehow?
2744 # We will use the path in store directly. If this is an
2745 # absolute URI, preserve it.
2746 if location.pathInStore.isabs(): 2746 ↛ 2747line 2746 didn't jump to line 2747, because the condition on line 2746 was never true
2747 pathInStore = str(location.uri)
2748 elif transfer == "direct": 2748 ↛ 2750line 2748 didn't jump to line 2750, because the condition on line 2748 was never true
2749 # Use full URIs to the remote store in the export
2750 pathInStore = str(location.uri)
2751 else:
2752 # mypy needs help
2753 assert directoryUri is not None, "directoryUri must be defined to get here"
2754 storeUri = ResourcePath(location.uri)
2756 # if the datastore has an absolute URI to a resource, we
2757 # have two options:
2758 # 1. Keep the absolute URI in the exported YAML
2759 # 2. Allocate a new name in the local datastore and transfer
2760 # it.
2761 # For now go with option 2
2762 if location.pathInStore.isabs(): 2762 ↛ 2763line 2762 didn't jump to line 2763, because the condition on line 2762 was never true
2763 template = self.templates.getTemplate(ref)
2764 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2765 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2767 exportUri = directoryUri.join(pathInStore)
2768 exportUri.transfer_from(storeUri, transfer=transfer)
2770 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
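# Illustrative usage sketch: exporting dataset files to a directory; ``export``
# yields `FileDataset` entries describing where each file was written. The
# ``datastore`` and ``refs`` names and the directory are assumed.
#
#     file_datasets = list(datastore.export(refs, directory="exports", transfer="copy"))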
2772 @staticmethod
2773 def computeChecksum(
2774 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192
2775 ) -> Optional[str]:
2776 """Compute the checksum of the supplied file.
2778 Parameters
2779 ----------
2780 uri : `lsst.resources.ResourcePath`
2781 Name of resource to calculate checksum from.
2782 algorithm : `str`, optional
2783 Name of algorithm to use. Must be one of the algorithms supported
2784 by :py:mod:`hashlib`.
2785 block_size : `int`
2786 Number of bytes to read from file at one time.
2788 Returns
2789 -------
2790 hexdigest : `str`
2791 Hex digest of the file.
2793 Notes
2794 -----
2795 Currently returns None if the URI is for a remote resource.
2796 """
2797 if algorithm not in hashlib.algorithms_guaranteed: 2797 ↛ 2798line 2797 didn't jump to line 2798, because the condition on line 2797 was never true
2798 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2800 if not uri.isLocal: 2800 ↛ 2801line 2800 didn't jump to line 2801, because the condition on line 2800 was never true
2801 return None
2803 hasher = hashlib.new(algorithm)
2805 with uri.as_local() as local_uri:
2806 with open(local_uri.ospath, "rb") as f:
2807 for chunk in iter(lambda: f.read(block_size), b""):
2808 hasher.update(chunk)
2810 return hasher.hexdigest()
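# Illustrative usage sketch: checksumming a local file artifact; the file path
# is assumed.
#
#     digest = FileDatastore.computeChecksum(ResourcePath("/data/example.fits"))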
2812 def needs_expanded_data_ids(
2813 self,
2814 transfer: Optional[str],
2815 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2816 ) -> bool:
2817 # Docstring inherited.
2818 # This _could_ also use entity to inspect whether the filename template
2819 # involves placeholders other than the required dimensions for its
2820 # dataset type, but that's not necessary for correctness; it just
2821 # enables more optimizations (perhaps only in theory).
2822 return transfer not in ("direct", None)
2824 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2825 # Docstring inherited from the base class.
2826 record_data = data.get(self.name)
2827 if not record_data: 2827 ↛ 2828line 2827 didn't jump to line 2828, because the condition on line 2827 was never true
2828 return
2830 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys())
2832 # TODO: Verify that there are no unexpected table names in the dict?
2833 unpacked_records = []
2834 for dataset_data in record_data.records.values():
2835 records = dataset_data.get(self._table.name)
2836 if records: 2836 ↛ 2834line 2836 didn't jump to line 2834, because the condition on line 2836 was never false
2837 for info in records:
2838 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2839 unpacked_records.append(info.to_record())
2840 if unpacked_records:
2841 self._table.insert(*unpacked_records)
2843 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2844 # Docstring inherited from the base class.
2845 exported_refs = list(self._bridge.check(refs))
2846 ids = {ref.getCheckedId() for ref in exported_refs}
2847 records: defaultdict[DatasetId, defaultdict[str, List[StoredDatastoreItemInfo]]] = defaultdict(
2848 lambda: defaultdict(list), {id: defaultdict(list) for id in ids}
2849 )
2850 for row in self._table.fetch(dataset_id=ids):
2851 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2852 records[info.dataset_id][self._table.name].append(info)
2854 record_data = DatastoreRecordData(records=records)
2855 return {self.name: record_data}
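# Illustrative usage sketch: datastore records exported from one repository
# can be imported into another datastore. Note that ``import_records`` looks
# up the records by datastore name (``data.get(self.name)``), so the two
# datastores must share the same configured name. The ``source`` and
# ``target`` names are assumed.
#
#     record_data = source.export_records(refs)
#     target.import_records(record_data)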