Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 85%
928 statements
coverage.py v6.5.0, created at 2023-10-26 15:13 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 Any,
34 ClassVar,
35 Dict,
36 Iterable,
37 List,
38 Mapping,
39 Optional,
40 Sequence,
41 Set,
42 Tuple,
43 Type,
44 Union,
45)
47from lsst.daf.butler import (
48 CompositesMap,
49 Config,
50 DatasetId,
51 DatasetRef,
52 DatasetRefURIs,
53 DatasetType,
54 DatasetTypeNotSupportedError,
55 Datastore,
56 DatastoreCacheManager,
57 DatastoreConfig,
58 DatastoreDisabledCacheManager,
59 DatastoreRecordData,
60 DatastoreValidationError,
61 FileDataset,
62 FileDescriptor,
63 FileTemplates,
64 FileTemplateValidationError,
65 Formatter,
66 FormatterFactory,
67 Location,
68 LocationFactory,
69 Progress,
70 StorageClass,
71 StoredDatastoreItemInfo,
72 StoredFileInfo,
73 ddl,
74)
75from lsst.daf.butler.core.repoRelocation import replaceRoot
76from lsst.daf.butler.core.utils import transactional
77from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
78from lsst.resources import ResourcePath, ResourcePathExpression
79from lsst.utils.introspection import get_class_of, get_instance_of
80from lsst.utils.iteration import chunk_iterable
82# For VERBOSE logging usage.
83from lsst.utils.logging import VERBOSE, getLogger
84from lsst.utils.timer import time_this
85from sqlalchemy import BigInteger, String
87from ..registry.interfaces import FakeDatasetRef
88from .genericDatastore import GenericBaseDatastore
90if TYPE_CHECKING: 90 ↛ 91 (condition on line 90 was never true)
91 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
92 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
94log = getLogger(__name__)
97class _IngestPrepData(Datastore.IngestPrepData):
98 """Helper class for FileDatastore ingest implementation.
100 Parameters
101 ----------
102 datasets : `list` of `FileDataset`
103 Files to be ingested by this datastore.
104 """
106 def __init__(self, datasets: List[FileDataset]):
107 super().__init__(ref for dataset in datasets for ref in dataset.refs)
108 self.datasets = datasets
111@dataclass(frozen=True)
112class DatastoreFileGetInformation:
113 """Collection of useful parameters needed to retrieve a file from
114 a Datastore.
115 """
117 location: Location
118 """The location from which to read the dataset."""
120 formatter: Formatter
121 """The `Formatter` to use to deserialize the dataset."""
123 info: StoredFileInfo
124 """Stored information about this file and its formatter."""
126 assemblerParams: Mapping[str, Any]
127 """Parameters to use for post-processing the retrieved dataset."""
129 formatterParams: Mapping[str, Any]
130 """Parameters that were understood by the associated formatter."""
132 component: Optional[str]
133 """The component to be retrieved (can be `None`)."""
135 readStorageClass: StorageClass
136 """The `StorageClass` of the dataset being read."""
139class FileDatastore(GenericBaseDatastore):
140 """Generic Datastore for file-based implementations.
142 Should always be sub-classed since key abstract methods are missing.
144 Parameters
145 ----------
146 config : `DatastoreConfig` or `str`
147 Configuration as either a `Config` object or URI to file.
148 bridgeManager : `DatastoreRegistryBridgeManager`
149 Object that manages the interface between `Registry` and datastores.
150 butlerRoot : `str`, optional
151 New datastore root to use to override the configuration value.
153 Raises
154 ------
155 ValueError
156 If root location does not exist and ``create`` is `False` in the
157 configuration.
158 """
160 defaultConfigFile: ClassVar[Optional[str]] = None
161 """Path to configuration defaults. Accessed within the ``config`` resource
162 or relative to a search path. Can be None if no defaults specified.
163 """
165 root: ResourcePath
166 """Root directory URI of this `Datastore`."""
168 locationFactory: LocationFactory
169 """Factory for creating locations relative to the datastore root."""
171 formatterFactory: FormatterFactory
172 """Factory for creating instances of formatters."""
174 templates: FileTemplates
175 """File templates that can be used by this `Datastore`."""
177 composites: CompositesMap
178 """Determines whether a dataset should be disassembled on put."""
180 defaultConfigFile = "datastores/fileDatastore.yaml"
181 """Path to configuration defaults. Accessed within the ``config`` resource
182 or relative to a search path. Can be None if no defaults specified.
183 """
185 @classmethod
186 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
187 """Set any filesystem-dependent config options for this Datastore to
188 be appropriate for a new empty repository with the given root.
190 Parameters
191 ----------
192 root : `str`
193 URI to the root of the data repository.
194 config : `Config`
195 A `Config` to update. Only the subset understood by
196 this component will be updated. Will not expand
197 defaults.
198 full : `Config`
199 A complete config with all defaults expanded that can be
200 converted to a `DatastoreConfig`. Read-only and will not be
201 modified by this method.
202 Repository-specific options that should not be obtained
203 from defaults when Butler instances are constructed
204 should be copied from ``full`` to ``config``.
205 overwrite : `bool`, optional
206 If `False`, do not modify a value in ``config`` if the value
207 already exists. Default is always to overwrite with the provided
208 ``root``.
210 Notes
211 -----
212 If a keyword is explicitly defined in the supplied ``config`` it
213 will not be overridden by this method if ``overwrite`` is `False`.
214 This allows explicit values set in external configs to be retained.
215 """
216 Config.updateParameters(
217 DatastoreConfig,
218 config,
219 full,
220 toUpdate={"root": root},
221 toCopy=("cls", ("records", "table")),
222 overwrite=overwrite,
223 )
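# Illustrative sketch (not part of this class): a plain-dict analogue of the
# update performed above, assuming the configs behave like nested mappings
# scoped to the datastore section. The real ``Config.updateParameters`` also
# handles dotted keys, hierarchy and validation.
def set_config_root_sketch(root: str, config: dict, full: dict, overwrite: bool = True) -> None:
    # ``toUpdate``: force the new repository root unless told to preserve
    # an explicitly configured value.
    if overwrite or "root" not in config:
        config["root"] = root
    # ``toCopy``: pin repository-specific values from the expanded defaults.
    if "cls" in full and (overwrite or "cls" not in config):
        config["cls"] = full["cls"]
    table = full.get("records", {}).get("table")
    if table is not None and (overwrite or "table" not in config.get("records", {})):
        config.setdefault("records", {})["table"] = table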
225 @classmethod
226 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
227 return ddl.TableSpec(
228 fields=[
229 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
230 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
231 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
232 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
233 # Use empty string to indicate no component
234 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
235 # TODO: should checksum be Base64Bytes instead?
236 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
237 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
238 ],
239 unique=frozenset(),
240 indexes=[ddl.IndexSpec("path")],
241 )
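# Illustrative sketch (not part of this class): a rough SQLAlchemy Core
# equivalent of the spec built above. The real table is created by the
# registry's opaque-table manager from the ``ddl.TableSpec``; the table name
# and the ``BigInteger`` dataset-id column type here are assumptions made
# only for the example.
from sqlalchemy import BigInteger, Column, Index, MetaData, String, Table

metadata = MetaData()
file_datastore_records = Table(
    "file_datastore_records",  # hypothetical name; the real one comes from config
    metadata,
    Column("dataset_id", BigInteger, primary_key=True),
    Column("path", String(256), nullable=False),
    Column("formatter", String(128), nullable=False),
    Column("storage_class", String(64), nullable=False),
    # Empty string rather than NULL means "no component", so the column can
    # participate in the composite primary key.
    Column("component", String(32), primary_key=True),
    Column("checksum", String(128), nullable=True),
    Column("file_size", BigInteger, nullable=True),
    Index("ix_file_datastore_records_path", "path"),
)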
243 def __init__(
244 self,
245 config: Union[DatastoreConfig, str],
246 bridgeManager: DatastoreRegistryBridgeManager,
247 butlerRoot: str | None = None,
248 ):
249 super().__init__(config, bridgeManager)
250 if "root" not in self.config: 250 ↛ 251 (condition on line 250 was never true)
251 raise ValueError("No root directory specified in configuration")
253 self._bridgeManager = bridgeManager
255 # Name ourselves either using an explicit name or a name
256 # derived from the (unexpanded) root
257 if "name" in self.config:
258 self.name = self.config["name"]
259 else:
260 # We use the unexpanded root in the name to indicate that this
261 # datastore can be moved without having to update registry.
262 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
264 # Support repository relocation in config
265 # Existence of self.root is checked in subclass
266 self.root = ResourcePath(
267 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
268 )
270 self.locationFactory = LocationFactory(self.root)
271 self.formatterFactory = FormatterFactory()
273 # Now associate formatters with storage classes
274 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
276 # Read the file naming templates
277 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
279 # See if composites should be disassembled
280 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
282 tableName = self.config["records", "table"]
283 try:
284 # Storage of paths and formatters, keyed by dataset_id
285 self._table = bridgeManager.opaque.register(
286 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
287 )
288 # Interface to Registry.
289 self._bridge = bridgeManager.register(self.name)
290 except ReadOnlyDatabaseError:
291 # If the database is read only and we just tried and failed to
292 # create a table, it means someone is trying to create a read-only
293 # butler client for an empty repo. That should be okay, as long
294 # as they do not then try to get any datasets before some other
295 # client creates the table. Chances are they're just validating
296 # configuration.
297 pass
299 # Determine whether checksums should be used - default to False
300 self.useChecksum = self.config.get("checksum", False)
302 # Determine whether we can fall back to configuration if a
303 # requested dataset is not known to registry
304 self.trustGetRequest = self.config.get("trust_get_request", False)
306 # Create a cache manager
307 self.cacheManager: AbstractDatastoreCacheManager
308 if "cached" in self.config: 308 ↛ 311 (condition on line 308 was never false)
309 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
310 else:
311 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
313 # Check existence and create directory structure if necessary
314 if not self.root.exists():
315 if "create" not in self.config or not self.config["create"]: 315 ↛ 316 (condition on line 315 was never true)
316 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
317 try:
318 self.root.mkdir()
319 except Exception as e:
320 raise ValueError(
321 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
322 ) from e
324 def __str__(self) -> str:
325 return str(self.root)
327 @property
328 def bridge(self) -> DatastoreRegistryBridge:
329 return self._bridge
331 def _artifact_exists(self, location: Location) -> bool:
332 """Check that an artifact exists in this datastore at the specified
333 location.
335 Parameters
336 ----------
337 location : `Location`
338 Expected location of the artifact associated with this datastore.
340 Returns
341 -------
342 exists : `bool`
343 `True` if the location can be found, `False` otherwise.
344 """
345 log.debug("Checking if resource exists: %s", location.uri)
346 return location.uri.exists()
348 def _delete_artifact(self, location: Location) -> None:
349 """Delete the artifact from the datastore.
351 Parameters
352 ----------
353 location : `Location`
354 Location of the artifact associated with this datastore.
355 """
356 if location.pathInStore.isabs(): 356 ↛ 357 (condition on line 356 was never true)
357 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
359 try:
360 location.uri.remove()
361 except FileNotFoundError:
362 log.debug("File %s did not exist and so could not be deleted.", location.uri)
363 raise
364 except Exception as e:
365 log.critical("Failed to delete file: %s (%s)", location.uri, e)
366 raise
367 log.debug("Successfully deleted file: %s", location.uri)
369 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
370 # Docstring inherited from GenericBaseDatastore
371 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)]
372 self._table.insert(*records)
374 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
375 # Docstring inherited from GenericBaseDatastore
377 # Look for the dataset_id -- there might be multiple matches
378 # if we have disassembled the dataset.
379 records = self._table.fetch(dataset_id=ref.id)
380 return [StoredFileInfo.from_record(record) for record in records]
382 def _get_stored_records_associated_with_refs(
383 self, refs: Iterable[DatasetIdRef]
384 ) -> Dict[DatasetId, List[StoredFileInfo]]:
385 """Retrieve all records associated with the provided refs.
387 Parameters
388 ----------
389 refs : iterable of `DatasetIdRef`
390 The refs for which records are to be retrieved.
392 Returns
393 -------
394 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
395 The matching records indexed by the ref ID. The number of entries
396 in the dict can be smaller than the number of requested refs.
397 """
398 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
400 # Uniqueness is dataset_id + component so can have multiple records
401 # per ref.
402 records_by_ref = defaultdict(list)
403 for record in records:
404 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
405 return records_by_ref
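# Minimal sketch of the grouping pattern used above: opaque-table rows are
# keyed by (dataset_id, component), so a disassembled composite contributes
# several rows for a single dataset ID (row contents here are made up).
from collections import defaultdict

rows = [
    {"dataset_id": 1, "component": "image", "path": "a_image.fits"},
    {"dataset_id": 1, "component": "mask", "path": "a_mask.fits"},
    {"dataset_id": 2, "component": "", "path": "b.fits"},
]
by_ref: dict[int, list[dict]] = defaultdict(list)
for row in rows:
    by_ref[row["dataset_id"]].append(row)
assert len(by_ref[1]) == 2  # two component records for one dataset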
407 def _refs_associated_with_artifacts(
408 self, paths: List[Union[str, ResourcePath]]
409 ) -> Dict[str, Set[DatasetId]]:
410 """Return paths and associated dataset refs.
412 Parameters
413 ----------
414 paths : `list` of `str` or `lsst.resources.ResourcePath`
415 All the paths to include in search.
417 Returns
418 -------
419 mapping : `dict` of [`str`, `set` [`DatasetId`]]
420 Mapping of each path to a set of associated database IDs.
421 """
422 records = self._table.fetch(path=[str(path) for path in paths])
423 result = defaultdict(set)
424 for row in records:
425 result[row["path"]].add(row["dataset_id"])
426 return result
428 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]:
429 """Return all dataset refs associated with the supplied path.
431 Parameters
432 ----------
433 pathInStore : `lsst.resources.ResourcePath`
434 Path of interest in the data store.
436 Returns
437 -------
438 ids : `set` [`DatasetId`]
439 All `DatasetRef` IDs associated with this path.
440 """
441 records = list(self._table.fetch(path=str(pathInStore)))
442 ids = {r["dataset_id"] for r in records}
443 return ids
445 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
446 # Docstring inherited from GenericBaseDatastore
447 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
449 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
450 r"""Find all the `Location`\ s of the requested dataset in the
451 `Datastore` and the associated stored file information.
453 Parameters
454 ----------
455 ref : `DatasetRef`
456 Reference to the required `Dataset`.
458 Returns
459 -------
460 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
461 Location of the dataset within the datastore and
462 stored information about each file and its formatter.
463 """
464 # Get the file information (this will fail if no file)
465 records = self.getStoredItemsInfo(ref)
467 # Use the path to determine the location -- we need to take
468 # into account absolute URIs in the datastore record
469 return [(r.file_location(self.locationFactory), r) for r in records]
471 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
472 """Check that there is only one dataset associated with the
473 specified artifact.
475 Parameters
476 ----------
477 ref : `DatasetRef` or `FakeDatasetRef`
478 Dataset to be removed.
479 location : `Location`
480 The location of the artifact to be removed.
482 Returns
483 -------
484 can_remove : `bool`
485 `True` if the artifact can be safely removed.
486 """
487 # Can't ever delete absolute URIs.
488 if location.pathInStore.isabs():
489 return False
491 # Get all entries associated with this path
492 allRefs = self._registered_refs_per_artifact(location.pathInStore)
493 if not allRefs:
494 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
496 # Remove these refs from all the refs and if there is nothing left
497 # then we can delete
498 remainingRefs = allRefs - {ref.id}
500 if remainingRefs:
501 return False
502 return True
504 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]:
505 """Predict the location and related file information of the requested
506 dataset in this datastore.
508 Parameters
509 ----------
510 ref : `DatasetRef`
511 Reference to the required `Dataset`.
513 Returns
514 -------
515 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
516 Expected Location of the dataset within the datastore and
517 placeholder information about each file and its formatter.
519 Notes
520 -----
521 Uses the current configuration to determine how we would expect the
522 datastore files to have been written if we couldn't ask registry.
523 This is safe so long as there has been no change to datastore
524 configuration between writing the dataset and wanting to read it.
525 Will not work for files that have been ingested without using the
526 standard file template or default formatter.
527 """
529 # If we have a component ref we always need to ask the questions
530 # of the composite. If the composite is disassembled this routine
531 # should return all components. If the composite was not
532 # disassembled the composite is what is stored regardless of
533 # component request. Note that if the caller has disassembled
534 # a composite there is no way for this guess to know that
535 # without trying both the composite and component ref and seeing
536 # if there is something at the component Location even without
537 # disassembly being enabled.
538 if ref.datasetType.isComponent():
539 ref = ref.makeCompositeRef()
541 # See if the ref is a composite that should be disassembled
542 doDisassembly = self.composites.shouldBeDisassembled(ref)
544 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
546 if doDisassembly:
547 for component, componentStorage in ref.datasetType.storageClass.components.items():
548 compRef = ref.makeComponentRef(component)
549 location, formatter = self._determine_put_formatter_location(compRef)
550 all_info.append((location, formatter, componentStorage, component))
552 else:
553 # Always use the composite ref if no disassembly
554 location, formatter = self._determine_put_formatter_location(ref)
555 all_info.append((location, formatter, ref.datasetType.storageClass, None))
557 # Convert the list of tuples to have StoredFileInfo as second element
558 return [
559 (
560 location,
561 StoredFileInfo(
562 formatter=formatter,
563 path=location.pathInStore.path,
564 storageClass=storageClass,
565 component=component,
566 checksum=None,
567 file_size=-1,
568 dataset_id=ref.getCheckedId(),
569 ),
570 )
571 for location, formatter, storageClass, component in all_info
572 ]
574 def _prepare_for_get(
575 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
576 ) -> List[DatastoreFileGetInformation]:
577 """Check parameters for ``get`` and obtain formatter and
578 location.
580 Parameters
581 ----------
582 ref : `DatasetRef`
583 Reference to the required Dataset.
584 parameters : `dict`
585 `StorageClass`-specific parameters that specify, for example,
586 a slice of the dataset to be loaded.
588 Returns
589 -------
590 getInfo : `list` [`DatastoreFileGetInformation`]
591 Parameters needed to retrieve each file.
592 """
593 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
595 # Get file metadata and internal metadata
596 fileLocations = self._get_dataset_locations_info(ref)
597 if not fileLocations:
598 if not self.trustGetRequest:
599 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
600 # Assume the dataset is where we think it should be
601 fileLocations = self._get_expected_dataset_locations_info(ref)
603 # The storage class we want to use eventually
604 refStorageClass = ref.datasetType.storageClass
606 if len(fileLocations) > 1:
607 disassembled = True
609 # If trust is involved it is possible that there will be
610 # components listed here that do not exist in the datastore.
611 # Explicitly check for file artifact existence and filter out any
612 # that are missing.
613 if self.trustGetRequest:
614 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
616 # For now complain only if we have no components at all. One
617 # component is probably a problem but we can punt that to the
618 # assembler.
619 if not fileLocations: 619 ↛ 620 (condition on line 619 was never true)
620 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
622 else:
623 disassembled = False
625 # Is this a component request?
626 refComponent = ref.datasetType.component()
628 fileGetInfo = []
629 for location, storedFileInfo in fileLocations:
630 # The storage class used to write the file
631 writeStorageClass = storedFileInfo.storageClass
633 # If this has been disassembled we need read to match the write
634 if disassembled:
635 readStorageClass = writeStorageClass
636 else:
637 readStorageClass = refStorageClass
639 formatter = get_instance_of(
640 storedFileInfo.formatter,
641 FileDescriptor(
642 location,
643 readStorageClass=readStorageClass,
644 storageClass=writeStorageClass,
645 parameters=parameters,
646 ),
647 ref.dataId,
648 )
650 formatterParams, notFormatterParams = formatter.segregateParameters()
652 # Of the remaining parameters, extract the ones supported by
653 # this StorageClass (for components not all will be handled)
654 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
656 # The ref itself could be a component if the dataset was
657 # disassembled by butler, or we disassembled in datastore and
658 # components came from the datastore records
659 component = storedFileInfo.component if storedFileInfo.component else refComponent
661 fileGetInfo.append(
662 DatastoreFileGetInformation(
663 location,
664 formatter,
665 storedFileInfo,
666 assemblerParams,
667 formatterParams,
668 component,
669 readStorageClass,
670 )
671 )
673 return fileGetInfo
675 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
676 """Check the arguments for ``put`` and obtain formatter and
677 location.
679 Parameters
680 ----------
681 inMemoryDataset : `object`
682 The dataset to store.
683 ref : `DatasetRef`
684 Reference to the associated Dataset.
686 Returns
687 -------
688 location : `Location`
689 The location to write the dataset.
690 formatter : `Formatter`
691 The `Formatter` to use to write the dataset.
693 Raises
694 ------
695 TypeError
696 Supplied object and storage class are inconsistent.
697 DatasetTypeNotSupportedError
698 The associated `DatasetType` is not handled by this datastore.
699 """
700 self._validate_put_parameters(inMemoryDataset, ref)
701 return self._determine_put_formatter_location(ref)
703 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
704 """Calculate the formatter and output location to use for put.
706 Parameters
707 ----------
708 ref : `DatasetRef`
709 Reference to the associated Dataset.
711 Returns
712 -------
713 location : `Location`
714 The location to write the dataset.
715 formatter : `Formatter`
716 The `Formatter` to use to write the dataset.
717 """
718 # Work out output file name
719 try:
720 template = self.templates.getTemplate(ref)
721 except KeyError as e:
722 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
724 # Validate the template to protect against different dataIds
725 # producing the same filename and causing overwrite confusion.
726 template.validateTemplate(ref)
728 location = self.locationFactory.fromPath(template.format(ref))
730 # Get the formatter based on the storage class
731 storageClass = ref.datasetType.storageClass
732 try:
733 formatter = self.formatterFactory.getFormatter(
734 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
735 )
736 except KeyError as e:
737 raise DatasetTypeNotSupportedError(
738 f"Unable to find formatter for {ref} in datastore {self.name}"
739 ) from e
741 # Now that we know the formatter, update the location
742 location = formatter.makeUpdatedLocation(location)
744 return location, formatter
746 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
747 # Docstring inherited from base class
748 if transfer != "auto":
749 return transfer
751 # See if the paths are within the datastore or not
752 inside = [self._pathInStore(d.path) is not None for d in datasets]
754 if all(inside):
755 transfer = None
756 elif not any(inside): 756 ↛ 765 (condition on line 756 was never false)
757 # Allow ResourcePath to use its own knowledge
758 transfer = "auto"
759 else:
760 # This can happen when importing from a datastore that
761 # has had some datasets ingested using "direct" mode.
762 # Use "split" so that files outside the datastore are
763 # referenced by absolute URI while files already inside
764 # are ingested in place, and warn about it.
765 log.warning(
766 "Some datasets are inside the datastore and some are outside. Using 'split' "
767 "transfer mode. This assumes that the files outside the datastore are "
768 "still accessible to the new butler since they will not be copied into "
769 "the target datastore."
770 )
771 transfer = "split"
773 return transfer
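# Standalone sketch of the decision above, assuming we already know which
# dataset paths resolve to somewhere inside the datastore root:
def choose_transfer_sketch(inside: list[bool]) -> str | None:
    if all(inside):
        return None    # already in place; just register the files
    if not any(inside):
        return "auto"  # let ResourcePath pick an appropriate transfer
    return "split"     # mixed: outside files are referenced by absolute URI

assert choose_transfer_sketch([True, True]) is None
assert choose_transfer_sketch([False, False]) == "auto"
assert choose_transfer_sketch([True, False]) == "split"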
775 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]:
776 """Return the path relative to the datastore root.
778 Parameters
779 ----------
780 path : `lsst.resources.ResourcePathExpression`
781 Path to dataset. Can be an absolute URI. If relative, it is
782 assumed to be relative to the datastore root; a path outside
783 the datastore results in a `None` return value.
785 Returns
786 -------
787 inStore : `str` or `None`
788 Path relative to datastore root. Returns `None` if the file is
789 outside the root.
790 """
791 # Relative path will always be relative to datastore
792 pathUri = ResourcePath(path, forceAbsolute=False)
793 return pathUri.relative_to(self.root)
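# Pure standard-library sketch of the "is this inside the datastore root?"
# test performed above with ``ResourcePath.relative_to`` (which yields the
# relative path, or `None` when the file lives outside the root):
from pathlib import PurePosixPath
from typing import Optional

def path_in_store_sketch(path: str, root: str) -> Optional[str]:
    p, r = PurePosixPath(path), PurePosixPath(root)
    if not p.is_absolute():
        return str(p)  # relative paths are taken to be datastore-relative
    try:
        return str(p.relative_to(r))
    except ValueError:
        return None

assert path_in_store_sketch("/repo/data/a.fits", "/repo/data") == "a.fits"
assert path_in_store_sketch("/elsewhere/a.fits", "/repo/data") is None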
795 def _standardizeIngestPath(
796 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None
797 ) -> Union[str, ResourcePath]:
798 """Standardize the path of a to-be-ingested file.
800 Parameters
801 ----------
802 path : `str` or `lsst.resources.ResourcePath`
803 Path of a file to be ingested. This parameter is not expected
804 to accept all the types that can be used to construct a
805 `~lsst.resources.ResourcePath`.
806 transfer : `str`, optional
807 How (and whether) the dataset should be added to the datastore.
808 See `ingest` for details of transfer modes.
809 This implementation is provided only so
810 `NotImplementedError` can be raised if the mode is not supported;
811 actual transfers are deferred to `_extractIngestInfo`.
813 Returns
814 -------
815 path : `str` or `lsst.resources.ResourcePath`
816 New path in what the datastore considers standard form. If an
817 absolute URI was given that will be returned unchanged.
819 Notes
820 -----
821 Subclasses of `FileDatastore` can implement this method instead
822 of `_prepIngest`. It should not modify the data repository or given
823 file in any way.
825 Raises
826 ------
827 NotImplementedError
828 Raised if the datastore does not support the given transfer mode
829 (including the case where ingest is not supported at all).
830 FileNotFoundError
831 Raised if one of the given files does not exist.
832 """
833 if transfer not in (None, "direct", "split") + self.root.transferModes: 833 ↛ 834 (condition on line 833 was never true)
834 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
836 # A relative URI indicates relative to datastore root
837 srcUri = ResourcePath(path, forceAbsolute=False)
838 if not srcUri.isabs():
839 srcUri = self.root.join(path)
841 if not srcUri.exists():
842 raise FileNotFoundError(
843 f"Resource at {srcUri} does not exist; note that paths to ingest "
844 f"are assumed to be relative to {self.root} unless they are absolute."
845 )
847 if transfer is None:
848 relpath = srcUri.relative_to(self.root)
849 if not relpath:
850 raise RuntimeError(
851 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
852 )
854 # Return the relative path within the datastore for internal
855 # transfer
856 path = relpath
858 return path
860 def _extractIngestInfo(
861 self,
862 path: ResourcePathExpression,
863 ref: DatasetRef,
864 *,
865 formatter: Union[Formatter, Type[Formatter]],
866 transfer: Optional[str] = None,
867 record_validation_info: bool = True,
868 ) -> StoredFileInfo:
869 """Relocate (if necessary) and extract `StoredFileInfo` from a
870 to-be-ingested file.
872 Parameters
873 ----------
874 path : `lsst.resources.ResourcePathExpression`
875 URI or path of a file to be ingested.
876 ref : `DatasetRef`
877 Reference for the dataset being ingested. Guaranteed to have
878 a ``dataset_id`` that is not `None`.
879 formatter : `type` or `Formatter`
880 `Formatter` subclass to use for this dataset or an instance.
881 transfer : `str`, optional
882 How (and whether) the dataset should be added to the datastore.
883 See `ingest` for details of transfer modes.
884 record_validation_info : `bool`, optional
885 If `True`, the default, the datastore can record validation
886 information associated with the file. If `False` the datastore
887 will not attempt to track any information such as checksums
888 or file sizes. This can be useful if such information is tracked
889 in an external system or if the file is to be compressed in place.
890 It is up to the datastore whether this parameter is relevant.
892 Returns
893 -------
894 info : `StoredFileInfo`
895 Internal datastore record for this file. This will be inserted by
896 the caller; the `_extractIngestInfo` is only responsible for
897 creating and populating the struct.
899 Raises
900 ------
901 FileNotFoundError
902 Raised if one of the given files does not exist.
903 FileExistsError
904 Raised if transfer is not `None` but the (internal) location the
905 file would be moved to is already occupied.
906 """
907 if self._transaction is None: 907 ↛ 908 (condition on line 907 was never true)
908 raise RuntimeError("Ingest called without transaction enabled")
910 # Create URI of the source path, do not need to force a relative
911 # path to absolute.
912 srcUri = ResourcePath(path, forceAbsolute=False)
914 # Track whether we have read the size of the source yet
915 have_sized = False
917 tgtLocation: Optional[Location]
918 if transfer is None or transfer == "split":
919 # A relative path is assumed to be relative to the datastore
920 # in this context
921 if not srcUri.isabs():
922 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
923 else:
924 # Work out the path in the datastore from an absolute URI
925 # This is required to be within the datastore.
926 pathInStore = srcUri.relative_to(self.root)
927 if pathInStore is None and transfer is None: 927 ↛ 928 (condition on line 927 was never true)
928 raise RuntimeError(
929 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
930 )
931 if pathInStore: 931 ↛ 933 (condition on line 931 was never false)
932 tgtLocation = self.locationFactory.fromPath(pathInStore)
933 elif transfer == "split":
934 # Outside the datastore but treat that as a direct ingest
935 # instead.
936 tgtLocation = None
937 else:
938 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
939 elif transfer == "direct": 939 ↛ 944 (condition on line 939 was never true)
940 # Want to store the full URI to the resource directly in
941 # datastore. This is useful for referring to permanent archive
942 # storage for raw data.
943 # Trust that people know what they are doing.
944 tgtLocation = None
945 else:
946 # Work out the name we want this ingested file to have
947 # inside the datastore
948 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
949 if not tgtLocation.uri.dirname().exists():
950 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
951 tgtLocation.uri.dirname().mkdir()
953 # if we are transferring from a local file to a remote location
954 # it may be more efficient to get the size and checksum of the
955 # local file rather than the transferred one
956 if record_validation_info and srcUri.isLocal:
957 size = srcUri.size()
958 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
959 have_sized = True
961 # Transfer the resource to the destination.
962 # Allow overwrite of an existing file. This matches the behavior
963 # of datastore.put() in that it trusts that registry would not
964 # be asking to overwrite unless registry thought that the
965 # overwrite was allowed.
966 tgtLocation.uri.transfer_from(
967 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
968 )
970 if tgtLocation is None: 970 ↛ 972 (condition on line 970 was never true)
971 # This means we are using direct mode
972 targetUri = srcUri
973 targetPath = str(srcUri)
974 else:
975 targetUri = tgtLocation.uri
976 targetPath = tgtLocation.pathInStore.path
978 # the file should exist in the datastore now
979 if record_validation_info:
980 if not have_sized:
981 size = targetUri.size()
982 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
983 else:
984 # Not recording any file information.
985 size = -1
986 checksum = None
988 return StoredFileInfo(
989 formatter=formatter,
990 path=targetPath,
991 storageClass=ref.datasetType.storageClass,
992 component=ref.datasetType.component(),
993 file_size=size,
994 checksum=checksum,
995 dataset_id=ref.getCheckedId(),
996 )
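# Hedged sketch of what a file checksum computed with the ``hashlib`` module
# imported at the top of this file might look like; the real
# ``computeChecksum`` may use a different algorithm and block size.
import hashlib

def compute_checksum_sketch(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    hasher = hashlib.new(algorithm)
    with open(filename, "rb") as fd:
        for chunk in iter(lambda: fd.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()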
998 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
999 # Docstring inherited from Datastore._prepIngest.
1000 filtered = []
1001 for dataset in datasets:
1002 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1003 if not acceptable:
1004 continue
1005 else:
1006 dataset.refs = acceptable
1007 if dataset.formatter is None:
1008 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1009 else:
1010 assert isinstance(dataset.formatter, (type, str))
1011 formatter_class = get_class_of(dataset.formatter)
1012 if not issubclass(formatter_class, Formatter): 1012 ↛ 1013 (condition on line 1012 was never true)
1013 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1014 dataset.formatter = formatter_class
1015 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1016 filtered.append(dataset)
1017 return _IngestPrepData(filtered)
1019 @transactional
1020 def _finishIngest(
1021 self,
1022 prepData: Datastore.IngestPrepData,
1023 *,
1024 transfer: Optional[str] = None,
1025 record_validation_info: bool = True,
1026 ) -> None:
1027 # Docstring inherited from Datastore._finishIngest.
1028 refsAndInfos = []
1029 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1030 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1031 # Do ingest as if the first dataset ref is associated with the file
1032 info = self._extractIngestInfo(
1033 dataset.path,
1034 dataset.refs[0],
1035 formatter=dataset.formatter,
1036 transfer=transfer,
1037 record_validation_info=record_validation_info,
1038 )
1039 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1040 self._register_datasets(refsAndInfos)
1042 def _calculate_ingested_datastore_name(
1043 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]]
1044 ) -> Location:
1045 """Given a source URI and a DatasetRef, determine the name the
1046 dataset will have inside datastore.
1048 Parameters
1049 ----------
1050 srcUri : `lsst.resources.ResourcePath`
1051 URI to the source dataset file.
1052 ref : `DatasetRef`
1053 Ref associated with the newly-ingested dataset artifact. This
1054 is used to determine the name within the datastore.
1055 formatter : `Formatter` or `Formatter` class.
1056 Formatter to use for validation. Can be a class or an instance.
1058 Returns
1059 -------
1060 location : `Location`
1061 Target location for the newly-ingested dataset.
1062 """
1063 # Ingesting a file from outside the datastore.
1064 # This involves a new name.
1065 template = self.templates.getTemplate(ref)
1066 location = self.locationFactory.fromPath(template.format(ref))
1068 # Get the extension
1069 ext = srcUri.getExtension()
1071 # Update the destination to include that extension
1072 location.updateExtension(ext)
1074 # Ask the formatter to validate this extension
1075 formatter.validateExtension(location)
1077 return location
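# Standard-library sketch of the extension handling described above: the
# name comes from the file template, but the ingested file keeps its
# original extension (illustrative paths; the real work is done by the
# template, Location and formatter machinery, which also understands
# multi-part extensions such as ".fits.gz").
import os

def ingested_name_sketch(template_name: str, src_path: str) -> str:
    _, ext = os.path.splitext(src_path)
    base, _ = os.path.splitext(template_name)
    return base + ext

assert ingested_name_sketch("raw/exp_00123", "/tmp/incoming/file.fits") == "raw/exp_00123.fits"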
1079 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1080 """Write out in memory dataset to datastore.
1082 Parameters
1083 ----------
1084 inMemoryDataset : `object`
1085 Dataset to write to datastore.
1086 ref : `DatasetRef`
1087 Registry information associated with this dataset.
1089 Returns
1090 -------
1091 info : `StoredFileInfo`
1092 Information describing the artifact written to the datastore.
1093 """
1094 # May need to coerce the in memory dataset to the correct
1095 # python type.
1096 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1098 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1099 uri = location.uri
1101 if not uri.dirname().exists():
1102 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1103 uri.dirname().mkdir()
1105 if self._transaction is None: 1105 ↛ 1106 (condition on line 1105 was never true)
1106 raise RuntimeError("Attempting to write artifact without transaction enabled")
1108 def _removeFileExists(uri: ResourcePath) -> None:
1109 """Remove a file and do not complain if it is not there.
1111 This is important since a formatter might fail before the file
1112 is written and we should not confuse people by writing spurious
1113 error messages to the log.
1114 """
1115 try:
1116 uri.remove()
1117 except FileNotFoundError:
1118 pass
1120 # Register a callback to try to delete the uploaded data if
1121 # something fails below
1122 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1124 data_written = False
1125 if not uri.isLocal:
1126 # This is a remote URI. Some datasets can be serialized directly
1127 # to bytes and sent to the remote datastore without writing a
1128 # file. If the dataset is intended to be saved to the cache
1129 # a file is always written and direct write to the remote
1130 # datastore is bypassed.
1131 if not self.cacheManager.should_be_cached(ref):
1132 try:
1133 serializedDataset = formatter.toBytes(inMemoryDataset)
1134 except NotImplementedError:
1135 # Fallback to the file writing option.
1136 pass
1137 except Exception as e:
1138 raise RuntimeError(
1139 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1140 ) from e
1141 else:
1142 log.debug("Writing bytes directly to %s", uri)
1143 uri.write(serializedDataset, overwrite=True)
1144 log.debug("Successfully wrote bytes directly to %s", uri)
1145 data_written = True
1147 if not data_written:
1148 # Did not write the bytes directly to object store so instead
1149 # write to temporary file. Always write to a temporary even if
1150 # using a local file system -- that gives us atomic writes.
1151 # If a process is killed as the file is being written we do not
1152 # want it to remain in the correct place but in corrupt state.
1153 # For local files write to the output directory not temporary dir.
1154 prefix = uri.dirname() if uri.isLocal else None
1155 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1156 # Need to configure the formatter to write to a different
1157 # location and that needs us to overwrite internals
1158 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1159 with formatter._updateLocation(Location(None, temporary_uri)):
1160 try:
1161 formatter.write(inMemoryDataset)
1162 except Exception as e:
1163 raise RuntimeError(
1164 f"Failed to serialize dataset {ref} of type"
1165 f" {type(inMemoryDataset)} to "
1166 f"temporary location {temporary_uri}"
1167 ) from e
1169 # Use move for a local file since that becomes an efficient
1170 # os.rename. For remote resources we use copy to allow the
1171 # file to be cached afterwards.
1172 transfer = "move" if uri.isLocal else "copy"
1174 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1176 if transfer == "copy":
1177 # Cache if required
1178 self.cacheManager.move_to_cache(temporary_uri, ref)
1180 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1182 # URI is needed to resolve which ingest case we are dealing with
1183 return self._extractIngestInfo(uri, ref, formatter=formatter)
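# Standard-library sketch of the "write to a temporary file, then move it
# into place" pattern used above for local writes; the final rename is
# atomic on POSIX filesystems, so readers never see a partially written
# artifact.
import os
import tempfile

def atomic_write_sketch(data: bytes, destination: str) -> None:
    directory = os.path.dirname(destination) or "."
    fd, tmp_path = tempfile.mkstemp(dir=directory, suffix=".tmp")
    try:
        with os.fdopen(fd, "wb") as tmp_file:
            tmp_file.write(data)
        os.replace(tmp_path, destination)  # atomic within one filesystem
    except BaseException:
        # Mirror the transaction-undo callback above: remove the partial file.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise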
1185 def _read_artifact_into_memory(
1186 self,
1187 getInfo: DatastoreFileGetInformation,
1188 ref: DatasetRef,
1189 isComponent: bool = False,
1190 cache_ref: Optional[DatasetRef] = None,
1191 ) -> Any:
1192 """Read the artifact from datastore into in memory object.
1194 Parameters
1195 ----------
1196 getInfo : `DatastoreFileGetInformation`
1197 Information about the artifact within the datastore.
1198 ref : `DatasetRef`
1199 The registry information associated with this artifact.
1200 isComponent : `bool`
1201 Flag to indicate if a component is being read from this artifact.
1202 cache_ref : `DatasetRef`, optional
1203 The DatasetRef to use when looking up the file in the cache.
1204 This ref must have the same ID as the supplied ref but can
1205 be a parent ref or component ref to indicate to the cache whether
1206 a composite file is being requested from the cache or a component
1207 file. Without this the cache will default to the supplied ref but
1208 it can get confused with read-only derived components for
1209 disassembled composites.
1211 Returns
1212 -------
1213 inMemoryDataset : `object`
1214 The artifact as a python object.
1215 """
1216 location = getInfo.location
1217 uri = location.uri
1218 log.debug("Accessing data from %s", uri)
1220 if cache_ref is None:
1221 cache_ref = ref
1222 if cache_ref.id != ref.id: 1222 ↛ 1223 (condition on line 1222 was never true)
1223 raise ValueError(
1224 "The supplied cache dataset ref refers to a different dataset than expected:"
1225 f" {ref.id} != {cache_ref.id}"
1226 )
1228 # Cannot recalculate checksum but can compare size as a quick check
1229 # Do not do this if the size is negative since that indicates
1230 # we do not know.
1231 recorded_size = getInfo.info.file_size
1232 resource_size = uri.size()
1233 if recorded_size >= 0 and resource_size != recorded_size: 1233 ↛ 1234 (condition on line 1233 was never true)
1234 raise RuntimeError(
1235 "Integrity failure in Datastore. "
1236 f"Size of file {uri} ({resource_size}) "
1237 f"does not match size recorded in registry of {recorded_size}"
1238 )
1240 # For the general case we have choices for how to proceed.
1241 # 1. Always use a local file (downloading the remote resource to a
1242 # temporary file if needed).
1243 # 2. Use a threshold size and read into memory and use bytes.
1244 # Use both for now with an arbitrary hand off size.
1245 # This allows small datasets to be downloaded from remote object
1246 # stores without requiring a temporary file.
1248 formatter = getInfo.formatter
1249 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1250 if resource_size <= nbytes_max and formatter.can_read_bytes():
1251 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1252 if cached_file is not None:
1253 desired_uri = cached_file
1254 msg = f" (cached version of {uri})"
1255 else:
1256 desired_uri = uri
1257 msg = ""
1258 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1259 serializedDataset = desired_uri.read()
1260 log.debug(
1261 "Deserializing %s from %d bytes from location %s with formatter %s",
1262 f"component {getInfo.component}" if isComponent else "",
1263 len(serializedDataset),
1264 uri,
1265 formatter.name(),
1266 )
1267 try:
1268 result = formatter.fromBytes(
1269 serializedDataset, component=getInfo.component if isComponent else None
1270 )
1271 except Exception as e:
1272 raise ValueError(
1273 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1274 f" ({ref.datasetType.name} from {uri}): {e}"
1275 ) from e
1276 else:
1277 # Read from file.
1279 # Have to update the Location associated with the formatter
1280 # because formatter.read does not allow an override.
1281 # This could be improved.
1282 location_updated = False
1283 msg = ""
1285 # First check in cache for local version.
1286 # The cache will only be relevant for remote resources but
1287 # no harm in always asking. Context manager ensures that cache
1288 # file is not deleted during cache expiration.
1289 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1290 if cached_file is not None:
1291 msg = f"(via cache read of remote file {uri})"
1292 uri = cached_file
1293 location_updated = True
1295 with uri.as_local() as local_uri:
1296 can_be_cached = False
1297 if uri != local_uri: 1297 ↛ 1299 (condition on line 1297 was never true)
1298 # URI was remote and file was downloaded
1299 cache_msg = ""
1300 location_updated = True
1302 if self.cacheManager.should_be_cached(cache_ref):
1303 # In this scenario we want to ask if the downloaded
1304 # file should be cached but we should not cache
1305 # it until after we've used it (to ensure it can't
1306 # be expired whilst we are using it).
1307 can_be_cached = True
1309 # Say that it is "likely" to be cached because
1310 # if the formatter read fails we will not be
1311 # caching this file.
1312 cache_msg = " and likely cached"
1314 msg = f"(via download to local file{cache_msg})"
1316 # Calculate the (possibly) new location for the formatter
1317 # to use.
1318 newLocation = Location(*local_uri.split()) if location_updated else None
1320 log.debug(
1321 "Reading%s from location %s %s with formatter %s",
1322 f" component {getInfo.component}" if isComponent else "",
1323 uri,
1324 msg,
1325 formatter.name(),
1326 )
1327 try:
1328 with formatter._updateLocation(newLocation):
1329 with time_this(
1330 log,
1331 msg="Reading%s from location %s %s with formatter %s",
1332 args=(
1333 f" component {getInfo.component}" if isComponent else "",
1334 uri,
1335 msg,
1336 formatter.name(),
1337 ),
1338 ):
1339 result = formatter.read(component=getInfo.component if isComponent else None)
1340 except Exception as e:
1341 raise ValueError(
1342 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1343 f" ({ref.datasetType.name} from {uri}): {e}"
1344 ) from e
1346 # File was read successfully so can move to cache
1347 if can_be_cached: 1347 ↛ 1348 (condition on line 1347 was never true)
1348 self.cacheManager.move_to_cache(local_uri, cache_ref)
1350 return self._post_process_get(
1351 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent
1352 )
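# Simplified sketch of the size-based strategy above: small artifacts whose
# formatter can deserialize bytes are read directly into memory, larger ones
# go through a local (possibly cached) file. The callables stand in for the
# formatter and the ResourcePath download machinery.
from typing import Any, Callable

def read_strategy_sketch(
    size: int,
    can_read_bytes: bool,
    read_bytes: Callable[[], bytes],
    from_bytes: Callable[[bytes], Any],
    download_and_read: Callable[[], Any],
    nbytes_max: int = 10_000_000,
) -> Any:
    if size <= nbytes_max and can_read_bytes:
        return from_bytes(read_bytes())
    return download_and_read()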
1354 def knows(self, ref: DatasetRef) -> bool:
1355 """Check if the dataset is known to the datastore.
1357 Does not check for existence of any artifact.
1359 Parameters
1360 ----------
1361 ref : `DatasetRef`
1362 Reference to the required dataset.
1364 Returns
1365 -------
1366 exists : `bool`
1367 `True` if the dataset is known to the datastore.
1368 """
1369 fileLocations = self._get_dataset_locations_info(ref)
1370 if fileLocations:
1371 return True
1372 return False
1374 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1375 # Docstring inherited from the base class.
1377 # The records themselves. Could be missing some entries.
1378 records = self._get_stored_records_associated_with_refs(refs)
1380 return {ref: ref.id in records for ref in refs}
1382 def _process_mexists_records(
1383 self,
1384 id_to_ref: Dict[DatasetId, DatasetRef],
1385 records: Dict[DatasetId, List[StoredFileInfo]],
1386 all_required: bool,
1387 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
1388 ) -> Dict[DatasetRef, bool]:
1389 """Helper function for mexists that checks the given records.
1391 Parameters
1392 ----------
1393 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1394 Mapping of the dataset ID to the dataset ref itself.
1395 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1396 Records as generally returned by
1397 ``_get_stored_records_associated_with_refs``.
1398 all_required : `bool`
1399 Flag to indicate whether existence requires all artifacts
1400 associated with a dataset ID to exist or not for existence.
1401 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1402 Optional mapping of datastore artifact to existence. Updated by
1403 this method with details of all artifacts tested. Can be `None`
1404 if the caller is not interested.
1406 Returns
1407 -------
1408 existence : `dict` of [`DatasetRef`, `bool`]
1409 Mapping from dataset to boolean indicating existence.
1410 """
1411 # The URIs to be checked and a mapping of those URIs to
1412 # the dataset ID.
1413 uris_to_check: List[ResourcePath] = []
1414 location_map: Dict[ResourcePath, DatasetId] = {}
1416 location_factory = self.locationFactory
1418 uri_existence: Dict[ResourcePath, bool] = {}
1419 for ref_id, infos in records.items():
1420 # Key is the dataset ID, value is a list of StoredFileInfo
1421 uris = [info.file_location(location_factory).uri for info in infos]
1422 location_map.update({uri: ref_id for uri in uris})
1424 # Check the local cache directly for a dataset corresponding
1425 # to the remote URI.
1426 if self.cacheManager.file_count > 0: 1426 ↛ 1427 (condition on line 1426 was never true)
1427 ref = id_to_ref[ref_id]
1428 for uri, storedFileInfo in zip(uris, infos):
1429 check_ref = ref
1430 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1431 check_ref = ref.makeComponentRef(component)
1432 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1433 # Proxy for URI existence.
1434 uri_existence[uri] = True
1435 else:
1436 uris_to_check.append(uri)
1437 else:
1438 # Check all of them.
1439 uris_to_check.extend(uris)
1441 if artifact_existence is not None:
1442 # If a URI has already been checked remove it from the list
1443 # and immediately add the status to the output dict.
1444 filtered_uris_to_check = []
1445 for uri in uris_to_check:
1446 if uri in artifact_existence:
1447 uri_existence[uri] = artifact_existence[uri]
1448 else:
1449 filtered_uris_to_check.append(uri)
1450 uris_to_check = filtered_uris_to_check
1452 # Results.
1453 dataset_existence: Dict[DatasetRef, bool] = {}
1455 uri_existence.update(ResourcePath.mexists(uris_to_check))
1456 for uri, exists in uri_existence.items():
1457 dataset_id = location_map[uri]
1458 ref = id_to_ref[dataset_id]
1460 # Disassembled composite needs to check all locations.
1461 # all_required indicates whether all need to exist or not.
1462 if ref in dataset_existence:
1463 if all_required:
1464 exists = dataset_existence[ref] and exists
1465 else:
1466 exists = dataset_existence[ref] or exists
1467 dataset_existence[ref] = exists
1469 if artifact_existence is not None:
1470 artifact_existence.update(uri_existence)
1472 return dataset_existence
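# Sketch of how per-artifact existence is folded into per-dataset existence
# above: with ``all_required`` a disassembled composite only exists if every
# component artifact exists; otherwise one surviving component is enough.
def fold_existence_sketch(per_artifact: list[bool], all_required: bool) -> bool:
    return all(per_artifact) if all_required else any(per_artifact)

assert fold_existence_sketch([True, False], all_required=True) is False
assert fold_existence_sketch([True, False], all_required=False) is True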
1474 def mexists(
1475 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1476 ) -> Dict[DatasetRef, bool]:
1477 """Check the existence of multiple datasets at once.
1479 Parameters
1480 ----------
1481 refs : iterable of `DatasetRef`
1482 The datasets to be checked.
1483 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1484 Optional mapping of datastore artifact to existence. Updated by
1485 this method with details of all artifacts tested. Can be `None`
1486 if the caller is not interested.
1488 Returns
1489 -------
1490 existence : `dict` of [`DatasetRef`, `bool`]
1491 Mapping from dataset to boolean indicating existence.
1493 Notes
1494 -----
1495 To minimize potentially costly remote existence checks, the local
1496 cache is checked as a proxy for existence. If a file for this
1497 `DatasetRef` does exist no check is done for the actual URI. This
1498 could result in possibly unexpected behavior if the dataset itself
1499 has been removed from the datastore by another process whilst it is
1500 still in the cache.
1501 """
1502 chunk_size = 10_000
1503 dataset_existence: Dict[DatasetRef, bool] = {}
1504 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1505 n_found_total = 0
1506 n_checked = 0
1507 n_chunks = 0
1508 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1509 chunk_result = self._mexists(chunk, artifact_existence)
1510 if log.isEnabledFor(VERBOSE):
1511 n_results = len(chunk_result)
1512 n_checked += n_results
1513 # Can treat the booleans as 0, 1 integers and sum them.
1514 n_found = sum(chunk_result.values())
1515 n_found_total += n_found
1516 log.verbose(
1517 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)",
1518 n_chunks,
1519 n_found,
1520 n_results,
1521 n_found_total,
1522 n_checked,
1523 )
1524 dataset_existence.update(chunk_result)
1525 n_chunks += 1
1527 return dataset_existence
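# Sketch of the chunking used above, written with the standard library
# instead of ``lsst.utils.iteration.chunk_iterable``: existence checks are
# issued in batches so that very large ref lists do not turn into one
# enormous query.
import itertools
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunked_sketch(items: Iterable[T], chunk_size: int = 10_000) -> Iterator[List[T]]:
    iterator = iter(items)
    while chunk := list(itertools.islice(iterator, chunk_size)):
        yield chunk

# e.g. ``for chunk in chunked_sketch(refs): results.update(check(chunk))``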
1529 def _mexists(
1530 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1531 ) -> Dict[DatasetRef, bool]:
1532 """Check the existence of multiple datasets at once.
1534 Parameters
1535 ----------
1536 refs : iterable of `DatasetRef`
1537 The datasets to be checked.
1539 Returns
1540 -------
1541 existence : `dict` of [`DatasetRef`, `bool`]
1542 Mapping from dataset to boolean indicating existence.
1543 """
1544 # Need a mapping of dataset_id to dataset ref since the API
1545 # works with dataset_id
1546 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1548 # Set of all IDs we are checking for.
1549 requested_ids = set(id_to_ref.keys())
1551 # The records themselves. Could be missing some entries.
1552 records = self._get_stored_records_associated_with_refs(refs)
1554 dataset_existence = self._process_mexists_records(
1555 id_to_ref, records, True, artifact_existence=artifact_existence
1556 )
1558 # Set of IDs that have been handled.
1559 handled_ids = {ref.id for ref in dataset_existence.keys()}
1561 missing_ids = requested_ids - handled_ids
1562 if missing_ids:
1563 if not self.trustGetRequest:
1564 # Must assume these do not exist
1565 for missing in missing_ids:
1566 dataset_existence[id_to_ref[missing]] = False
1567 else:
1568 log.debug(
1569 "%d out of %d datasets were not known to datastore during initial existence check.",
1570 len(missing_ids),
1571 len(requested_ids),
1572 )
1574 # Construct data structure identical to that returned
1575 # by _get_stored_records_associated_with_refs() but using
1576 # guessed names.
1577 records = {}
1578 for missing in missing_ids:
1579 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1580 records[missing] = [info for _, info in expected]
1582 dataset_existence.update(
1583 self._process_mexists_records(
1584 id_to_ref, records, False, artifact_existence=artifact_existence
1585 )
1586 )
1588 return dataset_existence
1590 def exists(self, ref: DatasetRef) -> bool:
1591 """Check if the dataset exists in the datastore.
1593 Parameters
1594 ----------
1595 ref : `DatasetRef`
1596 Reference to the required dataset.
1598 Returns
1599 -------
1600 exists : `bool`
1601 `True` if the entity exists in the `Datastore`.
1603 Notes
1604 -----
1605 The local cache is checked as a proxy for existence in the remote
1606 object store. It is possible that another process on a different
1607 compute node could remove the file from the object store even
1608 though it is present in the local cache.
1609 """
1610 fileLocations = self._get_dataset_locations_info(ref)
1612 # if we are being asked to trust that registry might not be correct
1613 # we ask for the expected locations and check them explicitly
1614 if not fileLocations:
1615 if not self.trustGetRequest:
1616 return False
1618 # First check the cache. If it is not found we must check
1619 # the datastore itself. Assume that any component in the cache
1620 # means that the dataset does exist somewhere.
1621 if self.cacheManager.known_to_cache(ref): 1621 ↛ 1622line 1621 didn't jump to line 1622, because the condition on line 1621 was never true
1622 return True
1624 # When we are guessing a dataset location we can not check
1625 # for the existence of every component since we can not
1626 # know if every component was written. Instead we check
1627 # for the existence of any of the expected locations.
1628 for location, _ in self._get_expected_dataset_locations_info(ref):
1629 if self._artifact_exists(location):
1630 return True
1631 return False
1633 # All listed artifacts must exist.
1634 for location, storedFileInfo in fileLocations:
1635 # Checking in cache needs the component ref.
1636 check_ref = ref
1637 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1638 check_ref = ref.makeComponentRef(component)
1639 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1640 continue
1642 if not self._artifact_exists(location):
1643 return False
1645 return True
1647 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1648 """Return URIs associated with dataset.
1650 Parameters
1651 ----------
1652 ref : `DatasetRef`
1653 Reference to the required dataset.
1654 predict : `bool`, optional
1655 If the datastore does not know about the dataset, should it
1656 return a predicted URI or not?
1658 Returns
1659 -------
1660 uris : `DatasetRefURIs`
1661 The URI to the primary artifact associated with this dataset (if
1662 the dataset was disassembled within the datastore this may be
1663 `None`), and the URIs to any components associated with the dataset
1664 artifact (can be empty if there are no components).
1665 """
1666 # if this has never been written then we have to guess
1667 if not self.exists(ref):
1668 if not predict:
1669 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1671 return self._predict_URIs(ref)
1673 # If this is a ref that we have written we can get the path.
1674 # Get file metadata and internal metadata
1675 fileLocations = self._get_dataset_locations_info(ref)
1677 return self._locations_to_URI(ref, fileLocations)
1679 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1680 """URI to the Dataset.
1682 Parameters
1683 ----------
1684 ref : `DatasetRef`
1685 Reference to the required Dataset.
1686 predict : `bool`
1687 If `True`, allow URIs to be returned of datasets that have not
1688 been written.
1690 Returns
1691 -------
1692 uri : `lsst.resources.ResourcePath`
1693 URI pointing to the dataset within the datastore. If the
1694 dataset does not exist in the datastore, and if ``predict`` is
1695 `True`, the URI will be a prediction and will include a URI
1696 fragment "#predicted".
1697 If the datastore does not have entities that relate well
1698 to the concept of a URI the returned URI will be
1699 descriptive. The returned URI is not guaranteed to be obtainable.
1701 Raises
1702 ------
1703 FileNotFoundError
1704 Raised if a URI has been requested for a dataset that does not
1705 exist and guessing is not allowed.
1706 RuntimeError
1707 Raised if a request is made for a single URI but multiple URIs
1708 are associated with this dataset.
1710 Notes
1711 -----
1712 When a predicted URI is requested an attempt will be made to form
1713 a reasonable URI based on file templates and the expected formatter.
1714 """
1715 primary, components = self.getURIs(ref, predict)
1716 if primary is None or components: 1716 ↛ 1717line 1716 didn't jump to line 1717, because the condition on line 1716 was never true
1717 raise RuntimeError(
1718 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1719 )
1720 return primary
1722 def _predict_URIs(
1723 self,
1724 ref: DatasetRef,
1725 ) -> DatasetRefURIs:
1726 """Predict the URIs of a dataset ref.
1728 Parameters
1729 ----------
1730 ref : `DatasetRef`
1731 Reference to the required Dataset.
1733 Returns
1734 -------
1735 uris : `DatasetRefURIs`
1736 Primary and component URIs. URIs will contain a URI fragment
1737 "#predicted".
1738 """
1739 uris = DatasetRefURIs()
1741 if self.composites.shouldBeDisassembled(ref):
1742 for component, _ in ref.datasetType.storageClass.components.items():
1743 comp_ref = ref.makeComponentRef(component)
1744 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1746 # Add the "#predicted" URI fragment to indicate this is a
1747 # guess
1748 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1750 else:
1751 location, _ = self._determine_put_formatter_location(ref)
1753 # Add the "#predicted" URI fragment to indicate this is a guess
1754 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1756 return uris
1758 def getManyURIs(
1759 self,
1760 refs: Iterable[DatasetRef],
1761 predict: bool = False,
1762 allow_missing: bool = False,
1763 ) -> Dict[DatasetRef, DatasetRefURIs]:
1764 # Docstring inherited
1766 uris: Dict[DatasetRef, DatasetRefURIs] = {}
1768 records = self._get_stored_records_associated_with_refs(refs)
1769 records_keys = records.keys()
1771 existing_refs = (ref for ref in refs if ref.id in records_keys)
1772 missing_refs = (ref for ref in refs if ref.id not in records_keys)
1774 for ref in missing_refs:
1775 # if this has never been written then we have to guess
1776 if not predict:
1777 if not allow_missing:
1778 raise FileNotFoundError("Dataset {} not in this datastore.".format(ref))
1779 else:
1780 uris[ref] = self._predict_URIs(ref)
1782 for ref in existing_refs:
1783 file_infos = records[ref.getCheckedId()]
1784 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1785 uris[ref] = self._locations_to_URI(ref, file_locations)
1787 return uris
1789 def _locations_to_URI(
1790 self,
1791 ref: DatasetRef,
1792 file_locations: Sequence[Tuple[Location, StoredFileInfo]],
1793 ) -> DatasetRefURIs:
1794 """Convert one or more file locations associated with a DatasetRef
1795 to a DatasetRefURIs.
1797 Parameters
1798 ----------
1799 ref : `DatasetRef`
1800 Reference to the dataset.
1801 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
1802 Each item in the sequence is the location of the dataset within the
1803 datastore and stored information about the file and its formatter.
1804 If there is only one item in the sequence then it is treated as the
1805 primary URI. If there is more than one item then they are treated
1806 as component URIs. If there are no items then an error is raised
1807 unless ``self.trustGetRequest`` is `True`.
1809 Returns
1810 -------
1811 uris : `DatasetRefURIs`
1812 Represents the primary URI or component URIs described by the
1813 inputs.
1815 Raises
1816 ------
1817 RuntimeError
1818 If no file locations are passed in and ``self.trustGetRequest`` is
1819 `False`.
1820 FileNotFoundError
1821 If a passed-in URI does not exist, and ``self.trustGetRequest``
1822 is `False`.
1823 RuntimeError
1824 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is
1825 unexpected).
1826 """
1828 guessing = False
1829 uris = DatasetRefURIs()
1831 if not file_locations:
1832 if not self.trustGetRequest: 1832 ↛ 1833line 1832 didn't jump to line 1833, because the condition on line 1832 was never true
1833 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1834 file_locations = self._get_expected_dataset_locations_info(ref)
1835 guessing = True
1837 if len(file_locations) == 1:
1838 # No disassembly so this is the primary URI
1839 uris.primaryURI = file_locations[0][0].uri
1840 if guessing and not uris.primaryURI.exists(): 1840 ↛ 1841line 1840 didn't jump to line 1841, because the condition on line 1840 was never true
1841 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1842 else:
1843 for location, file_info in file_locations:
1844 if file_info.component is None: 1844 ↛ 1845line 1844 didn't jump to line 1845, because the condition on line 1844 was never true
1845 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1846 if guessing and not location.uri.exists(): 1846 ↛ 1850line 1846 didn't jump to line 1850, because the condition on line 1846 was never true
1847 # If we are trusting then it is entirely possible for
1848 # some components to be missing. In that case we skip
1849 # to the next component.
1850 if self.trustGetRequest:
1851 continue
1852 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1853 uris.componentURIs[file_info.component] = location.uri
1855 return uris
1857 def retrieveArtifacts(
1858 self,
1859 refs: Iterable[DatasetRef],
1860 destination: ResourcePath,
1861 transfer: str = "auto",
1862 preserve_path: bool = True,
1863 overwrite: bool = False,
1864 ) -> List[ResourcePath]:
1865 """Retrieve the file artifacts associated with the supplied refs.
1867 Parameters
1868 ----------
1869 refs : iterable of `DatasetRef`
1870 The datasets for which file artifacts are to be retrieved.
1871 A single ref can result in multiple files. The refs must
1872 be resolved.
1873 destination : `lsst.resources.ResourcePath`
1874 Location to write the file artifacts.
1875 transfer : `str`, optional
1876 Method to use to transfer the artifacts. Must be one of the options
1877 supported by `lsst.resources.ResourcePath.transfer_from()`.
1878 "move" is not allowed.
1879 preserve_path : `bool`, optional
1880 If `True` the full path of the file artifact within the datastore
1881 is preserved. If `False` the final file component of the path
1882 is used.
1883 overwrite : `bool`, optional
1884 If `True` allow transfers to overwrite existing files at the
1885 destination.
1887 Returns
1888 -------
1889 targets : `list` of `lsst.resources.ResourcePath`
1890 URIs of file artifacts in destination location. Order is not
1891 preserved.
1892 """
1893 if not destination.isdir(): 1893 ↛ 1894line 1893 didn't jump to line 1894, because the condition on line 1893 was never true
1894 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1896 if transfer == "move":
1897 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1899 # Source -> Destination
1900 # This also helps filter out duplicate DatasetRef in the request
1901 # that will map to the same underlying file transfer.
1902 to_transfer: Dict[ResourcePath, ResourcePath] = {}
1904 for ref in refs:
1905 locations = self._get_dataset_locations_info(ref)
1906 for location, _ in locations:
1907 source_uri = location.uri
1908 target_path: ResourcePathExpression
1909 if preserve_path:
1910 target_path = location.pathInStore
1911 if target_path.isabs(): 1911 ↛ 1914line 1911 didn't jump to line 1914, because the condition on line 1911 was never true
1912 # This is an absolute path to an external file.
1913 # Use the full path.
1914 target_path = target_path.relativeToPathRoot
1915 else:
1916 target_path = source_uri.basename()
1917 target_uri = destination.join(target_path)
1918 to_transfer[source_uri] = target_uri
1920 # In theory can now parallelize the transfer
1921 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1922 for source_uri, target_uri in to_transfer.items():
1923 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1925 return list(to_transfer.values())
1927 def get(
1928 self,
1929 ref: DatasetRef,
1930 parameters: Optional[Mapping[str, Any]] = None,
1931 storageClass: Optional[Union[StorageClass, str]] = None,
1932 ) -> Any:
1933 """Load an InMemoryDataset from the store.
1935 Parameters
1936 ----------
1937 ref : `DatasetRef`
1938 Reference to the required Dataset.
1939 parameters : `dict`
1940 `StorageClass`-specific parameters that specify, for example,
1941 a slice of the dataset to be loaded.
1942 storageClass : `StorageClass` or `str`, optional
1943 The storage class to be used to override the Python type
1944 returned by this method. By default the returned type matches
1945 the dataset type definition for this dataset. Specifying a
1946 read `StorageClass` can force a different type to be returned.
1947 This type must be compatible with the original type.
1949 Returns
1950 -------
1951 inMemoryDataset : `object`
1952 Requested dataset or slice thereof as an InMemoryDataset.
1954 Raises
1955 ------
1956 FileNotFoundError
1957 Requested dataset can not be retrieved.
1958 TypeError
1959 Return value from formatter has unexpected type.
1960 ValueError
1961 Formatter failed to process the dataset.
1962 """
1963 # Supplied storage class for the component being read is either
1964 # from the ref itself or from an override if we want to force
1965 # type conversion.
1966 if storageClass is not None:
1967 ref = ref.overrideStorageClass(storageClass)
1968 refStorageClass = ref.datasetType.storageClass
1970 allGetInfo = self._prepare_for_get(ref, parameters)
1971 refComponent = ref.datasetType.component()
1973 # Create mapping from component name to related info
1974 allComponents = {i.component: i for i in allGetInfo}
1976 # By definition the dataset is disassembled if we have more
1977 # than one record for it.
1978 isDisassembled = len(allGetInfo) > 1
1980 # Look for the special case where we are disassembled but the
1981 # component is a derived component that was not written during
1982 # disassembly. For this scenario we need to check that the
1983 # component requested is listed as a derived component for the
1984 # composite storage class
1985 isDisassembledReadOnlyComponent = False
1986 if isDisassembled and refComponent:
1987 # The composite storage class should be accessible through
1988 # the component dataset type
1989 compositeStorageClass = ref.datasetType.parentStorageClass
1991 # In the unlikely scenario where the composite storage
1992 # class is not known, we can only assume that this is a
1993 # normal component. If that assumption is wrong then the
1994 # branch below that reads a persisted component will fail
1995 # so there is no need to complain here.
1996 if compositeStorageClass is not None: 1996 ↛ 1999line 1996 didn't jump to line 1999, because the condition on line 1996 was never false
1997 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1999 if isDisassembled and not refComponent:
2000 # This was a disassembled dataset spread over multiple files
2001 # and we need to put them all back together again.
2002 # Read into memory and then assemble
2004 # Check that the supplied parameters are suitable for the type read
2005 refStorageClass.validateParameters(parameters)
2007 # We want to keep track of all the parameters that were not used
2008 # by formatters. We assume that if any of the component formatters
2009 # use a parameter that we do not need to apply it again in the
2010 # assembler.
2011 usedParams = set()
2013 components: Dict[str, Any] = {}
2014 for getInfo in allGetInfo:
2015 # assemblerParams are parameters not understood by the
2016 # associated formatter.
2017 usedParams.update(set(getInfo.formatterParams))
2019 component = getInfo.component
2021 if component is None: 2021 ↛ 2022line 2021 didn't jump to line 2022, because the condition on line 2021 was never true
2022 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2024 # We do not want the formatter to think it's reading
2025 # a component though because it is really reading a
2026 # standalone dataset -- always tell reader it is not a
2027 # component.
2028 components[component] = self._read_artifact_into_memory(
2029 getInfo, ref.makeComponentRef(component), isComponent=False
2030 )
2032 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2034 # Any unused parameters will have to be passed to the assembler
2035 if parameters:
2036 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2037 else:
2038 unusedParams = {}
2040 # Process parameters
2041 return ref.datasetType.storageClass.delegate().handleParameters(
2042 inMemoryDataset, parameters=unusedParams
2043 )
2045 elif isDisassembledReadOnlyComponent:
2046 compositeStorageClass = ref.datasetType.parentStorageClass
2047 if compositeStorageClass is None: 2047 ↛ 2048line 2047 didn't jump to line 2048, because the condition on line 2047 was never true
2048 raise RuntimeError(
2049 f"Unable to retrieve derived component '{refComponent}' since"
2050 "no composite storage class is available."
2051 )
2053 if refComponent is None: 2053 ↛ 2055line 2053 didn't jump to line 2055, because the condition on line 2053 was never true
2054 # Mainly for mypy
2055 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2057 # Assume that every derived component can be calculated by
2058 # forwarding the request to a single read/write component.
2059 # Rather than guessing which rw component is the right one by
2060 # scanning each for a derived component of the same name,
2061 # we ask the storage class delegate directly which one is best to
2062 # use.
2063 compositeDelegate = compositeStorageClass.delegate()
2064 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2065 refComponent, set(allComponents)
2066 )
2068 # Select the relevant component
2069 rwInfo = allComponents[forwardedComponent]
2071 # For now assume that read parameters are validated against
2072 # the real component and not the requested component
2073 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2074 forwardedStorageClass.validateParameters(parameters)
2076 # The reference to use for the caching must refer to the forwarded
2077 # component and not the derived component.
2078 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2080 # Unfortunately the FileDescriptor inside the formatter will have
2081 # the wrong write storage class so we need to create a new one
2082 # given the immutability constraint.
2083 writeStorageClass = rwInfo.info.storageClass
2085 # We may need to put some thought into parameters for read
2086 # components but for now forward them on as is
2087 readFormatter = type(rwInfo.formatter)(
2088 FileDescriptor(
2089 rwInfo.location,
2090 readStorageClass=refStorageClass,
2091 storageClass=writeStorageClass,
2092 parameters=parameters,
2093 ),
2094 ref.dataId,
2095 )
2097 # The assembler can not receive any parameter requests for a
2098 # derived component at this time since the assembler will
2099 # see the storage class of the derived component and those
2100 # parameters will have to be handled by the formatter on the
2101 # forwarded storage class.
2102 assemblerParams: Dict[str, Any] = {}
2104 # Need to create a new info that specifies the derived
2105 # component and associated storage class
2106 readInfo = DatastoreFileGetInformation(
2107 rwInfo.location,
2108 readFormatter,
2109 rwInfo.info,
2110 assemblerParams,
2111 {},
2112 refComponent,
2113 refStorageClass,
2114 )
2116 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2118 else:
2119 # Single file request or component from that composite file
2120 for lookup in (refComponent, None): 2120 ↛ 2125line 2120 didn't jump to line 2125, because the loop on line 2120 didn't complete
2121 if lookup in allComponents: 2121 ↛ 2120line 2121 didn't jump to line 2120, because the condition on line 2121 was never false
2122 getInfo = allComponents[lookup]
2123 break
2124 else:
2125 raise FileNotFoundError(
2126 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2127 )
2129 # Do not need the component itself if already disassembled
2130 if isDisassembled:
2131 isComponent = False
2132 else:
2133 isComponent = getInfo.component is not None
2135 # For a component read of a composite we want the cache to
2136 # be looking at the composite ref itself.
2137 cache_ref = ref.makeCompositeRef() if isComponent else ref
2139 # For a disassembled component we can validate parameters against
2140 # the component storage class directly
2141 if isDisassembled:
2142 refStorageClass.validateParameters(parameters)
2143 else:
2144 # For an assembled composite this could be a derived
2145 # component derived from a real component. The validity
2146 # of the parameters is not clear. For now validate against
2147 # the composite storage class
2148 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2150 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
2152 @transactional
2153 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2154 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2156 Parameters
2157 ----------
2158 inMemoryDataset : `object`
2159 The dataset to store.
2160 ref : `DatasetRef`
2161 Reference to the associated Dataset.
2163 Raises
2164 ------
2165 TypeError
2166 Supplied object and storage class are inconsistent.
2167 DatasetTypeNotSupportedError
2168 The associated `DatasetType` is not handled by this datastore.
2170 Notes
2171 -----
2172 If the datastore is configured to reject certain dataset types it
2173 is possible that the put will fail and raise a
2174 `DatasetTypeNotSupportedError`. The main use case for this is to
2175 allow `ChainedDatastore` to put to multiple datastores without
2176 requiring that every datastore accepts the dataset.
2177 """
2179 doDisassembly = self.composites.shouldBeDisassembled(ref)
2180 # doDisassembly = True
2182 artifacts = []
2183 if doDisassembly:
2184 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2185 if components is None: 2185 ↛ 2186line 2185 didn't jump to line 2186, because the condition on line 2185 was never true
2186 raise RuntimeError(
2187 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2188 f"with storage class {ref.datasetType.storageClass.name} "
2189 "is configured to be disassembled, but cannot be."
2190 )
2191 for component, componentInfo in components.items():
2192 # Don't recurse because we want to take advantage of
2193 # bulk insert -- need a new DatasetRef that refers to the
2194 # same dataset_id but has the component DatasetType.
2195 # DatasetType does not refer to the types of components,
2196 # so we construct one ourselves.
2197 compRef = ref.makeComponentRef(component)
2198 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2199 artifacts.append((compRef, storedInfo))
2200 else:
2201 # Write the entire thing out
2202 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2203 artifacts.append((ref, storedInfo))
2205 self._register_datasets(artifacts)
2207 @transactional
2208 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
2209 # At this point can safely remove these datasets from the cache
2210 # to avoid confusion later on. If they are not trashed later
2211 # the cache will simply be refilled.
2212 self.cacheManager.remove_from_cache(ref)
2214 # If we are in trust mode there will be nothing to move to
2215 # the trash table and we will have to try to delete the file
2216 # immediately.
2217 if self.trustGetRequest:
2218 # Try to keep the logic below for a single file trash.
2219 if isinstance(ref, DatasetRef):
2220 refs = {ref}
2221 else:
2222 # Will recreate ref at the end of this branch.
2223 refs = set(ref)
2225 # Determine which datasets are known to datastore directly.
2226 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
2227 existing_ids = self._get_stored_records_associated_with_refs(refs)
2228 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2230 missing = refs - existing_refs
2231 if missing:
2232 # Do an explicit existence check on these refs.
2233 # We only care about the artifacts at this point and not
2234 # the dataset existence.
2235 artifact_existence: Dict[ResourcePath, bool] = {}
2236 _ = self.mexists(missing, artifact_existence)
2237 uris = [uri for uri, exists in artifact_existence.items() if exists]
2239 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2240 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2241 for uri in uris:
2242 try:
2243 uri.remove()
2244 except Exception as e:
2245 if ignore_errors:
2246 log.debug("Artifact %s could not be removed: %s", uri, e)
2247 continue
2248 raise
2250 # There is no point asking the code below to remove refs we
2251 # know are missing so update it with the list of existing
2252 # records. Try to retain one vs many logic.
2253 if not existing_refs:
2254 # Nothing more to do since none of the datasets were
2255 # known to the datastore record table.
2256 return
2257 ref = list(existing_refs)
2258 if len(ref) == 1:
2259 ref = ref[0]
2261 # Get file metadata and internal metadata
2262 if not isinstance(ref, DatasetRef):
2263 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2264 # Assumed to be an iterable of refs so bulk mode enabled.
2265 try:
2266 self.bridge.moveToTrash(ref, transaction=self._transaction)
2267 except Exception as e:
2268 if ignore_errors:
2269 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2270 else:
2271 raise
2272 return
2274 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2276 fileLocations = self._get_dataset_locations_info(ref)
2278 if not fileLocations:
2279 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2280 if ignore_errors:
2281 log.warning(err_msg)
2282 return
2283 else:
2284 raise FileNotFoundError(err_msg)
2286 for location, storedFileInfo in fileLocations:
2287 if not self._artifact_exists(location): 2287 ↛ 2288line 2287 didn't jump to line 2288
2288 err_msg = (
2289 f"Dataset is known to datastore {self.name} but "
2290 f"associated artifact ({location.uri}) is missing"
2291 )
2292 if ignore_errors:
2293 log.warning(err_msg)
2294 return
2295 else:
2296 raise FileNotFoundError(err_msg)
2298 # Mark dataset as trashed
2299 try:
2300 self.bridge.moveToTrash([ref], transaction=self._transaction)
2301 except Exception as e:
2302 if ignore_errors:
2303 log.warning(
2304 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2305 "but encountered an error: %s",
2306 ref,
2307 self.name,
2308 e,
2309 )
2310 pass
2311 else:
2312 raise
2314 @transactional
2315 def emptyTrash(self, ignore_errors: bool = True) -> None:
2316 """Remove all datasets from the trash.
2318 Parameters
2319 ----------
2320 ignore_errors : `bool`
2321 If `True` return without error even if something went wrong.
2322 Problems could occur if another process is simultaneously trying
2323 to delete.
2324 """
2325 log.debug("Emptying trash in datastore %s", self.name)
2327 # Context manager will empty trash iff we finish it without raising.
2328 # It will also automatically delete the relevant rows from the
2329 # trash table and the records table.
2330 with self.bridge.emptyTrash(
2331 self._table, record_class=StoredFileInfo, record_column="path"
2332 ) as trash_data:
2333 # Removing the artifacts themselves requires that the files are
2334 # not also associated with refs that are not to be trashed.
2335 # Therefore need to do a query with the file paths themselves
2336 # and return all the refs associated with them. Can only delete
2337 # a file if the refs to be trashed are the only refs associated
2338 # with the file.
2339 # This requires multiple copies of the trashed items
2340 trashed, artifacts_to_keep = trash_data
2342 if artifacts_to_keep is None:
2343 # The bridge is not helping us so have to work it out
2344 # ourselves. This is not going to be as efficient.
2345 trashed = list(trashed)
2347 # The instance check is for mypy since up to this point it
2348 # does not know the type of info.
2349 path_map = self._refs_associated_with_artifacts(
2350 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2351 )
2353 for ref, info in trashed:
2354 # Mypy needs to know this is not the base class
2355 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2357 # Check for mypy
2358 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2360 path_map[info.path].remove(ref.id)
2361 if not path_map[info.path]: 2361 ↛ 2353line 2361 didn't jump to line 2353, because the condition on line 2361 was never false
2362 del path_map[info.path]
2364 artifacts_to_keep = set(path_map)
2366 for ref, info in trashed:
2367 # Should not happen for this implementation but need
2368 # to keep mypy happy.
2369 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2371 # Mypy needs to know this is not the base class
2372 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2374 # Check for mypy
2375 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2377 if info.path in artifacts_to_keep:
2378 # This is a multi-dataset artifact and we are not
2379 # removing all associated refs.
2380 continue
2382 # Only trashed refs still known to datastore will be returned.
2383 location = info.file_location(self.locationFactory)
2385 # Point of no return for this artifact
2386 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2387 try:
2388 self._delete_artifact(location)
2389 except FileNotFoundError:
2390 # If the file itself has been deleted there is nothing
2391 # we can do about it. It is possible that trash has
2392 # been run in parallel in another process or someone
2393 # decided to delete the file. It is unlikely to come
2394 # back and so we should still continue with the removal
2395 # of the entry from the trash table. It is also possible
2396 # we removed it in a previous iteration if it was
2397 # a multi-dataset artifact. The delete artifact method
2398 # will log a debug message in this scenario.
2399 # Distinguishing file missing before trash started and
2400 # file already removed previously as part of this trash
2401 # is not worth the distinction with regards to potential
2402 # memory cost.
2403 pass
2404 except Exception as e:
2405 if ignore_errors:
2406 # Use a debug message here even though it's not
2407 # a good situation. In some cases this can be
2408 # caused by a race between user A and user B
2409 # and neither of them has permissions for the
2410 # other's files. Butler does not know about users
2411 # and trash has no idea what collections these
2412 # files were in (without guessing from a path).
2413 log.debug(
2414 "Encountered error removing artifact %s from datastore %s: %s",
2415 location.uri,
2416 self.name,
2417 e,
2418 )
2419 else:
2420 raise
2422 @transactional
2423 def transfer_from(
2424 self,
2425 source_datastore: Datastore,
2426 refs: Iterable[DatasetRef],
2427 local_refs: Optional[Iterable[DatasetRef]] = None,
2428 transfer: str = "auto",
2429 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
2430 ) -> None:
2431 # Docstring inherited
2432 if type(self) is not type(source_datastore):
2433 raise TypeError(
2434 f"Datastore mismatch between this datastore ({type(self)}) and the "
2435 f"source datastore ({type(source_datastore)})."
2436 )
2438 # Be explicit for mypy
2439 if not isinstance(source_datastore, FileDatastore): 2439 ↛ 2440line 2439 didn't jump to line 2440, because the condition on line 2439 was never true
2440 raise TypeError(
2441 "Can only transfer to a FileDatastore from another FileDatastore, not"
2442 f" {type(source_datastore)}"
2443 )
2445 # Stop early if "direct" transfer mode is requested. That would
2446 # require that the URI inside the source datastore should be stored
2447 # directly in the target datastore, which seems unlikely to be useful
2448 # since at any moment the source datastore could delete the file.
2449 if transfer in ("direct", "split"):
2450 raise ValueError(
2451 f"Can not transfer from a source datastore using {transfer} mode since"
2452 " those files are controlled by the other datastore."
2453 )
2455 # Empty existence lookup if none given.
2456 if artifact_existence is None:
2457 artifact_existence = {}
2459 # We will go through the list multiple times so must convert
2460 # generators to lists.
2461 refs = list(refs)
2463 if local_refs is None:
2464 local_refs = refs
2465 else:
2466 local_refs = list(local_refs)
2468 # In order to handle disassembled composites the code works
2469 # at the records level since it can assume that internal APIs
2470 # can be used.
2471 # - If the record already exists in the destination this is assumed
2472 # to be okay.
2473 # - If there is no record but the source and destination URIs are
2474 # identical no transfer is done but the record is added.
2475 # - If the source record refers to an absolute URI currently assume
2476 # that that URI should remain absolute and will be visible to the
2477 # destination butler. May need to have a flag to indicate whether
2478 # the dataset should be transferred. This will only happen if
2479 # the detached Butler has had a local ingest.
2481 # What we really want is all the records in the source datastore
2482 # associated with these refs. Or derived ones if they don't exist
2483 # in the source.
2484 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2486 # The source dataset_ids are the keys in these records
2487 source_ids = set(source_records)
2488 log.debug("Number of datastore records found in source: %d", len(source_ids))
2490 # The not None check is to appease mypy
2491 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2492 missing_ids = requested_ids - source_ids
2494 # Missing IDs can be okay if that datastore has allowed
2495 # gets based on file existence. Should we transfer what we can
2496 # or complain about it and warn?
2497 if missing_ids and not source_datastore.trustGetRequest: 2497 ↛ 2498line 2497 didn't jump to line 2498, because the condition on line 2497 was never true
2498 raise ValueError(
2499 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2500 )
2502 # Need to map these missing IDs to a DatasetRef so we can guess
2503 # the details.
2504 if missing_ids:
2505 log.info(
2506 "Number of expected datasets missing from source datastore records: %d out of %d",
2507 len(missing_ids),
2508 len(requested_ids),
2509 )
2510 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2512 # This should be chunked in case we end up having to check
2513 # the file store since we need some log output to show
2514 # progress.
2515 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2516 records = {}
2517 for missing in missing_ids_chunk:
2518 # Ask the source datastore where the missing artifacts
2519 # should be. An execution butler might not know about the
2520 # artifacts even if they are there.
2521 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2522 records[missing] = [info for _, info in expected]
2524 # Call the mexist helper method in case we have not already
2525 # checked these artifacts such that artifact_existence is
2526 # empty. This allows us to benefit from parallelism.
2527 # datastore.mexists() itself does not give us access to the
2528 # derived datastore record.
2529 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2530 ref_exists = source_datastore._process_mexists_records(
2531 id_to_ref, records, False, artifact_existence=artifact_existence
2532 )
2534 # Now go through the records and propagate the ones that exist.
2535 location_factory = source_datastore.locationFactory
2536 for missing, record_list in records.items():
2537 # Skip completely if the ref does not exist.
2538 ref = id_to_ref[missing]
2539 if not ref_exists[ref]:
2540 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2541 continue
2542 # Check for file artifact to decide which parts of a
2543 # disassembled composite do exist. If there is only a
2544 # single record we don't even need to look because it can't
2545 # be a composite and must exist.
2546 if len(record_list) == 1:
2547 dataset_records = record_list
2548 else:
2549 dataset_records = [
2550 record
2551 for record in record_list
2552 if artifact_existence[record.file_location(location_factory).uri]
2553 ]
2554 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2556 # Rely on source_records being a defaultdict.
2557 source_records[missing].extend(dataset_records)
2559 # See if we already have these records
2560 target_records = self._get_stored_records_associated_with_refs(local_refs)
2562 # The artifacts to register
2563 artifacts = []
2565 # Refs that already exist
2566 already_present = []
2568 # Now can transfer the artifacts
2569 for source_ref, target_ref in zip(refs, local_refs):
2570 if target_ref.id in target_records:
2571 # Already have an artifact for this.
2572 already_present.append(target_ref)
2573 continue
2575 # mypy needs to know these are always resolved refs
2576 for info in source_records[source_ref.getCheckedId()]:
2577 source_location = info.file_location(source_datastore.locationFactory)
2578 target_location = info.file_location(self.locationFactory)
2579 if source_location == target_location: 2579 ↛ 2583line 2579 didn't jump to line 2583, because the condition on line 2579 was never true
2580 # Either the dataset is already in the target datastore
2581 # (which is how execution butler currently runs) or
2582 # it is an absolute URI.
2583 if source_location.pathInStore.isabs():
2584 # Just because we can see the artifact when running
2585 # the transfer doesn't mean it will be generally
2586 # accessible to a user of this butler. For now warn
2587 # but assume it will be accessible.
2588 log.warning(
2589 "Transfer request for an outside-datastore artifact has been found at %s",
2590 source_location,
2591 )
2592 else:
2593 # Need to transfer it to the new location.
2594 # Assume we should always overwrite. If the artifact
2595 # is there this might indicate that a previous transfer
2596 # was interrupted but was not able to be rolled back
2597 # completely (eg pre-emption) so follow Datastore default
2598 # and overwrite.
2599 target_location.uri.transfer_from(
2600 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2601 )
2603 artifacts.append((target_ref, info))
2605 self._register_datasets(artifacts)
2607 if already_present:
2608 n_skipped = len(already_present)
2609 log.info(
2610 "Skipped transfer of %d dataset%s already present in datastore",
2611 n_skipped,
2612 "" if n_skipped == 1 else "s",
2613 )
2615 @transactional
2616 def forget(self, refs: Iterable[DatasetRef]) -> None:
2617 # Docstring inherited.
2618 refs = list(refs)
2619 self.bridge.forget(refs)
2620 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2622 def validateConfiguration(
2623 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
2624 ) -> None:
2625 """Validate some of the configuration for this datastore.
2627 Parameters
2628 ----------
2629 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2630 Entities to test against this configuration. Can be differing
2631 types.
2632 logFailures : `bool`, optional
2633 If `True`, output a log message for every validation error
2634 detected.
2636 Raises
2637 ------
2638 DatastoreValidationError
2639 Raised if there is a validation problem with a configuration.
2640 All the problems are reported in a single exception.
2642 Notes
2643 -----
2644 This method checks that all the supplied entities have valid file
2645 templates and also have formatters defined.
2646 """
2648 templateFailed = None
2649 try:
2650 self.templates.validateTemplates(entities, logFailures=logFailures)
2651 except FileTemplateValidationError as e:
2652 templateFailed = str(e)
2654 formatterFailed = []
2655 for entity in entities:
2656 try:
2657 self.formatterFactory.getFormatterClass(entity)
2658 except KeyError as e:
2659 formatterFailed.append(str(e))
2660 if logFailures: 2660 ↛ 2655line 2660 didn't jump to line 2655, because the condition on line 2660 was never false
2661 log.critical("Formatter failure: %s", e)
2663 if templateFailed or formatterFailed:
2664 messages = []
2665 if templateFailed: 2665 ↛ 2666line 2665 didn't jump to line 2666, because the condition on line 2665 was never true
2666 messages.append(templateFailed)
2667 if formatterFailed: 2667 ↛ 2669line 2667 didn't jump to line 2669, because the condition on line 2667 was never false
2668 messages.append(",".join(formatterFailed))
2669 msg = ";\n".join(messages)
2670 raise DatastoreValidationError(msg)
2672 def getLookupKeys(self) -> Set[LookupKey]:
2673 # Docstring is inherited from base class
2674 return (
2675 self.templates.getLookupKeys()
2676 | self.formatterFactory.getLookupKeys()
2677 | self.constraints.getLookupKeys()
2678 )
2680 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2681 # Docstring is inherited from base class
2682 # The key can be valid in either formatters or templates so we can
2683 # only check the template if it exists
2684 if lookupKey in self.templates:
2685 try:
2686 self.templates[lookupKey].validateTemplate(entity)
2687 except FileTemplateValidationError as e:
2688 raise DatastoreValidationError(e) from e
2690 def export(
2691 self,
2692 refs: Iterable[DatasetRef],
2693 *,
2694 directory: Optional[ResourcePathExpression] = None,
2695 transfer: Optional[str] = "auto",
2696 ) -> Iterable[FileDataset]:
2697 # Docstring inherited from Datastore.export.
2698 if transfer == "auto" and directory is None:
2699 transfer = None
2701 if transfer is not None and directory is None:
2702 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2704 if transfer == "move":
2705 raise TypeError("Can not export by moving files out of datastore.")
2706 elif transfer == "direct": 2706 ↛ 2710line 2706 didn't jump to line 2710, because the condition on line 2706 was never true
2707 # For an export, treat this as equivalent to None. We do not
2708 # want an import to risk using absolute URIs to datasets owned
2709 # by another datastore.
2710 log.info("Treating 'direct' transfer mode as in-place export.")
2711 transfer = None
2713 # Force the directory to be a URI object
2714 directoryUri: Optional[ResourcePath] = None
2715 if directory is not None:
2716 directoryUri = ResourcePath(directory, forceDirectory=True)
2718 if transfer is not None and directoryUri is not None:
2719 # mypy needs the second test
2720 if not directoryUri.exists(): 2720 ↛ 2721line 2720 didn't jump to line 2721, because the condition on line 2720 was never true
2721 raise FileNotFoundError(f"Export location {directory} does not exist")
2723 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2724 for ref in progress.wrap(refs, "Exporting dataset files"):
2725 fileLocations = self._get_dataset_locations_info(ref)
2726 if not fileLocations:
2727 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2728 # For now we can not export disassembled datasets
2729 if len(fileLocations) > 1:
2730 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2731 location, storedFileInfo = fileLocations[0]
2733 pathInStore = location.pathInStore.path
2734 if transfer is None:
2735 # TODO: do we also need to return the readStorageClass somehow?
2736 # We will use the path in store directly. If this is an
2737 # absolute URI, preserve it.
2738 if location.pathInStore.isabs(): 2738 ↛ 2739line 2738 didn't jump to line 2739, because the condition on line 2738 was never true
2739 pathInStore = str(location.uri)
2740 elif transfer == "direct": 2740 ↛ 2742line 2740 didn't jump to line 2742, because the condition on line 2740 was never true
2741 # Use full URIs to the remote store in the export
2742 pathInStore = str(location.uri)
2743 else:
2744 # mypy needs help
2745 assert directoryUri is not None, "directoryUri must be defined to get here"
2746 storeUri = ResourcePath(location.uri)
2748 # if the datastore has an absolute URI to a resource, we
2749 # have two options:
2750 # 1. Keep the absolute URI in the exported YAML
2751 # 2. Allocate a new name in the local datastore and transfer
2752 # it.
2753 # For now go with option 2
2754 if location.pathInStore.isabs(): 2754 ↛ 2755line 2754 didn't jump to line 2755, because the condition on line 2754 was never true
2755 template = self.templates.getTemplate(ref)
2756 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2757 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2759 exportUri = directoryUri.join(pathInStore)
2760 exportUri.transfer_from(storeUri, transfer=transfer)
2762 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2764 @staticmethod
2765 def computeChecksum(
2766 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192
2767 ) -> Optional[str]:
2768 """Compute the checksum of the supplied file.
2770 Parameters
2771 ----------
2772 uri : `lsst.resources.ResourcePath`
2773 Name of resource to calculate checksum from.
2774 algorithm : `str`, optional
2775 Name of algorithm to use. Must be one of the algorithms supported
2776 by :py:mod:`hashlib`.
2777 block_size : `int`
2778 Number of bytes to read from file at one time.
2780 Returns
2781 -------
2782 hexdigest : `str`
2783 Hex digest of the file.
2785 Notes
2786 -----
2787 Currently returns None if the URI is for a remote resource.
2788 """
2789 if algorithm not in hashlib.algorithms_guaranteed: 2789 ↛ 2790line 2789 didn't jump to line 2790, because the condition on line 2789 was never true
2790 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2792 if not uri.isLocal: 2792 ↛ 2793line 2792 didn't jump to line 2793, because the condition on line 2792 was never true
2793 return None
2795 hasher = hashlib.new(algorithm)
2797 with uri.as_local() as local_uri:
2798 with open(local_uri.ospath, "rb") as f:
2799 for chunk in iter(lambda: f.read(block_size), b""):
2800 hasher.update(chunk)
2802 return hasher.hexdigest()
2804 def needs_expanded_data_ids(
2805 self,
2806 transfer: Optional[str],
2807 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2808 ) -> bool:
2809 # Docstring inherited.
2810 # This _could_ also use entity to inspect whether the filename template
2811 # involves placeholders other than the required dimensions for its
2812 # dataset type, but that's not necessary for correctness; it just
2813 # enables more optimizations (perhaps only in theory).
2814 return transfer not in ("direct", None)
2816 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2817 # Docstring inherited from the base class.
2818 record_data = data.get(self.name)
2819 if not record_data: 2819 ↛ 2820line 2819 didn't jump to line 2820, because the condition on line 2819 was never true
2820 return
2822 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys())
2824 # TODO: Verify that there are no unexpected table names in the dict?
2825 unpacked_records = []
2826 for dataset_data in record_data.records.values():
2827 records = dataset_data.get(self._table.name)
2828 if records: 2828 ↛ 2826line 2828 didn't jump to line 2826, because the condition on line 2828 was never false
2829 for info in records:
2830 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2831 unpacked_records.append(info.to_record())
2832 if unpacked_records:
2833 self._table.insert(*unpacked_records)
2835 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2836 # Docstring inherited from the base class.
2837 exported_refs = list(self._bridge.check(refs))
2838 ids = {ref.getCheckedId() for ref in exported_refs}
2839 records: defaultdict[DatasetId, defaultdict[str, List[StoredDatastoreItemInfo]]] = defaultdict(
2840 lambda: defaultdict(list), {id: defaultdict(list) for id in ids}
2841 )
2842 for row in self._table.fetch(dataset_id=ids):
2843 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2844 records[info.dataset_id][self._table.name].append(info)
2846 record_data = DatastoreRecordData(records=records)
2847 return {self.name: record_data}