Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 86%
969 statements
coverage.py v7.2.5, created at 2023-05-17 02:30 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from collections.abc import Callable
31from dataclasses import dataclass
32from typing import (
33 TYPE_CHECKING,
34 Any,
35 ClassVar,
36 Dict,
37 Iterable,
38 List,
39 Mapping,
40 Optional,
41 Sequence,
42 Set,
43 Tuple,
44 Type,
45 Union,
46)
48from lsst.daf.butler import (
49 CompositesMap,
50 Config,
51 DatasetId,
52 DatasetRef,
53 DatasetRefURIs,
54 DatasetType,
55 DatasetTypeNotSupportedError,
56 Datastore,
57 DatastoreCacheManager,
58 DatastoreConfig,
59 DatastoreDisabledCacheManager,
60 DatastoreRecordData,
61 DatastoreValidationError,
62 FileDataset,
63 FileDescriptor,
64 FileTemplates,
65 FileTemplateValidationError,
66 Formatter,
67 FormatterFactory,
68 Location,
69 LocationFactory,
70 Progress,
71 StorageClass,
72 StoredDatastoreItemInfo,
73 StoredFileInfo,
74 ddl,
75)
76from lsst.daf.butler.core.repoRelocation import replaceRoot
77from lsst.daf.butler.core.utils import transactional
78from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
79from lsst.resources import ResourcePath, ResourcePathExpression
80from lsst.utils.introspection import get_class_of, get_instance_of
81from lsst.utils.iteration import chunk_iterable
83# For VERBOSE logging usage.
84from lsst.utils.logging import VERBOSE, getLogger
85from lsst.utils.timer import time_this
86from sqlalchemy import BigInteger, String
88from ..registry.interfaces import FakeDatasetRef
89from .genericDatastore import GenericBaseDatastore
91if TYPE_CHECKING:
92 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
93 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
95log = getLogger(__name__)
98class _IngestPrepData(Datastore.IngestPrepData):
99 """Helper class for FileDatastore ingest implementation.
101 Parameters
102 ----------
103 datasets : `list` of `FileDataset`
104 Files to be ingested by this datastore.
105 """
107 def __init__(self, datasets: List[FileDataset]):
108 super().__init__(ref for dataset in datasets for ref in dataset.refs)
109 self.datasets = datasets
112@dataclass(frozen=True)
113class DatastoreFileGetInformation:
114 """Collection of useful parameters needed to retrieve a file from
115 a Datastore.
116 """
118 location: Location
119 """The location from which to read the dataset."""
121 formatter: Formatter
122 """The `Formatter` to use to deserialize the dataset."""
124 info: StoredFileInfo
125 """Stored information about this file and its formatter."""
127 assemblerParams: Mapping[str, Any]
128 """Parameters to use for post-processing the retrieved dataset."""
130 formatterParams: Mapping[str, Any]
131 """Parameters that were understood by the associated formatter."""
133 component: Optional[str]
134 """The component to be retrieved (can be `None`)."""
136 readStorageClass: StorageClass
137 """The `StorageClass` of the dataset being read."""
140class FileDatastore(GenericBaseDatastore):
141 """Generic Datastore for file-based implementations.
143 Should always be sub-classed since key abstract methods are missing.
145 Parameters
146 ----------
147 config : `DatastoreConfig` or `str`
148 Configuration as either a `Config` object or URI to file.
149 bridgeManager : `DatastoreRegistryBridgeManager`
150 Object that manages the interface between `Registry` and datastores.
151 butlerRoot : `str`, optional
152 New datastore root to use to override the configuration value.
154 Raises
155 ------
156 ValueError
157 If root location does not exist and ``create`` is `False` in the
158 configuration.
159 """
161 defaultConfigFile: ClassVar[Optional[str]] = None
162 """Path to configuration defaults. Accessed within the ``config`` resource
163 or relative to a search path. Can be None if no defaults specified.
164 """
166 root: ResourcePath
167 """Root directory URI of this `Datastore`."""
169 locationFactory: LocationFactory
170 """Factory for creating locations relative to the datastore root."""
172 formatterFactory: FormatterFactory
173 """Factory for creating instances of formatters."""
175 templates: FileTemplates
176 """File templates that can be used by this `Datastore`."""
178 composites: CompositesMap
179 """Determines whether a dataset should be disassembled on put."""
181 defaultConfigFile = "datastores/fileDatastore.yaml"
182 """Path to configuration defaults. Accessed within the ``config`` resource
183 or relative to a search path. Can be None if no defaults specified.
184 """
186 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None
187 """Callable that is used in trusted mode to retrieve registry definition
188 of a named dataset type.
189 """
191 @classmethod
192 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
193 """Set any filesystem-dependent config options for this Datastore to
194 be appropriate for a new empty repository with the given root.
196 Parameters
197 ----------
198 root : `str`
199 URI to the root of the data repository.
200 config : `Config`
201 A `Config` to update. Only the subset understood by
202 this component will be updated. Will not expand
203 defaults.
204 full : `Config`
205 A complete config with all defaults expanded that can be
206 converted to a `DatastoreConfig`. Read-only and will not be
207 modified by this method.
208 Repository-specific options that should not be obtained
209 from defaults when Butler instances are constructed
210 should be copied from ``full`` to ``config``.
211 overwrite : `bool`, optional
212 If `False`, do not modify a value in ``config`` if the value
213 already exists. Default is always to overwrite with the provided
214 ``root``.
216 Notes
217 -----
218 If a keyword is explicitly defined in the supplied ``config`` it
219 will not be overridden by this method if ``overwrite`` is `False`.
220 This allows explicit values set in external configs to be retained.
221 """
222 Config.updateParameters(
223 DatastoreConfig,
224 config,
225 full,
226 toUpdate={"root": root},
227 toCopy=("cls", ("records", "table")),
228 overwrite=overwrite,
229 )
231 @classmethod
232 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
233 return ddl.TableSpec(
234 fields=[
235 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
236 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
237 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
238 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
239 # Use empty string to indicate no component
240 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
241 # TODO: should checksum be Base64Bytes instead?
242 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
243 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
244 ],
245 unique=frozenset(),
246 indexes=[ddl.IndexSpec("path")],
247 )
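
# Editor's sketch (not part of the original module): a minimal way to build
# and inspect the record table specification defined above. Passing the
# already-imported sqlalchemy ``String`` as the dataset-id column type is an
# assumption made purely for illustration.
def _example_inspect_table_spec() -> ddl.TableSpec:
    # Columns defined above: dataset_id, path, formatter, storage_class,
    # component, checksum, file_size.
    return FileDatastore.makeTableSpec(String)
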
249 def __init__(
250 self,
251 config: Union[DatastoreConfig, str],
252 bridgeManager: DatastoreRegistryBridgeManager,
253 butlerRoot: str | None = None,
254 ):
255 super().__init__(config, bridgeManager)
256 if "root" not in self.config:  256 ↛ 257 (condition never true)
257 raise ValueError("No root directory specified in configuration")
259 self._bridgeManager = bridgeManager
261 # Name ourselves either using an explicit name or a name
262 # derived from the (unexpanded) root
263 if "name" in self.config:
264 self.name = self.config["name"]
265 else:
266 # We use the unexpanded root in the name to indicate that this
267 # datastore can be moved without having to update registry.
268 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
270 # Support repository relocation in config
271 # Existence of self.root is checked in subclass
272 self.root = ResourcePath(
273 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
274 )
276 self.locationFactory = LocationFactory(self.root)
277 self.formatterFactory = FormatterFactory()
279 # Now associate formatters with storage classes
280 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
282 # Read the file naming templates
283 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
285 # See if composites should be disassembled
286 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
288 tableName = self.config["records", "table"]
289 try:
290 # Storage of paths and formatters, keyed by dataset_id
291 self._table = bridgeManager.opaque.register(
292 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
293 )
294 # Interface to Registry.
295 self._bridge = bridgeManager.register(self.name)
296 except ReadOnlyDatabaseError:
297 # If the database is read only and we just tried and failed to
298 # create a table, it means someone is trying to create a read-only
299 # butler client for an empty repo. That should be okay, as long
300 # as they then try to get any datasets before some other client
301 # creates the table. Chances are they're just validating
302 # configuration.
303 pass
305 # Determine whether checksums should be used - default to False
306 self.useChecksum = self.config.get("checksum", False)
308 # Determine whether we can fall back to configuration if a
309 # requested dataset is not known to registry
310 self.trustGetRequest = self.config.get("trust_get_request", False)
312 # Create a cache manager
313 self.cacheManager: AbstractDatastoreCacheManager
314 if "cached" in self.config:  314 ↛ 317 (condition never false)
315 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
316 else:
317 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
319 # Check existence and create directory structure if necessary
320 if not self.root.exists():
321 if "create" not in self.config or not self.config["create"]:  321 ↛ 322 (condition never true)
322 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
323 try:
324 self.root.mkdir()
325 except Exception as e:
326 raise ValueError(
327 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
328 ) from e
330 def __str__(self) -> str:
331 return str(self.root)
333 @property
334 def bridge(self) -> DatastoreRegistryBridge:
335 return self._bridge
337 def _artifact_exists(self, location: Location) -> bool:
338 """Check that an artifact exists in this datastore at the specified
339 location.
341 Parameters
342 ----------
343 location : `Location`
344 Expected location of the artifact associated with this datastore.
346 Returns
347 -------
348 exists : `bool`
349 `True` if the location can be found, `False` otherwise.
350 """
351 log.debug("Checking if resource exists: %s", location.uri)
352 return location.uri.exists()
354 def _delete_artifact(self, location: Location) -> None:
355 """Delete the artifact from the datastore.
357 Parameters
358 ----------
359 location : `Location`
360 Location of the artifact associated with this datastore.
361 """
362 if location.pathInStore.isabs():  362 ↛ 363 (condition never true)
363 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
365 try:
366 location.uri.remove()
367 except FileNotFoundError:  367 ↛ 370 (branch not taken)
368 log.debug("File %s did not exist and so could not be deleted.", location.uri)
369 raise
370 except Exception as e:
371 log.critical("Failed to delete file: %s (%s)", location.uri, e)
372 raise
373 log.debug("Successfully deleted file: %s", location.uri)
375 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
376 # Docstring inherited from GenericBaseDatastore
377 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)]
378 self._table.insert(*records, transaction=self._transaction)
380 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
381 # Docstring inherited from GenericBaseDatastore
383 # Look for the dataset_id -- there might be multiple matches
384 # if we have disassembled the dataset.
385 records = self._table.fetch(dataset_id=ref.id)
386 return [StoredFileInfo.from_record(record) for record in records]
388 def _get_stored_records_associated_with_refs(
389 self, refs: Iterable[DatasetIdRef]
390 ) -> Dict[DatasetId, List[StoredFileInfo]]:
391 """Retrieve all records associated with the provided refs.
393 Parameters
394 ----------
395 refs : iterable of `DatasetIdRef`
396 The refs for which records are to be retrieved.
398 Returns
399 -------
400 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
401 The matching records indexed by the ref ID. The number of entries
402 in the dict can be smaller than the number of requested refs.
403 """
404 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
406 # Uniqueness is dataset_id + component so can have multiple records
407 # per ref.
408 records_by_ref = defaultdict(list)
409 for record in records:
410 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
411 return records_by_ref
413 def _refs_associated_with_artifacts(
414 self, paths: List[Union[str, ResourcePath]]
415 ) -> Dict[str, Set[DatasetId]]:
416 """Return paths and associated dataset refs.
418 Parameters
419 ----------
420 paths : `list` of `str` or `lsst.resources.ResourcePath`
421 All the paths to include in search.
423 Returns
424 -------
425 mapping : `dict` of [`str`, `set` [`DatasetId`]]
426 Mapping of each path to a set of associated database IDs.
427 """
428 records = self._table.fetch(path=[str(path) for path in paths])
429 result = defaultdict(set)
430 for row in records:
431 result[row["path"]].add(row["dataset_id"])
432 return result
434 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]:
435 """Return all dataset refs associated with the supplied path.
437 Parameters
438 ----------
439 pathInStore : `lsst.resources.ResourcePath`
440 Path of interest in the data store.
442 Returns
443 -------
444 ids : `set` of `int`
445 All `DatasetRef` IDs associated with this path.
446 """
447 records = list(self._table.fetch(path=str(pathInStore)))
448 ids = {r["dataset_id"] for r in records}
449 return ids
451 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
452 # Docstring inherited from GenericBaseDatastore
453 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
455 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
456 r"""Find all the `Location`\ s of the requested dataset in the
457 `Datastore` and the associated stored file information.
459 Parameters
460 ----------
461 ref : `DatasetRef`
462 Reference to the required `Dataset`.
464 Returns
465 -------
466 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
467 Location of the dataset within the datastore and
468 stored information about each file and its formatter.
469 """
470 # Get the file information (this will fail if no file)
471 records = self.getStoredItemsInfo(ref)
473 # Use the path to determine the location -- we need to take
474 # into account absolute URIs in the datastore record
475 return [(r.file_location(self.locationFactory), r) for r in records]
477 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
478 """Check that there is only one dataset associated with the
479 specified artifact.
481 Parameters
482 ----------
483 ref : `DatasetRef` or `FakeDatasetRef`
484 Dataset to be removed.
485 location : `Location`
486 The location of the artifact to be removed.
488 Returns
489 -------
490 can_remove : `bool`
491 True if the artifact can be safely removed.
492 """
493 # Can't ever delete absolute URIs.
494 if location.pathInStore.isabs():
495 return False
497 # Get all entries associated with this path
498 allRefs = self._registered_refs_per_artifact(location.pathInStore)
499 if not allRefs:
500 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
502 # Remove these refs from all the refs and if there is nothing left
503 # then we can delete
504 remainingRefs = allRefs - {ref.id}
506 if remainingRefs:
507 return False
508 return True
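
# Editor's sketch (not part of the original module): the removal rule above
# reduces to a set difference -- an artifact may be deleted only when no
# other dataset IDs still reference its path. Names here are hypothetical.
def _example_only_remaining_reference(all_ids: set, dataset_id) -> bool:
    """Return `True` if ``dataset_id`` is the sole reference in ``all_ids``."""
    return not (all_ids - {dataset_id})
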
510 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]:
511 """Predict the location and related file information of the requested
512 dataset in this datastore.
514 Parameters
515 ----------
516 ref : `DatasetRef`
517 Reference to the required `Dataset`.
519 Returns
520 -------
521 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
522 Expected Location of the dataset within the datastore and
523 placeholder information about each file and its formatter.
525 Notes
526 -----
527 Uses the current configuration to determine how we would expect the
528 datastore files to have been written if we couldn't ask registry.
529 This is safe so long as there has been no change to datastore
530 configuration between writing the dataset and wanting to read it.
531 Will not work for files that have been ingested without using the
532 standard file template or default formatter.
533 """
535 # If we have a component ref we always need to ask the questions
536 # of the composite. If the composite is disassembled this routine
537 # should return all components. If the composite was not
538 # disassembled the composite is what is stored regardless of
539 # component request. Note that if the caller has disassembled
540 # a composite there is no way for this guess to know that
541 # without trying both the composite and component ref and seeing
542 # if there is something at the component Location even without
543 # disassembly being enabled.
544 if ref.datasetType.isComponent():
545 ref = ref.makeCompositeRef()
547 # See if the ref is a composite that should be disassembled
548 doDisassembly = self.composites.shouldBeDisassembled(ref)
550 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
552 if doDisassembly:
553 for component, componentStorage in ref.datasetType.storageClass.components.items():
554 compRef = ref.makeComponentRef(component)
555 location, formatter = self._determine_put_formatter_location(compRef)
556 all_info.append((location, formatter, componentStorage, component))
558 else:
559 # Always use the composite ref if no disassembly
560 location, formatter = self._determine_put_formatter_location(ref)
561 all_info.append((location, formatter, ref.datasetType.storageClass, None))
563 # Convert the list of tuples to have StoredFileInfo as second element
564 return [
565 (
566 location,
567 StoredFileInfo(
568 formatter=formatter,
569 path=location.pathInStore.path,
570 storageClass=storageClass,
571 component=component,
572 checksum=None,
573 file_size=-1,
574 dataset_id=ref.getCheckedId(),
575 ),
576 )
577 for location, formatter, storageClass, component in all_info
578 ]
580 def _prepare_for_get(
581 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
582 ) -> List[DatastoreFileGetInformation]:
583 """Check parameters for ``get`` and obtain formatter and
584 location.
586 Parameters
587 ----------
588 ref : `DatasetRef`
589 Reference to the required Dataset.
590 parameters : `dict`
591 `StorageClass`-specific parameters that specify, for example,
592 a slice of the dataset to be loaded.
594 Returns
595 -------
596 getInfo : `list` [`DatastoreFileGetInformation`]
597 Parameters needed to retrieve each file.
598 """
599 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
601 # For trusted mode need to reset storage class.
602 ref = self._cast_storage_class(ref)
604 # Get file metadata and internal metadata
605 fileLocations = self._get_dataset_locations_info(ref)
606 if not fileLocations:
607 if not self.trustGetRequest:
608 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
609 # Assume the dataset is where we think it should be
610 fileLocations = self._get_expected_dataset_locations_info(ref)
612 # The storage class we want to use eventually
613 refStorageClass = ref.datasetType.storageClass
615 if len(fileLocations) > 1:
616 disassembled = True
618 # If trust is involved it is possible that there will be
619 # components listed here that do not exist in the datastore.
620 # Explicitly check for file artifact existence and filter out any
621 # that are missing.
622 if self.trustGetRequest:
623 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
625 # For now complain only if we have no components at all. One
626 # component is probably a problem but we can punt that to the
627 # assembler.
628 if not fileLocations:
629 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
631 else:
632 disassembled = False
634 # Is this a component request?
635 refComponent = ref.datasetType.component()
637 fileGetInfo = []
638 for location, storedFileInfo in fileLocations:
639 # The storage class used to write the file
640 writeStorageClass = storedFileInfo.storageClass
642 # If this has been disassembled we need read to match the write
643 if disassembled:
644 readStorageClass = writeStorageClass
645 else:
646 readStorageClass = refStorageClass
648 formatter = get_instance_of(
649 storedFileInfo.formatter,
650 FileDescriptor(
651 location,
652 readStorageClass=readStorageClass,
653 storageClass=writeStorageClass,
654 parameters=parameters,
655 ),
656 ref.dataId,
657 )
659 formatterParams, notFormatterParams = formatter.segregateParameters()
661 # Of the remaining parameters, extract the ones supported by
662 # this StorageClass (for components not all will be handled)
663 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
665 # The ref itself could be a component if the dataset was
666 # disassembled by butler, or we disassembled in datastore and
667 # components came from the datastore records
668 component = storedFileInfo.component if storedFileInfo.component else refComponent
670 fileGetInfo.append(
671 DatastoreFileGetInformation(
672 location,
673 formatter,
674 storedFileInfo,
675 assemblerParams,
676 formatterParams,
677 component,
678 readStorageClass,
679 )
680 )
682 return fileGetInfo
684 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
685 """Check the arguments for ``put`` and obtain formatter and
686 location.
688 Parameters
689 ----------
690 inMemoryDataset : `object`
691 The dataset to store.
692 ref : `DatasetRef`
693 Reference to the associated Dataset.
695 Returns
696 -------
697 location : `Location`
698 The location to write the dataset.
699 formatter : `Formatter`
700 The `Formatter` to use to write the dataset.
702 Raises
703 ------
704 TypeError
705 Supplied object and storage class are inconsistent.
706 DatasetTypeNotSupportedError
707 The associated `DatasetType` is not handled by this datastore.
708 """
709 self._validate_put_parameters(inMemoryDataset, ref)
710 return self._determine_put_formatter_location(ref)
712 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
713 """Calculate the formatter and output location to use for put.
715 Parameters
716 ----------
717 ref : `DatasetRef`
718 Reference to the associated Dataset.
720 Returns
721 -------
722 location : `Location`
723 The location to write the dataset.
724 formatter : `Formatter`
725 The `Formatter` to use to write the dataset.
726 """
727 # Work out output file name
728 try:
729 template = self.templates.getTemplate(ref)
730 except KeyError as e:
731 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
733 # Validate the template to protect against filenames from different
734 # dataIds returning the same and causing overwrite confusion.
735 template.validateTemplate(ref)
737 location = self.locationFactory.fromPath(template.format(ref))
739 # Get the formatter based on the storage class
740 storageClass = ref.datasetType.storageClass
741 try:
742 formatter = self.formatterFactory.getFormatter(
743 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
744 )
745 except KeyError as e:
746 raise DatasetTypeNotSupportedError(
747 f"Unable to find formatter for {ref} in datastore {self.name}"
748 ) from e
750 # Now that we know the formatter, update the location
751 location = formatter.makeUpdatedLocation(location)
753 return location, formatter
755 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
756 # Docstring inherited from base class
757 if transfer != "auto":
758 return transfer
760 # See if the paths are within the datastore or not
761 inside = [self._pathInStore(d.path) is not None for d in datasets]
763 if all(inside):
764 transfer = None
765 elif not any(inside):  765 ↛ 774 (condition never false)
766 # Allow ResourcePath to use its own knowledge
767 transfer = "auto"
768 else:
769 # This can happen when importing from a datastore that
770 # has had some datasets ingested using "direct" mode.
771 # Also allow ResourcePath to sort it out, but warn about it.
774 log.warning(
775 "Some datasets are inside the datastore and some are outside. Using 'split' "
776 "transfer mode. This assumes that the files outside the datastore are "
777 "still accessible to the new butler since they will not be copied into "
778 "the target datastore."
779 )
780 transfer = "split"
782 return transfer
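
# Editor's sketch (not part of the original module) of the "auto" transfer
# resolution above, reduced to the three cases over plain booleans; the
# helper name is hypothetical.
def _example_resolve_auto_transfer(inside: list[bool]) -> str | None:
    """Mirror the decision made by _overrideTransferMode for transfer="auto"."""
    if all(inside):
        return None  # every file already lives under the datastore root
    if not any(inside):
        return "auto"  # let ResourcePath choose an appropriate transfer
    return "split"  # mixed: files outside the root are ingested by absolute URI
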
784 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]:
785 """Return path relative to datastore root
787 Parameters
788 ----------
789 path : `lsst.resources.ResourcePathExpression`
790 Path to dataset. Can be an absolute URI. If relative, it is
791 assumed to be relative to the datastore root.
794 Returns
795 -------
796 inStore : `str`
797 Path relative to datastore root. Returns `None` if the file is
798 outside the root.
799 """
800 # Relative path will always be relative to datastore
801 pathUri = ResourcePath(path, forceAbsolute=False)
802 return pathUri.relative_to(self.root)
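
# Editor's sketch (not part of the original module): how the relative_to()
# call above behaves for paths inside and outside a hypothetical root URI.
def _example_path_in_store() -> None:
    root = ResourcePath("file:///repo/", forceDirectory=True)
    inside = ResourcePath("file:///repo/raw/file.fits").relative_to(root)  # relative path string
    outside = ResourcePath("file:///elsewhere/file.fits").relative_to(root)  # None
    assert inside is not None and outside is None
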
804 def _standardizeIngestPath(
805 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None
806 ) -> Union[str, ResourcePath]:
807 """Standardize the path of a to-be-ingested file.
809 Parameters
810 ----------
811 path : `str` or `lsst.resources.ResourcePath`
812 Path of a file to be ingested. This parameter is not expected
813 to cover all the types that can be used to construct a
814 `~lsst.resources.ResourcePath`.
815 transfer : `str`, optional
816 How (and whether) the dataset should be added to the datastore.
817 See `ingest` for details of transfer modes.
818 This implementation is provided only so
819 `NotImplementedError` can be raised if the mode is not supported;
820 actual transfers are deferred to `_extractIngestInfo`.
822 Returns
823 -------
824 path : `str` or `lsst.resources.ResourcePath`
825 New path in what the datastore considers standard form. If an
826 absolute URI was given that will be returned unchanged.
828 Notes
829 -----
830 Subclasses of `FileDatastore` can implement this method instead
831 of `_prepIngest`. It should not modify the data repository or given
832 file in any way.
834 Raises
835 ------
836 NotImplementedError
837 Raised if the datastore does not support the given transfer mode
838 (including the case where ingest is not supported at all).
839 FileNotFoundError
840 Raised if one of the given files does not exist.
841 """
842 if transfer not in (None, "direct", "split") + self.root.transferModes:
843 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
845 # A relative URI indicates relative to datastore root
846 srcUri = ResourcePath(path, forceAbsolute=False)
847 if not srcUri.isabs():
848 srcUri = self.root.join(path)
850 if not srcUri.exists():
851 raise FileNotFoundError(
852 f"Resource at {srcUri} does not exist; note that paths to ingest "
853 f"are assumed to be relative to {self.root} unless they are absolute."
854 )
856 if transfer is None:
857 relpath = srcUri.relative_to(self.root)
858 if not relpath:
859 raise RuntimeError(
860 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
861 )
863 # Return the relative path within the datastore for internal
864 # transfer
865 path = relpath
867 return path
869 def _extractIngestInfo(
870 self,
871 path: ResourcePathExpression,
872 ref: DatasetRef,
873 *,
874 formatter: Union[Formatter, Type[Formatter]],
875 transfer: Optional[str] = None,
876 record_validation_info: bool = True,
877 ) -> StoredFileInfo:
878 """Relocate (if necessary) and extract `StoredFileInfo` from a
879 to-be-ingested file.
881 Parameters
882 ----------
883 path : `lsst.resources.ResourcePathExpression`
884 URI or path of a file to be ingested.
885 ref : `DatasetRef`
886 Reference for the dataset being ingested. Guaranteed to have
887 ``dataset_id is not None``.
888 formatter : `type` or `Formatter`
889 `Formatter` subclass to use for this dataset or an instance.
890 transfer : `str`, optional
891 How (and whether) the dataset should be added to the datastore.
892 See `ingest` for details of transfer modes.
893 record_validation_info : `bool`, optional
894 If `True`, the default, the datastore can record validation
895 information associated with the file. If `False` the datastore
896 will not attempt to track any information such as checksums
897 or file sizes. This can be useful if such information is tracked
898 in an external system or if the file is to be compressed in place.
899 It is up to the datastore whether this parameter is relevant.
901 Returns
902 -------
903 info : `StoredFileInfo`
904 Internal datastore record for this file. This will be inserted by
905 the caller; the `_extractIngestInfo` is only responsible for
906 creating and populating the struct.
908 Raises
909 ------
910 FileNotFoundError
911 Raised if one of the given files does not exist.
912 FileExistsError
913 Raised if transfer is not `None` but the (internal) location the
914 file would be moved to is already occupied.
915 """
916 if self._transaction is None:  916 ↛ 917 (condition never true)
917 raise RuntimeError("Ingest called without transaction enabled")
919 # Create URI of the source path, do not need to force a relative
920 # path to absolute.
921 srcUri = ResourcePath(path, forceAbsolute=False)
923 # Track whether we have read the size of the source yet
924 have_sized = False
926 tgtLocation: Optional[Location]
927 if transfer is None or transfer == "split":
928 # A relative path is assumed to be relative to the datastore
929 # in this context
930 if not srcUri.isabs():
931 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
932 else:
933 # Work out the path in the datastore from an absolute URI
934 # This is required to be within the datastore.
935 pathInStore = srcUri.relative_to(self.root)
936 if pathInStore is None and transfer is None:  936 ↛ 937 (condition never true)
937 raise RuntimeError(
938 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
939 )
940 if pathInStore:  940 ↛ 942 (condition never false)
941 tgtLocation = self.locationFactory.fromPath(pathInStore)
942 elif transfer == "split":
943 # Outside the datastore but treat that as a direct ingest
944 # instead.
945 tgtLocation = None
946 else:
947 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
948 elif transfer == "direct":
949 # Want to store the full URI to the resource directly in
950 # datastore. This is useful for referring to permanent archive
951 # storage for raw data.
952 # Trust that people know what they are doing.
953 tgtLocation = None
954 else:
955 # Work out the name we want this ingested file to have
956 # inside the datastore
957 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
958 if not tgtLocation.uri.dirname().exists():
959 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
960 tgtLocation.uri.dirname().mkdir()
962 # if we are transferring from a local file to a remote location
963 # it may be more efficient to get the size and checksum of the
964 # local file rather than the transferred one
965 if record_validation_info and srcUri.isLocal:
966 size = srcUri.size()
967 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
968 have_sized = True
970 # Transfer the resource to the destination.
971 # Allow overwrite of an existing file. This matches the behavior
972 # of datastore.put() in that it trusts that registry would not
973 # be asking to overwrite unless registry thought that the
974 # overwrite was allowed.
975 tgtLocation.uri.transfer_from(
976 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
977 )
979 if tgtLocation is None:
980 # This means we are using direct mode
981 targetUri = srcUri
982 targetPath = str(srcUri)
983 else:
984 targetUri = tgtLocation.uri
985 targetPath = tgtLocation.pathInStore.path
987 # the file should exist in the datastore now
988 if record_validation_info:
989 if not have_sized:
990 size = targetUri.size()
991 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
992 else:
993 # Not recording any file information.
994 size = -1
995 checksum = None
997 return StoredFileInfo(
998 formatter=formatter,
999 path=targetPath,
1000 storageClass=ref.datasetType.storageClass,
1001 component=ref.datasetType.component(),
1002 file_size=size,
1003 checksum=checksum,
1004 dataset_id=ref.getCheckedId(),
1005 )
1007 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
1008 # Docstring inherited from Datastore._prepIngest.
1009 filtered = []
1010 for dataset in datasets:
1011 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1012 if not acceptable:
1013 continue
1014 else:
1015 dataset.refs = acceptable
1016 if dataset.formatter is None:
1017 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1018 else:
1019 assert isinstance(dataset.formatter, (type, str))
1020 formatter_class = get_class_of(dataset.formatter)
1021 if not issubclass(formatter_class, Formatter):  1021 ↛ 1022 (condition never true)
1022 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1023 dataset.formatter = formatter_class
1024 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1025 filtered.append(dataset)
1026 return _IngestPrepData(filtered)
1028 @transactional
1029 def _finishIngest(
1030 self,
1031 prepData: Datastore.IngestPrepData,
1032 *,
1033 transfer: Optional[str] = None,
1034 record_validation_info: bool = True,
1035 ) -> None:
1036 # Docstring inherited from Datastore._finishIngest.
1037 refsAndInfos = []
1038 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1039 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1040 # Do ingest as if the first dataset ref is associated with the file
1041 info = self._extractIngestInfo(
1042 dataset.path,
1043 dataset.refs[0],
1044 formatter=dataset.formatter,
1045 transfer=transfer,
1046 record_validation_info=record_validation_info,
1047 )
1048 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1049 self._register_datasets(refsAndInfos)
1051 def _calculate_ingested_datastore_name(
1052 self,
1053 srcUri: ResourcePath,
1054 ref: DatasetRef,
1055 formatter: Formatter | Type[Formatter] | None = None,
1056 ) -> Location:
1057 """Given a source URI and a DatasetRef, determine the name the
1058 dataset will have inside datastore.
1060 Parameters
1061 ----------
1062 srcUri : `lsst.resources.ResourcePath`
1063 URI to the source dataset file.
1064 ref : `DatasetRef`
1065 Ref associated with the newly-ingested dataset artifact. This
1066 is used to determine the name within the datastore.
1067 formatter : `Formatter` or `Formatter` class, optional
1068 Formatter to use for validation. Can be a class or an instance.
1069 No validation of the file extension is performed if the
1070 ``formatter`` is `None`. This can be used if the caller knows
1071 that the source URI and target URI will use the same formatter.
1073 Returns
1074 -------
1075 location : `Location`
1076 Target location for the newly-ingested dataset.
1077 """
1078 # Ingesting a file from outside the datastore.
1079 # This involves a new name.
1080 template = self.templates.getTemplate(ref)
1081 location = self.locationFactory.fromPath(template.format(ref))
1083 # Get the extension
1084 ext = srcUri.getExtension()
1086 # Update the destination to include that extension
1087 location.updateExtension(ext)
1089 # Ask the formatter to validate this extension
1090 if formatter is not None:
1091 formatter.validateExtension(location)
1093 return location
1095 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1096 """Write out in memory dataset to datastore.
1098 Parameters
1099 ----------
1100 inMemoryDataset : `object`
1101 Dataset to write to datastore.
1102 ref : `DatasetRef`
1103 Registry information associated with this dataset.
1105 Returns
1106 -------
1107 info : `StoredFileInfo`
1108 Information describing the artifact written to the datastore.
1109 """
1110 # May need to coerce the in memory dataset to the correct
1111 # python type, but first we need to make sure the storage class
1112 # reflects the one defined in the data repository.
1113 ref = self._cast_storage_class(ref)
1114 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1116 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1117 uri = location.uri
1119 if not uri.dirname().exists():
1120 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1121 uri.dirname().mkdir()
1123 if self._transaction is None:  1123 ↛ 1124 (condition never true)
1124 raise RuntimeError("Attempting to write artifact without transaction enabled")
1126 def _removeFileExists(uri: ResourcePath) -> None:
1127 """Remove a file and do not complain if it is not there.
1129 This is important since a formatter might fail before the file
1130 is written and we should not confuse people by writing spurious
1131 error messages to the log.
1132 """
1133 try:
1134 uri.remove()
1135 except FileNotFoundError:
1136 pass
1138 # Register a callback to try to delete the uploaded data if
1139 # something fails below
1140 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1142 data_written = False
1143 if not uri.isLocal:
1144 # This is a remote URI. Some datasets can be serialized directly
1145 # to bytes and sent to the remote datastore without writing a
1146 # file. If the dataset is intended to be saved to the cache
1147 # a file is always written and direct write to the remote
1148 # datastore is bypassed.
1149 if not self.cacheManager.should_be_cached(ref):
1150 try:
1151 serializedDataset = formatter.toBytes(inMemoryDataset)
1152 except NotImplementedError:
1153 # Fallback to the file writing option.
1154 pass
1155 except Exception as e:
1156 raise RuntimeError(
1157 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1158 ) from e
1159 else:
1160 log.debug("Writing bytes directly to %s", uri)
1161 uri.write(serializedDataset, overwrite=True)
1162 log.debug("Successfully wrote bytes directly to %s", uri)
1163 data_written = True
1165 if not data_written:
1166 # Did not write the bytes directly to object store so instead
1167 # write to temporary file. Always write to a temporary even if
1168 # using a local file system -- that gives us atomic writes.
1169 # If a process is killed as the file is being written we do not
1170 # want it to remain in the correct place but in a corrupt state.
1171 # For local files write to the output directory not temporary dir.
1172 prefix = uri.dirname() if uri.isLocal else None
1173 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
1174 # Need to configure the formatter to write to a different
1175 # location and that needs us to overwrite internals
1176 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1177 with formatter._updateLocation(Location(None, temporary_uri)):
1178 try:
1179 formatter.write(inMemoryDataset)
1180 except Exception as e:
1181 raise RuntimeError(
1182 f"Failed to serialize dataset {ref} of type"
1183 f" {type(inMemoryDataset)} to "
1184 f"temporary location {temporary_uri}"
1185 ) from e
1187 # Use move for a local file since that becomes an efficient
1188 # os.rename. For remote resources we use copy to allow the
1189 # file to be cached afterwards.
1190 transfer = "move" if uri.isLocal else "copy"
1192 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True)
1194 if transfer == "copy":
1195 # Cache if required
1196 self.cacheManager.move_to_cache(temporary_uri, ref)
1198 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1200 # URI is needed to resolve which ingest case we are dealing with.
1201 return self._extractIngestInfo(uri, ref, formatter=formatter)
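
# Editor's sketch (not part of the original module): the write-path decision
# above, reduced to its three inputs; bytes are sent straight to a remote
# store only when the target is remote, the dataset is not destined for the
# local cache, and the formatter can serialize to bytes. Names hypothetical.
def _example_use_direct_byte_write(is_local: bool, should_cache: bool, can_serialize: bool) -> bool:
    return (not is_local) and (not should_cache) and can_serialize
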
1203 def _read_artifact_into_memory(
1204 self,
1205 getInfo: DatastoreFileGetInformation,
1206 ref: DatasetRef,
1207 isComponent: bool = False,
1208 cache_ref: Optional[DatasetRef] = None,
1209 ) -> Any:
1210 """Read the artifact from datastore into in memory object.
1212 Parameters
1213 ----------
1214 getInfo : `DatastoreFileGetInformation`
1215 Information about the artifact within the datastore.
1216 ref : `DatasetRef`
1217 The registry information associated with this artifact.
1218 isComponent : `bool`
1219 Flag to indicate if a component is being read from this artifact.
1220 cache_ref : `DatasetRef`, optional
1221 The DatasetRef to use when looking up the file in the cache.
1222 This ref must have the same ID as the supplied ref but can
1223 be a parent ref or component ref to indicate to the cache whether
1224 a composite file is being requested from the cache or a component
1225 file. Without this the cache will default to the supplied ref but
1226 it can get confused with read-only derived components for
1227 disassembled composites.
1229 Returns
1230 -------
1231 inMemoryDataset : `object`
1232 The artifact as a python object.
1233 """
1234 location = getInfo.location
1235 uri = location.uri
1236 log.debug("Accessing data from %s", uri)
1238 if cache_ref is None:
1239 cache_ref = ref
1240 if cache_ref.id != ref.id:  1240 ↛ 1241 (condition never true)
1241 raise ValueError(
1242 "The supplied cache dataset ref refers to a different dataset than expected:"
1243 f" {ref.id} != {cache_ref.id}"
1244 )
1246 # Cannot recalculate checksum but can compare size as a quick check
1247 # Do not do this if the size is negative since that indicates
1248 # we do not know.
1249 recorded_size = getInfo.info.file_size
1250 resource_size = uri.size()
1251 if recorded_size >= 0 and resource_size != recorded_size:  1251 ↛ 1252 (condition never true)
1252 raise RuntimeError(
1253 "Integrity failure in Datastore. "
1254 f"Size of file {uri} ({resource_size}) "
1255 f"does not match size recorded in registry of {recorded_size}"
1256 )
1258 # For the general case we have choices for how to proceed.
1259 # 1. Always use a local file (downloading the remote resource to a
1260 # temporary file if needed).
1261 # 2. Use a threshold size and read into memory and use bytes.
1262 # Use both for now with an arbitrary hand off size.
1263 # This allows small datasets to be downloaded from remote object
1264 # stores without requiring a temporary file.
1266 formatter = getInfo.formatter
1267 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1268 if resource_size <= nbytes_max and formatter.can_read_bytes():
1269 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1270 if cached_file is not None:
1271 desired_uri = cached_file
1272 msg = f" (cached version of {uri})"
1273 else:
1274 desired_uri = uri
1275 msg = ""
1276 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1277 serializedDataset = desired_uri.read()
1278 log.debug(
1279 "Deserializing %s from %d bytes from location %s with formatter %s",
1280 f"component {getInfo.component}" if isComponent else "",
1281 len(serializedDataset),
1282 uri,
1283 formatter.name(),
1284 )
1285 try:
1286 result = formatter.fromBytes(
1287 serializedDataset, component=getInfo.component if isComponent else None
1288 )
1289 except Exception as e:
1290 raise ValueError(
1291 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1292 f" ({ref.datasetType.name} from {uri}): {e}"
1293 ) from e
1294 else:
1295 # Read from file.
1297 # Have to update the Location associated with the formatter
1298 # because formatter.read does not allow an override.
1299 # This could be improved.
1300 location_updated = False
1301 msg = ""
1303 # First check in cache for local version.
1304 # The cache will only be relevant for remote resources but
1305 # no harm in always asking. Context manager ensures that cache
1306 # file is not deleted during cache expiration.
1307 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1308 if cached_file is not None:
1309 msg = f"(via cache read of remote file {uri})"
1310 uri = cached_file
1311 location_updated = True
1313 with uri.as_local() as local_uri:
1314 can_be_cached = False
1315 if uri != local_uri:  1315 ↛ 1317 (condition never true)
1316 # URI was remote and file was downloaded
1317 cache_msg = ""
1318 location_updated = True
1320 if self.cacheManager.should_be_cached(cache_ref):
1321 # In this scenario we want to ask if the downloaded
1322 # file should be cached but we should not cache
1323 # it until after we've used it (to ensure it can't
1324 # be expired whilst we are using it).
1325 can_be_cached = True
1327 # Say that it is "likely" to be cached because
1328 # if the formatter read fails we will not be
1329 # caching this file.
1330 cache_msg = " and likely cached"
1332 msg = f"(via download to local file{cache_msg})"
1334 # Calculate the (possibly) new location for the formatter
1335 # to use.
1336 newLocation = Location(*local_uri.split()) if location_updated else None
1338 log.debug(
1339 "Reading%s from location %s %s with formatter %s",
1340 f" component {getInfo.component}" if isComponent else "",
1341 uri,
1342 msg,
1343 formatter.name(),
1344 )
1345 try:
1346 with formatter._updateLocation(newLocation):
1347 with time_this(
1348 log,
1349 msg="Reading%s from location %s %s with formatter %s",
1350 args=(
1351 f" component {getInfo.component}" if isComponent else "",
1352 uri,
1353 msg,
1354 formatter.name(),
1355 ),
1356 ):
1357 result = formatter.read(component=getInfo.component if isComponent else None)
1358 except Exception as e:
1359 raise ValueError(
1360 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1361 f" ({ref.datasetType.name} from {uri}): {e}"
1362 ) from e
1364 # File was read successfully so can move to cache
1365 if can_be_cached:  1365 ↛ 1366 (condition never true)
1366 self.cacheManager.move_to_cache(local_uri, cache_ref)
1368 return self._post_process_get(
1369 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
1370 )
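
# Editor's sketch (not part of the original module): the read-path choice
# above. Small artifacts whose formatter supports byte reads are pulled
# directly into memory; everything else goes through a (possibly cached)
# local file. The helper name and signature are hypothetical.
def _example_read_as_bytes(resource_size: int, formatter_can_read_bytes: bool) -> bool:
    nbytes_max = 10_000_000  # same hand-off threshold used above
    return resource_size <= nbytes_max and formatter_can_read_bytes
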
1372 def knows(self, ref: DatasetRef) -> bool:
1373 """Check if the dataset is known to the datastore.
1375 Does not check for existence of any artifact.
1377 Parameters
1378 ----------
1379 ref : `DatasetRef`
1380 Reference to the required dataset.
1382 Returns
1383 -------
1384 exists : `bool`
1385 `True` if the dataset is known to the datastore.
1386 """
1387 fileLocations = self._get_dataset_locations_info(ref)
1388 if fileLocations:
1389 return True
1390 return False
1392 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
1393 # Docstring inherited from the base class.
1395 # The records themselves. Could be missing some entries.
1396 records = self._get_stored_records_associated_with_refs(refs)
1398 return {ref: ref.id in records for ref in refs}
1400 def _process_mexists_records(
1401 self,
1402 id_to_ref: Dict[DatasetId, DatasetRef],
1403 records: Dict[DatasetId, List[StoredFileInfo]],
1404 all_required: bool,
1405 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
1406 ) -> Dict[DatasetRef, bool]:
1407 """Helper function for mexists that checks the given records.
1409 Parameters
1410 ----------
1411 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1412 Mapping of the dataset ID to the dataset ref itself.
1413 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1414 Records as generally returned by
1415 ``_get_stored_records_associated_with_refs``.
1416 all_required : `bool`
1417 Flag to indicate whether existence requires all artifacts
1418 associated with a dataset ID to exist or not for existence.
1419 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1420 Optional mapping of datastore artifact to existence. Updated by
1421 this method with details of all artifacts tested. Can be `None`
1422 if the caller is not interested.
1424 Returns
1425 -------
1426 existence : `dict` of [`DatasetRef`, `bool`]
1427 Mapping from dataset to boolean indicating existence.
1428 """
1429 # The URIs to be checked and a mapping of those URIs to
1430 # the dataset ID.
1431 uris_to_check: List[ResourcePath] = []
1432 location_map: Dict[ResourcePath, DatasetId] = {}
1434 location_factory = self.locationFactory
1436 uri_existence: Dict[ResourcePath, bool] = {}
1437 for ref_id, infos in records.items():
1438 # Key is the dataset Id, value is list of StoredItemInfo
1439 uris = [info.file_location(location_factory).uri for info in infos]
1440 location_map.update({uri: ref_id for uri in uris})
1442 # Check the local cache directly for a dataset corresponding
1443 # to the remote URI.
1444 if self.cacheManager.file_count > 0:  1444 ↛ 1445 (condition never true)
1445 ref = id_to_ref[ref_id]
1446 for uri, storedFileInfo in zip(uris, infos):
1447 check_ref = ref
1448 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1449 check_ref = ref.makeComponentRef(component)
1450 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1451 # Proxy for URI existence.
1452 uri_existence[uri] = True
1453 else:
1454 uris_to_check.append(uri)
1455 else:
1456 # Check all of them.
1457 uris_to_check.extend(uris)
1459 if artifact_existence is not None:
1460 # If a URI has already been checked remove it from the list
1461 # and immediately add the status to the output dict.
1462 filtered_uris_to_check = []
1463 for uri in uris_to_check:
1464 if uri in artifact_existence:
1465 uri_existence[uri] = artifact_existence[uri]
1466 else:
1467 filtered_uris_to_check.append(uri)
1468 uris_to_check = filtered_uris_to_check
1470 # Results.
1471 dataset_existence: Dict[DatasetRef, bool] = {}
1473 uri_existence.update(ResourcePath.mexists(uris_to_check))
1474 for uri, exists in uri_existence.items():
1475 dataset_id = location_map[uri]
1476 ref = id_to_ref[dataset_id]
1478 # Disassembled composite needs to check all locations.
1479 # all_required indicates whether all need to exist or not.
1480 if ref in dataset_existence:
1481 if all_required:
1482 exists = dataset_existence[ref] and exists
1483 else:
1484 exists = dataset_existence[ref] or exists
1485 dataset_existence[ref] = exists
1487 if artifact_existence is not None:
1488 artifact_existence.update(uri_existence)
1490 return dataset_existence
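
# Editor's sketch (not part of the original module): the per-ref aggregation
# used above. For a disassembled composite, all_required=True means every
# component artifact must exist; otherwise any one is enough. Name is
# hypothetical.
def _example_combine_existence(previous: bool, current: bool, all_required: bool) -> bool:
    return (previous and current) if all_required else (previous or current)
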
1492 def mexists(
1493 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1494 ) -> Dict[DatasetRef, bool]:
1495 """Check the existence of multiple datasets at once.
1497 Parameters
1498 ----------
1499 refs : iterable of `DatasetRef`
1500 The datasets to be checked.
1501 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1502 Optional mapping of datastore artifact to existence. Updated by
1503 this method with details of all artifacts tested. Can be `None`
1504 if the caller is not interested.
1506 Returns
1507 -------
1508 existence : `dict` of [`DatasetRef`, `bool`]
1509 Mapping from dataset to boolean indicating existence.
1511 Notes
1512 -----
1513 To minimize potentially costly remote existence checks, the local
1514 cache is checked as a proxy for existence. If a file for this
1515 `DatasetRef` does exist no check is done for the actual URI. This
1516 could result in possibly unexpected behavior if the dataset itself
1517 has been removed from the datastore by another process whilst it is
1518 still in the cache.
1519 """
1520 chunk_size = 10_000
1521 dataset_existence: Dict[DatasetRef, bool] = {}
1522 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1523 n_found_total = 0
1524 n_checked = 0
1525 n_chunks = 0
1526 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1527 chunk_result = self._mexists(chunk, artifact_existence)
1528 if log.isEnabledFor(VERBOSE):
1529 n_results = len(chunk_result)
1530 n_checked += n_results
1531 # Can treat the booleans as 0, 1 integers and sum them.
1532 n_found = sum(chunk_result.values())
1533 n_found_total += n_found
1534 log.verbose(
1535 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)",
1536 n_chunks,
1537 n_found,
1538 n_results,
1539 n_found_total,
1540 n_checked,
1541 )
1542 dataset_existence.update(chunk_result)
1543 n_chunks += 1
1545 return dataset_existence
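
# Editor's sketch (not part of the original module): a minimal way a caller
# might use mexists() to find missing artifacts, assuming an existing
# FileDatastore instance and an iterable of resolved refs (both hypothetical).
def _example_find_missing(datastore: FileDatastore, refs: list[DatasetRef]) -> list[DatasetRef]:
    existence = datastore.mexists(refs)
    return [ref for ref, exists in existence.items() if not exists]
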
1547 def _mexists(
1548 self, refs: Sequence[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1549 ) -> Dict[DatasetRef, bool]:
1550 """Check the existence of multiple datasets at once.
1552 Parameters
1553 ----------
1554 refs : iterable of `DatasetRef`
1555 The datasets to be checked.
1556 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1557 Optional mapping of datastore artifact to existence. Updated by
1558 this method with details of all artifacts tested. Can be `None`
1559 if the caller is not interested.
1561 Returns
1562 -------
1563 existence : `dict` of [`DatasetRef`, `bool`]
1564 Mapping from dataset to boolean indicating existence.
1565 """
1566 # Make a mapping from refs with the internal storage class to the given
1567 # refs that may have a different one. We'll use the internal refs
1568 # throughout this method and convert back at the very end.
1569 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs}
1571 # Need a mapping of dataset_id to (internal) dataset ref since some
1572 # internal APIs work with dataset_id.
1573 id_to_ref = {ref.getCheckedId(): ref for ref in internal_ref_to_input_ref}
1575 # Set of all IDs we are checking for.
1576 requested_ids = set(id_to_ref.keys())
1578 # The records themselves. Could be missing some entries.
1579 records = self._get_stored_records_associated_with_refs(id_to_ref.values())
1581 dataset_existence = self._process_mexists_records(
1582 id_to_ref, records, True, artifact_existence=artifact_existence
1583 )
1585 # Set of IDs that have been handled.
1586 handled_ids = {ref.id for ref in dataset_existence.keys()}
1588 missing_ids = requested_ids - handled_ids
1589 if missing_ids:
1590 dataset_existence.update(
1591 self._mexists_check_expected(
1592 [id_to_ref[missing] for missing in missing_ids], artifact_existence
1593 )
1594 )
1596 return {
1597 internal_ref_to_input_ref[internal_ref]: existence
1598 for internal_ref, existence in dataset_existence.items()
1599 }
1601 def _mexists_check_expected(
1602 self, refs: Sequence[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1603 ) -> Dict[DatasetRef, bool]:
1604 """Check existence of refs that are not known to datastore.
1606 Parameters
1607 ----------
1608 refs : iterable of `DatasetRef`
1609 The datasets to be checked. These are assumed not to be known
1610 to datastore.
1611 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1612 Optional mapping of datastore artifact to existence. Updated by
1613 this method with details of all artifacts tested. Can be `None`
1614 if the caller is not interested.
1616 Returns
1617 -------
1618 existence : `dict` of [`DatasetRef`, `bool`]
1619 Mapping from dataset to boolean indicating existence.
1620 """
1621 dataset_existence: Dict[DatasetRef, bool] = {}
1622 if not self.trustGetRequest:
1623 # Must assume these do not exist
1624 for ref in refs:
1625 dataset_existence[ref] = False
1626 else:
1627 log.debug(
1628 "%d datasets were not known to datastore during initial existence check.",
1629 len(refs),
1630 )
1632 # Construct data structure identical to that returned
1633 # by _get_stored_records_associated_with_refs() but using
1634 # guessed names.
1635 records = {}
1636 id_to_ref = {}
1637 for missing_ref in refs:
1638 expected = self._get_expected_dataset_locations_info(missing_ref)
1639 dataset_id = missing_ref.getCheckedId()
1640 records[dataset_id] = [info for _, info in expected]
1641 id_to_ref[dataset_id] = missing_ref
1643 dataset_existence.update(
1644 self._process_mexists_records(
1645 id_to_ref,
1646 records,
1647 False,
1648 artifact_existence=artifact_existence,
1649 )
1650 )
1652 return dataset_existence
1654 def exists(self, ref: DatasetRef) -> bool:
1655 """Check if the dataset exists in the datastore.
1657 Parameters
1658 ----------
1659 ref : `DatasetRef`
1660 Reference to the required dataset.
1662 Returns
1663 -------
1664 exists : `bool`
1665 `True` if the entity exists in the `Datastore`.
1667 Notes
1668 -----
1669 The local cache is checked as a proxy for existence in the remote
1670 object store. It is possible that another process on a different
1671 compute node could remove the file from the object store even
1672 though it is present in the local cache.
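Examples
--------
Illustrative sketch only (``datastore`` and ``ref`` are assumed names for a
configured `FileDatastore` and a resolved `DatasetRef`):

>>> if not datastore.exists(ref):
...     print(f"No artifact stored for {ref}")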
1673 """
1674 ref = self._cast_storage_class(ref)
1675 fileLocations = self._get_dataset_locations_info(ref)
1677 # If we are being asked to trust that the registry might not be correct,
1678 # we ask for the expected locations and check them explicitly.
1679 if not fileLocations:
1680 if not self.trustGetRequest:
1681 return False
1683 # First check the cache. If it is not found we must check
1684 # the datastore itself. Assume that any component in the cache
1685 # means that the dataset does exist somewhere.
1686 if self.cacheManager.known_to_cache(ref): 1686 ↛ 1687 (condition was never true)
1687 return True
1689 # When we are guessing a dataset location we can not check
1690 # for the existence of every component since we can not
1691 # know if every component was written. Instead we check
1692 # for the existence of any of the expected locations.
1693 for location, _ in self._get_expected_dataset_locations_info(ref):
1694 if self._artifact_exists(location):
1695 return True
1696 return False
1698 # All listed artifacts must exist.
1699 for location, storedFileInfo in fileLocations:
1700 # Checking in cache needs the component ref.
1701 check_ref = ref
1702 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1703 check_ref = ref.makeComponentRef(component)
1704 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1705 continue
1707 if not self._artifact_exists(location): 1707 ↛ 1708 (condition was never true)
1708 return False
1710 return True
1712 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
1713 """Return URIs associated with dataset.
1715 Parameters
1716 ----------
1717 ref : `DatasetRef`
1718 Reference to the required dataset.
1719 predict : `bool`, optional
1720 If the datastore does not know about the dataset, should it
1721 return a predicted URI or not?
1723 Returns
1724 -------
1725 uris : `DatasetRefURIs`
1726 The URI to the primary artifact associated with this dataset (if
1727 the dataset was disassembled within the datastore this may be
1728 `None`), and the URIs to any components associated with the dataset
1729 artifact (which can be empty if there are no components).
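Examples
--------
A hedged sketch (assumed names ``datastore`` and ``ref``); a disassembled
dataset has no primary URI, only component URIs:

>>> uris = datastore.getURIs(ref)
>>> if uris.primaryURI is None:
...     print("components:", sorted(uris.componentURIs))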
1730 """
1731 many = self.getManyURIs([ref], predict=predict, allow_missing=False)
1732 return many[ref]
1734 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1735 """URI to the Dataset.
1737 Parameters
1738 ----------
1739 ref : `DatasetRef`
1740 Reference to the required Dataset.
1741 predict : `bool`
1742 If `True`, allow URIs to be returned of datasets that have not
1743 been written.
1745 Returns
1746 -------
1747 uri : `lsst.resources.ResourcePath`
1748 URI pointing to the dataset within the datastore. If the
1749 dataset does not exist in the datastore, and if ``predict`` is
1750 `True`, the URI will be a prediction and will include a URI
1751 fragment "#predicted".
1752 If the datastore does not have entities that relate well
1753 to the concept of a URI the returned URI will be
1754 descriptive. The returned URI is not guaranteed to be obtainable.
1756 Raises
1757 ------
1758 FileNotFoundError
1759 Raised if a URI has been requested for a dataset that does not
1760 exist and guessing is not allowed.
1761 RuntimeError
1762 Raised if a request is made for a single URI but multiple URIs
1763 are associated with this dataset.
1765 Notes
1766 -----
1767 When a predicted URI is requested an attempt will be made to form
1768 a reasonable URI based on file templates and the expected formatter.
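Examples
--------
A sketch under the same assumptions as above; the "#predicted" fragment
marks a URI for a dataset that has not yet been written:

>>> uri = datastore.getURI(ref, predict=True)
>>> if str(uri).endswith("#predicted"):
...     print("dataset not yet written")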
1769 """
1770 primary, components = self.getURIs(ref, predict)
1771 if primary is None or components: 1771 ↛ 1772 (condition was never true)
1772 raise RuntimeError(
1773 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1774 )
1775 return primary
1777 def _predict_URIs(
1778 self,
1779 ref: DatasetRef,
1780 ) -> DatasetRefURIs:
1781 """Predict the URIs of a dataset ref.
1783 Parameters
1784 ----------
1785 ref : `DatasetRef`
1786 Reference to the required Dataset.
1788 Returns
1789 -------
1790 uris : `DatasetRefURIs`
1791 Primary and component URIs. URIs will contain a URI fragment
1792 "#predicted".
1793 """
1794 uris = DatasetRefURIs()
1796 if self.composites.shouldBeDisassembled(ref):
1797 for component, _ in ref.datasetType.storageClass.components.items():
1798 comp_ref = ref.makeComponentRef(component)
1799 comp_location, _ = self._determine_put_formatter_location(comp_ref)
1801 # Add the "#predicted" URI fragment to indicate this is a
1802 # guess
1803 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted")
1805 else:
1806 location, _ = self._determine_put_formatter_location(ref)
1808 # Add the "#predicted" URI fragment to indicate this is a guess
1809 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted")
1811 return uris
1813 def getManyURIs(
1814 self,
1815 refs: Iterable[DatasetRef],
1816 predict: bool = False,
1817 allow_missing: bool = False,
1818 ) -> Dict[DatasetRef, DatasetRefURIs]:
1819 # Docstring inherited
1821 uris: Dict[DatasetRef, DatasetRefURIs] = {}
1823 records = self._get_stored_records_associated_with_refs(refs)
1824 records_keys = records.keys()
1826 existing_refs = tuple(ref for ref in refs if ref.id in records_keys)
1827 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys)
1829 # Have to handle trustGetRequest mode by checking for the existence
1830 # of the missing refs on disk.
1831 if missing_refs:
1832 dataset_existence = self._mexists_check_expected(missing_refs, None)
1833 really_missing = set()
1834 not_missing = set()
1835 for ref, exists in dataset_existence.items():
1836 if exists:
1837 not_missing.add(ref)
1838 else:
1839 really_missing.add(ref)
1841 if not_missing:
1842 # Need to recalculate the missing/existing split.
1843 existing_refs = existing_refs + tuple(not_missing)
1844 missing_refs = tuple(really_missing)
1846 for ref in missing_refs:
1847 # if this has never been written then we have to guess
1848 if not predict:
1849 if not allow_missing:
1850 raise FileNotFoundError("Dataset {} not in this datastore.".format(ref))
1851 else:
1852 uris[ref] = self._predict_URIs(ref)
1854 for ref in existing_refs:
1855 file_infos = records[ref.getCheckedId()]
1856 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos]
1857 uris[ref] = self._locations_to_URI(ref, file_locations)
1859 return uris
1861 def _locations_to_URI(
1862 self,
1863 ref: DatasetRef,
1864 file_locations: Sequence[Tuple[Location, StoredFileInfo]],
1865 ) -> DatasetRefURIs:
1866 """Convert one or more file locations associated with a DatasetRef
1867 to a DatasetRefURIs.
1869 Parameters
1870 ----------
1871 ref : `DatasetRef`
1872 Reference to the dataset.
1873 file_locations : Sequence[Tuple[Location, StoredFileInfo]]
1874 Each item in the sequence is the location of the dataset within the
1875 datastore and stored information about the file and its formatter.
1876 If there is only one item in the sequence then it is treated as the
1877 primary URI. If there is more than one item then they are treated
1878 as component URIs. If there are no items then an error is raised
1879 unless ``self.trustGetRequest`` is `True`.
1881 Returns
1882 -------
1883 uris : `DatasetRefURIs`
1884 Represents the primary URI or component URIs described by the
1885 inputs.
1887 Raises
1888 ------
1889 RuntimeError
1890 If no file locations are passed in and ``self.trustGetRequest`` is
1891 `False`.
1892 FileNotFoundError
1893 If a passed-in URI does not exist and ``self.trustGetRequest``
1894 is `False`.
1895 RuntimeError
1896 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is
1897 unexpected).
1898 """
1900 guessing = False
1901 uris = DatasetRefURIs()
1903 if not file_locations:
1904 if not self.trustGetRequest: 1904 ↛ 1905 (condition was never true)
1905 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1906 file_locations = self._get_expected_dataset_locations_info(ref)
1907 guessing = True
1909 if len(file_locations) == 1:
1910 # No disassembly so this is the primary URI
1911 uris.primaryURI = file_locations[0][0].uri
1912 if guessing and not uris.primaryURI.exists(): 1912 ↛ 1913 (condition was never true)
1913 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist")
1914 else:
1915 for location, file_info in file_locations:
1916 if file_info.component is None: 1916 ↛ 1917 (condition was never true)
1917 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1918 if guessing and not location.uri.exists(): 1918 ↛ 1922 (condition was never true)
1919 # If we are trusting then it is entirely possible for
1920 # some components to be missing. In that case we skip
1921 # to the next component.
1922 if self.trustGetRequest:
1923 continue
1924 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist")
1925 uris.componentURIs[file_info.component] = location.uri
1927 return uris
1929 def retrieveArtifacts(
1930 self,
1931 refs: Iterable[DatasetRef],
1932 destination: ResourcePath,
1933 transfer: str = "auto",
1934 preserve_path: bool = True,
1935 overwrite: bool = False,
1936 ) -> List[ResourcePath]:
1937 """Retrieve the file artifacts associated with the supplied refs.
1939 Parameters
1940 ----------
1941 refs : iterable of `DatasetRef`
1942 The datasets for which file artifacts are to be retrieved.
1943 A single ref can result in multiple files. The refs must
1944 be resolved.
1945 destination : `lsst.resources.ResourcePath`
1946 Location to write the file artifacts.
1947 transfer : `str`, optional
1948 Method to use to transfer the artifacts. Must be one of the options
1949 supported by `lsst.resources.ResourcePath.transfer_from()`.
1950 "move" is not allowed.
1951 preserve_path : `bool`, optional
1952 If `True` the full path of the file artifact within the datastore
1953 is preserved. If `False` the final file component of the path
1954 is used.
1955 overwrite : `bool`, optional
1956 If `True` allow transfers to overwrite existing files at the
1957 destination.
1959 Returns
1960 -------
1961 targets : `list` of `lsst.resources.ResourcePath`
1962 URIs of file artifacts in destination location. Order is not
1963 preserved.
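Examples
--------
Illustrative sketch, not from the original source; the destination path is
an assumption:

>>> from lsst.resources import ResourcePath
>>> dest = ResourcePath("/tmp/artifact_export/", forceDirectory=True)
>>> copied = datastore.retrieveArtifacts(refs, dest, transfer="copy")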
1964 """
1965 if not destination.isdir(): 1965 ↛ 1966 (condition was never true)
1966 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1968 if transfer == "move":
1969 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1971 # Source -> Destination
1972 # This also helps filter out duplicate DatasetRef in the request
1973 # that will map to the same underlying file transfer.
1974 to_transfer: Dict[ResourcePath, ResourcePath] = {}
1976 for ref in refs:
1977 locations = self._get_dataset_locations_info(ref)
1978 for location, _ in locations:
1979 source_uri = location.uri
1980 target_path: ResourcePathExpression
1981 if preserve_path:
1982 target_path = location.pathInStore
1983 if target_path.isabs(): 1983 ↛ 1986 (condition was never true)
1984 # This is an absolute path to an external file.
1985 # Use the full path.
1986 target_path = target_path.relativeToPathRoot
1987 else:
1988 target_path = source_uri.basename()
1989 target_uri = destination.join(target_path)
1990 to_transfer[source_uri] = target_uri
1992 # In theory can now parallelize the transfer
1993 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1994 for source_uri, target_uri in to_transfer.items():
1995 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1997 return list(to_transfer.values())
1999 def get(
2000 self,
2001 ref: DatasetRef,
2002 parameters: Optional[Mapping[str, Any]] = None,
2003 storageClass: Optional[Union[StorageClass, str]] = None,
2004 ) -> Any:
2005 """Load an InMemoryDataset from the store.
2007 Parameters
2008 ----------
2009 ref : `DatasetRef`
2010 Reference to the required Dataset.
2011 parameters : `dict`
2012 `StorageClass`-specific parameters that specify, for example,
2013 a slice of the dataset to be loaded.
2014 storageClass : `StorageClass` or `str`, optional
2015 The storage class to be used to override the Python type
2016 returned by this method. By default the returned type matches
2017 the dataset type definition for this dataset. Specifying a
2018 read `StorageClass` can force a different type to be returned.
2019 This type must be compatible with the original type.
2021 Returns
2022 -------
2023 inMemoryDataset : `object`
2024 Requested dataset or slice thereof as an InMemoryDataset.
2026 Raises
2027 ------
2028 FileNotFoundError
2029 Requested dataset can not be retrieved.
2030 TypeError
2031 Return value from formatter has unexpected type.
2032 ValueError
2033 Formatter failed to process the dataset.
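Examples
--------
A hedged sketch (assumed names); the available ``parameters`` depend on the
storage class of the dataset:

>>> dataset = datastore.get(ref)
>>> cutout = datastore.get(ref, parameters={"bbox": bbox})  # ``bbox`` is an assumed, storage-class-specific parameter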
2034 """
2035 # Supplied storage class for the component being read is either
2036 # from the ref itself or from an override if we want to force
2037 # type conversion.
2038 if storageClass is not None:
2039 ref = ref.overrideStorageClass(storageClass)
2040 refStorageClass = ref.datasetType.storageClass
2042 allGetInfo = self._prepare_for_get(ref, parameters)
2043 refComponent = ref.datasetType.component()
2045 # Create mapping from component name to related info
2046 allComponents = {i.component: i for i in allGetInfo}
2048 # By definition the dataset is disassembled if we have more
2049 # than one record for it.
2050 isDisassembled = len(allGetInfo) > 1
2052 # Look for the special case where we are disassembled but the
2053 # component is a derived component that was not written during
2054 # disassembly. For this scenario we need to check that the
2055 # component requested is listed as a derived component for the
2056 # composite storage class
2057 isDisassembledReadOnlyComponent = False
2058 if isDisassembled and refComponent:
2059 # The composite storage class should be accessible through
2060 # the component dataset type
2061 compositeStorageClass = ref.datasetType.parentStorageClass
2063 # In the unlikely scenario where the composite storage
2064 # class is not known, we can only assume that this is a
2065 # normal component. If that assumption is wrong then the
2066 # branch below that reads a persisted component will fail
2067 # so there is no need to complain here.
2068 if compositeStorageClass is not None: 2068 ↛ 2071 (condition was never false)
2069 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
2071 if isDisassembled and not refComponent:
2072 # This was a disassembled dataset spread over multiple files
2073 # and we need to put them all back together again.
2074 # Read into memory and then assemble
2076 # Check that the supplied parameters are suitable for the type read
2077 refStorageClass.validateParameters(parameters)
2079 # We want to keep track of all the parameters that were not used
2080 # by formatters. We assume that if any of the component formatters
2081 # use a parameter that we do not need to apply it again in the
2082 # assembler.
2083 usedParams = set()
2085 components: Dict[str, Any] = {}
2086 for getInfo in allGetInfo:
2087 # assemblerParams are parameters not understood by the
2088 # associated formatter.
2089 usedParams.update(set(getInfo.formatterParams))
2091 component = getInfo.component
2093 if component is None: 2093 ↛ 2094 (condition was never true)
2094 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
2096 # We do not want the formatter to think it's reading
2097 # a component though because it is really reading a
2098 # standalone dataset -- always tell reader it is not a
2099 # component.
2100 components[component] = self._read_artifact_into_memory(
2101 getInfo, ref.makeComponentRef(component), isComponent=False
2102 )
2104 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
2106 # Any unused parameters will have to be passed to the assembler
2107 if parameters:
2108 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
2109 else:
2110 unusedParams = {}
2112 # Process parameters
2113 return ref.datasetType.storageClass.delegate().handleParameters(
2114 inMemoryDataset, parameters=unusedParams
2115 )
2117 elif isDisassembledReadOnlyComponent:
2118 compositeStorageClass = ref.datasetType.parentStorageClass
2119 if compositeStorageClass is None: 2119 ↛ 2120 (condition was never true)
2120 raise RuntimeError(
2121 f"Unable to retrieve derived component '{refComponent}' since"
2122 "no composite storage class is available."
2123 )
2125 if refComponent is None: 2125 ↛ 2127 (condition was never true)
2126 # Mainly for mypy
2127 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
2129 # Assume that every derived component can be calculated by
2130 # forwarding the request to a single read/write component.
2131 # Rather than guessing which rw component is the right one by
2132 # scanning each for a derived component of the same name,
2133 # we ask the storage class delegate directly which one is best to
2134 # use.
2135 compositeDelegate = compositeStorageClass.delegate()
2136 forwardedComponent = compositeDelegate.selectResponsibleComponent(
2137 refComponent, set(allComponents)
2138 )
2140 # Select the relevant component
2141 rwInfo = allComponents[forwardedComponent]
2143 # For now assume that read parameters are validated against
2144 # the real component and not the requested component
2145 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
2146 forwardedStorageClass.validateParameters(parameters)
2148 # The reference to use for the caching must refer to the forwarded
2149 # component and not the derived component.
2150 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
2152 # Unfortunately the FileDescriptor inside the formatter will have
2153 # the wrong write storage class so we need to create a new one
2154 # given the immutability constraint.
2155 writeStorageClass = rwInfo.info.storageClass
2157 # We may need to put some thought into parameters for read
2158 # components but for now forward them on as is
2159 readFormatter = type(rwInfo.formatter)(
2160 FileDescriptor(
2161 rwInfo.location,
2162 readStorageClass=refStorageClass,
2163 storageClass=writeStorageClass,
2164 parameters=parameters,
2165 ),
2166 ref.dataId,
2167 )
2169 # The assembler can not receive any parameter requests for a
2170 # derived component at this time since the assembler will
2171 # see the storage class of the derived component and those
2172 # parameters will have to be handled by the formatter on the
2173 # forwarded storage class.
2174 assemblerParams: Dict[str, Any] = {}
2176 # Need to create a new info that specifies the derived
2177 # component and associated storage class
2178 readInfo = DatastoreFileGetInformation(
2179 rwInfo.location,
2180 readFormatter,
2181 rwInfo.info,
2182 assemblerParams,
2183 {},
2184 refComponent,
2185 refStorageClass,
2186 )
2188 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2190 else:
2191 # Single file request or component from that composite file
2192 for lookup in (refComponent, None): 2192 ↛ 2197 (the loop didn't complete)
2193 if lookup in allComponents: 2193 ↛ 2192 (condition was never false)
2194 getInfo = allComponents[lookup]
2195 break
2196 else:
2197 raise FileNotFoundError(
2198 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2199 )
2201 # Do not need the component itself if already disassembled
2202 if isDisassembled:
2203 isComponent = False
2204 else:
2205 isComponent = getInfo.component is not None
2207 # For a component read of a composite we want the cache to
2208 # be looking at the composite ref itself.
2209 cache_ref = ref.makeCompositeRef() if isComponent else ref
2211 # For a disassembled component we can validate parameters against
2212 # the component storage class directly
2213 if isDisassembled:
2214 refStorageClass.validateParameters(parameters)
2215 else:
2216 # For an assembled composite this could be a derived
2217 # component derived from a real component. The validity
2218 # of the parameters is not clear. For now validate against
2219 # the composite storage class
2220 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2222 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
2224 @transactional
2225 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2226 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2228 Parameters
2229 ----------
2230 inMemoryDataset : `object`
2231 The dataset to store.
2232 ref : `DatasetRef`
2233 Reference to the associated Dataset.
2235 Raises
2236 ------
2237 TypeError
2238 Supplied object and storage class are inconsistent.
2239 DatasetTypeNotSupportedError
2240 The associated `DatasetType` is not handled by this datastore.
2242 Notes
2243 -----
2244 If the datastore is configured to reject certain dataset types it
2245 is possible that the put will fail and raise a
2246 `DatasetTypeNotSupportedError`. The main use case for this is to
2247 allow `ChainedDatastore` to put to multiple datastores without
2248 requiring that every datastore accepts the dataset.
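Examples
--------
Illustrative only (assumed names); ``inMemoryDataset`` must match the
storage class of ``ref``:

>>> datastore.put(inMemoryDataset, ref)
>>> assert datastore.exists(ref)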
2249 """
2251 doDisassembly = self.composites.shouldBeDisassembled(ref)
2252 # doDisassembly = True
2254 artifacts = []
2255 if doDisassembly:
2256 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2257 if components is None: 2257 ↛ 2258 (condition was never true)
2258 raise RuntimeError(
2259 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2260 f"with storage class {ref.datasetType.storageClass.name} "
2261 "is configured to be disassembled, but cannot be."
2262 )
2263 for component, componentInfo in components.items():
2264 # Don't recurse because we want to take advantage of
2265 # bulk insert -- need a new DatasetRef that refers to the
2266 # same dataset_id but has the component DatasetType.
2267 # DatasetType does not describe the types of its components,
2268 # so we construct the component ref ourselves.
2269 compRef = ref.makeComponentRef(component)
2270 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2271 artifacts.append((compRef, storedInfo))
2272 else:
2273 # Write the entire thing out
2274 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2275 artifacts.append((ref, storedInfo))
2277 self._register_datasets(artifacts)
2279 @transactional
2280 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
2281 # At this point can safely remove these datasets from the cache
2282 # to avoid confusion later on. If they are not trashed later
2283 # the cache will simply be refilled.
2284 self.cacheManager.remove_from_cache(ref)
2286 # If we are in trust mode there will be nothing to move to
2287 # the trash table and we will have to try to delete the file
2288 # immediately.
2289 if self.trustGetRequest:
2290 # Try to keep the logic below for a single file trash.
2291 if isinstance(ref, DatasetRef):
2292 refs = {ref}
2293 else:
2294 # Will recreate ref at the end of this branch.
2295 refs = set(ref)
2297 # Determine which datasets are known to datastore directly.
2298 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
2299 existing_ids = self._get_stored_records_associated_with_refs(refs)
2300 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2302 missing = refs - existing_refs
2303 if missing:
2304 # Do an explicit existence check on these refs.
2305 # We only care about the artifacts at this point and not
2306 # the dataset existence.
2307 artifact_existence: Dict[ResourcePath, bool] = {}
2308 _ = self.mexists(missing, artifact_existence)
2309 uris = [uri for uri, exists in artifact_existence.items() if exists]
2311 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2312 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2313 for uri in uris:
2314 try:
2315 uri.remove()
2316 except Exception as e:
2317 if ignore_errors:
2318 log.debug("Artifact %s could not be removed: %s", uri, e)
2319 continue
2320 raise
2322 # There is no point asking the code below to remove refs we
2323 # know are missing so update it with the list of existing
2324 # records. Try to retain one vs many logic.
2325 if not existing_refs:
2326 # Nothing more to do since none of the datasets were
2327 # known to the datastore record table.
2328 return
2329 ref = list(existing_refs)
2330 if len(ref) == 1:
2331 ref = ref[0]
2333 # Get file metadata and internal metadata
2334 if not isinstance(ref, DatasetRef):
2335 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2336 # Assumed to be an iterable of refs so bulk mode enabled.
2337 try:
2338 self.bridge.moveToTrash(ref, transaction=self._transaction)
2339 except Exception as e:
2340 if ignore_errors:
2341 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2342 else:
2343 raise
2344 return
2346 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2348 fileLocations = self._get_dataset_locations_info(ref)
2350 if not fileLocations:
2351 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2352 if ignore_errors:
2353 log.warning(err_msg)
2354 return
2355 else:
2356 raise FileNotFoundError(err_msg)
2358 for location, storedFileInfo in fileLocations:
2359 if not self._artifact_exists(location): 2359 ↛ 2360
2360 err_msg = (
2361 f"Dataset is known to datastore {self.name} but "
2362 f"associated artifact ({location.uri}) is missing"
2363 )
2364 if ignore_errors:
2365 log.warning(err_msg)
2366 return
2367 else:
2368 raise FileNotFoundError(err_msg)
2370 # Mark dataset as trashed
2371 try:
2372 self.bridge.moveToTrash([ref], transaction=self._transaction)
2373 except Exception as e:
2374 if ignore_errors:
2375 log.warning(
2376 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2377 "but encountered an error: %s",
2378 ref,
2379 self.name,
2380 e,
2381 )
2382 pass
2383 else:
2384 raise
2386 @transactional
2387 def emptyTrash(self, ignore_errors: bool = True) -> None:
2388 """Remove all datasets from the trash.
2390 Parameters
2391 ----------
2392 ignore_errors : `bool`
2393 If `True` return without error even if something went wrong.
2394 Problems could occur if another process is simultaneously trying
2395 to delete.
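Examples
--------
A sketch of the usual two-step removal (assumed workflow names, as above):

>>> datastore.trash(refs)
>>> datastore.emptyTrash()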
2396 """
2397 log.debug("Emptying trash in datastore %s", self.name)
2399 # Context manager will empty trash iff we finish it without raising.
2400 # It will also automatically delete the relevant rows from the
2401 # trash table and the records table.
2402 with self.bridge.emptyTrash(
2403 self._table, record_class=StoredFileInfo, record_column="path"
2404 ) as trash_data:
2405 # Removing the artifacts themselves requires that the files are
2406 # not also associated with refs that are not to be trashed.
2407 # Therefore need to do a query with the file paths themselves
2408 # and return all the refs associated with them. Can only delete
2409 # a file if the refs to be trashed are the only refs associated
2410 # with the file.
2411 # This requires multiple copies of the trashed items
2412 trashed, artifacts_to_keep = trash_data
2414 if artifacts_to_keep is None:
2415 # The bridge is not helping us so have to work it out
2416 # ourselves. This is not going to be as efficient.
2417 trashed = list(trashed)
2419 # The instance check is for mypy since up to this point it
2420 # does not know the type of info.
2421 path_map = self._refs_associated_with_artifacts(
2422 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2423 )
2425 for ref, info in trashed:
2426 # Mypy needs to know this is not the base class
2427 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2429 # Check for mypy
2430 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2432 path_map[info.path].remove(ref.id)
2433 if not path_map[info.path]: 2433 ↛ 2425 (condition was never false)
2434 del path_map[info.path]
2436 artifacts_to_keep = set(path_map)
2438 for ref, info in trashed:
2439 # Should not happen for this implementation but need
2440 # to keep mypy happy.
2441 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2443 # Mypy needs to know this is not the base class
2444 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2446 # Check for mypy
2447 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2449 if info.path in artifacts_to_keep:
2450 # This is a multi-dataset artifact and we are not
2451 # removing all associated refs.
2452 continue
2454 # Only trashed refs still known to datastore will be returned.
2455 location = info.file_location(self.locationFactory)
2457 # Point of no return for this artifact
2458 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2459 try:
2460 self._delete_artifact(location)
2461 except FileNotFoundError: 2461 ↛ 2476
2462 # If the file itself has been deleted there is nothing
2463 # we can do about it. It is possible that trash has
2464 # been run in parallel in another process or someone
2465 # decided to delete the file. It is unlikely to come
2466 # back and so we should still continue with the removal
2467 # of the entry from the trash table. It is also possible
2468 # we removed it in a previous iteration if it was
2469 # a multi-dataset artifact. The delete artifact method
2470 # will log a debug message in this scenario.
2471 # Distinguishing a file that was missing before trash started
2472 # from a file already removed earlier in this trash operation
2473 # is not worth the extra bookkeeping and its potential
2474 # memory cost.
2475 pass
2476 except Exception as e:
2477 if ignore_errors:
2478 # Use a debug message here even though it's not
2479 # a good situation. In some cases this can be
2480 # caused by a race between user A and user B
2481 # and neither of them has permissions for the
2482 # other's files. Butler does not know about users
2483 # and trash has no idea what collections these
2484 # files were in (without guessing from a path).
2485 log.debug(
2486 "Encountered error removing artifact %s from datastore %s: %s",
2487 location.uri,
2488 self.name,
2489 e,
2490 )
2491 else:
2492 raise
2494 @transactional
2495 def transfer_from(
2496 self,
2497 source_datastore: Datastore,
2498 refs: Iterable[DatasetRef],
2499 transfer: str = "auto",
2500 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
2501 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
2502 # Docstring inherited
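# Illustrative usage sketch (not from the original source, names assumed):
#
#     accepted, rejected = dest_datastore.transfer_from(
#         source_datastore, refs, transfer="copy"
#     )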
2503 if type(self) is not type(source_datastore):
2504 raise TypeError(
2505 f"Datastore mismatch between this datastore ({type(self)}) and the "
2506 f"source datastore ({type(source_datastore)})."
2507 )
2509 # Be explicit for mypy
2510 if not isinstance(source_datastore, FileDatastore): 2510 ↛ 2511 (condition was never true)
2511 raise TypeError(
2512 "Can only transfer to a FileDatastore from another FileDatastore, not"
2513 f" {type(source_datastore)}"
2514 )
2516 # Stop early if "direct" transfer mode is requested. That would
2517 # require that the URI inside the source datastore should be stored
2518 # directly in the target datastore, which seems unlikely to be useful
2519 # since at any moment the source datastore could delete the file.
2520 if transfer in ("direct", "split"):
2521 raise ValueError(
2522 f"Can not transfer from a source datastore using {transfer} mode since"
2523 " those files are controlled by the other datastore."
2524 )
2526 # Empty existence lookup if none given.
2527 if artifact_existence is None:
2528 artifact_existence = {}
2530 # We will go through the list multiple times so must convert
2531 # generators to lists.
2532 refs = list(refs)
2534 # In order to handle disassembled composites the code works
2535 # at the records level since it can assume that internal APIs
2536 # can be used.
2537 # - If the record already exists in the destination this is assumed
2538 # to be okay.
2539 # - If there is no record but the source and destination URIs are
2540 # identical no transfer is done but the record is added.
2541 # - If the source record refers to an absolute URI we currently assume
2542 # that the URI should remain absolute and will be visible to the
2543 # destination butler. May need to have a flag to indicate whether
2544 # the dataset should be transferred. This will only happen if
2545 # the detached Butler has had a local ingest.
2547 # What we really want is all the records in the source datastore
2548 # associated with these refs. Or derived ones if they don't exist
2549 # in the source.
2550 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2552 # The source dataset_ids are the keys in these records
2553 source_ids = set(source_records)
2554 log.debug("Number of datastore records found in source: %d", len(source_ids))
2556 # The not None check is to appease mypy
2557 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2558 missing_ids = requested_ids - source_ids
2560 # Missing IDs can be okay if that datastore has allowed
2561 # gets based on file existence. Should we transfer what we can
2562 # or complain about it and warn?
2563 if missing_ids and not source_datastore.trustGetRequest: 2563 ↛ 2564 (condition was never true)
2564 raise ValueError(
2565 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2566 )
2568 # Need to map these missing IDs to a DatasetRef so we can guess
2569 # the details.
2570 if missing_ids:
2571 log.info(
2572 "Number of expected datasets missing from source datastore records: %d out of %d",
2573 len(missing_ids),
2574 len(requested_ids),
2575 )
2576 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2578 # This should be chunked in case we end up having to check
2579 # the file store since we need some log output to show
2580 # progress.
2581 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2582 records = {}
2583 for missing in missing_ids_chunk:
2584 # Ask the source datastore where the missing artifacts
2585 # should be. An execution butler might not know about the
2586 # artifacts even if they are there.
2587 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2588 records[missing] = [info for _, info in expected]
2590 # Call the mexists helper method in case we have not already
2591 # checked these artifacts such that artifact_existence is
2592 # empty. This allows us to benefit from parallelism.
2593 # datastore.mexists() itself does not give us access to the
2594 # derived datastore record.
2595 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2596 ref_exists = source_datastore._process_mexists_records(
2597 id_to_ref, records, False, artifact_existence=artifact_existence
2598 )
2600 # Now go through the records and propagate the ones that exist.
2601 location_factory = source_datastore.locationFactory
2602 for missing, record_list in records.items():
2603 # Skip completely if the ref does not exist.
2604 ref = id_to_ref[missing]
2605 if not ref_exists[ref]:
2606 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2607 continue
2608 # Check for file artifact existence to decide which parts of a
2609 # disassembled composite do exist. If there is only a
2610 # single record we don't even need to look because it can't
2611 # be a composite and must exist.
2612 if len(record_list) == 1:
2613 dataset_records = record_list
2614 else:
2615 dataset_records = [
2616 record
2617 for record in record_list
2618 if artifact_existence[record.file_location(location_factory).uri]
2619 ]
2620 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2622 # Rely on source_records being a defaultdict.
2623 source_records[missing].extend(dataset_records)
2625 # See if we already have these records
2626 target_records = self._get_stored_records_associated_with_refs(refs)
2628 # The artifacts to register
2629 artifacts = []
2631 # Refs that already exist
2632 already_present = []
2634 # Refs that were rejected by this datastore.
2635 rejected = set()
2637 # Refs that were transferred successfully.
2638 accepted = set()
2640 # Record each time we have done a "direct" transfer.
2641 direct_transfers = []
2643 # Now can transfer the artifacts
2644 for ref in refs:
2645 if not self.constraints.isAcceptable(ref): 2645 ↛ 2647 (condition was never true)
2646 # This datastore should not be accepting this dataset.
2647 rejected.add(ref)
2648 continue
2650 accepted.add(ref)
2652 if ref.id in target_records:
2653 # Already have an artifact for this.
2654 already_present.append(ref)
2655 continue
2657 # mypy needs to know these are always resolved refs
2658 for info in source_records[ref.getCheckedId()]:
2659 source_location = info.file_location(source_datastore.locationFactory)
2660 target_location = info.file_location(self.locationFactory)
2661 if source_location == target_location and not source_location.pathInStore.isabs(): 2661 ↛ 2664 (condition was never true)
2662 # Artifact is already in the target location.
2663 # (which is how execution butler currently runs)
2664 pass
2665 else:
2666 if target_location.pathInStore.isabs():
2667 # Just because we can see the artifact when running
2668 # the transfer doesn't mean it will be generally
2669 # accessible to a user of this butler. Need to decide
2670 # what to do about an absolute path.
2671 if transfer == "auto":
2672 # For "auto" transfers we allow the absolute URI
2673 # to be recorded in the target datastore.
2674 direct_transfers.append(source_location)
2675 else:
2676 # The user is explicitly requesting a transfer
2677 # even for an absolute URI. This requires us to
2678 # calculate the target path.
2679 template_ref = ref
2680 if info.component: 2680 ↛ 2681 (condition was never true)
2681 template_ref = ref.makeComponentRef(info.component)
2682 target_location = self._calculate_ingested_datastore_name(
2683 source_location.uri,
2684 template_ref,
2685 )
2687 info = info.update(path=target_location.pathInStore.path)
2689 # Need to transfer it to the new location.
2690 # Assume we should always overwrite. If the artifact
2691 # is there this might indicate that a previous transfer
2692 # was interrupted but was not able to be rolled back
2693 # completely (eg pre-emption) so follow Datastore default
2694 # and overwrite.
2695 target_location.uri.transfer_from(
2696 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2697 )
2699 artifacts.append((ref, info))
2701 if direct_transfers:
2702 log.info(
2703 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s",
2704 len(direct_transfers),
2705 "" if len(direct_transfers) == 1 else "s",
2706 )
2708 self._register_datasets(artifacts)
2710 if already_present:
2711 n_skipped = len(already_present)
2712 log.info(
2713 "Skipped transfer of %d dataset%s already present in datastore",
2714 n_skipped,
2715 "" if n_skipped == 1 else "s",
2716 )
2718 return accepted, rejected
2720 @transactional
2721 def forget(self, refs: Iterable[DatasetRef]) -> None:
2722 # Docstring inherited.
2723 refs = list(refs)
2724 self.bridge.forget(refs)
2725 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2727 def validateConfiguration(
2728 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
2729 ) -> None:
2730 """Validate some of the configuration for this datastore.
2732 Parameters
2733 ----------
2734 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2735 Entities to test against this configuration. Can be differing
2736 types.
2737 logFailures : `bool`, optional
2738 If `True`, output a log message for every validation error
2739 detected.
2741 Raises
2742 ------
2743 DatastoreValidationError
2744 Raised if there is a validation problem with a configuration.
2745 All the problems are reported in a single exception.
2747 Notes
2748 -----
2749 This method checks that all the supplied entities have valid file
2750 templates and also have formatters defined.
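Examples
--------
Sketch only (assumed names ``datastore`` and ``refs``):

>>> datastore.validateConfiguration(
...     [ref.datasetType for ref in refs], logFailures=True
... )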
2751 """
2753 templateFailed = None
2754 try:
2755 self.templates.validateTemplates(entities, logFailures=logFailures)
2756 except FileTemplateValidationError as e:
2757 templateFailed = str(e)
2759 formatterFailed = []
2760 for entity in entities:
2761 try:
2762 self.formatterFactory.getFormatterClass(entity)
2763 except KeyError as e:
2764 formatterFailed.append(str(e))
2765 if logFailures: 2765 ↛ 2760 (condition was never false)
2766 log.critical("Formatter failure: %s", e)
2768 if templateFailed or formatterFailed:
2769 messages = []
2770 if templateFailed: 2770 ↛ 2771 (condition was never true)
2771 messages.append(templateFailed)
2772 if formatterFailed: 2772 ↛ 2774 (condition was never false)
2773 messages.append(",".join(formatterFailed))
2774 msg = ";\n".join(messages)
2775 raise DatastoreValidationError(msg)
2777 def getLookupKeys(self) -> Set[LookupKey]:
2778 # Docstring is inherited from base class
2779 return (
2780 self.templates.getLookupKeys()
2781 | self.formatterFactory.getLookupKeys()
2782 | self.constraints.getLookupKeys()
2783 )
2785 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2786 # Docstring is inherited from base class
2787 # The key can be valid in either formatters or templates so we can
2788 # only check the template if it exists
2789 if lookupKey in self.templates:
2790 try:
2791 self.templates[lookupKey].validateTemplate(entity)
2792 except FileTemplateValidationError as e:
2793 raise DatastoreValidationError(e) from e
2795 def export(
2796 self,
2797 refs: Iterable[DatasetRef],
2798 *,
2799 directory: Optional[ResourcePathExpression] = None,
2800 transfer: Optional[str] = "auto",
2801 ) -> Iterable[FileDataset]:
2802 # Docstring inherited from Datastore.export.
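# Illustrative usage sketch (not from the original source, names assumed):
#
#     exported = list(
#         datastore.export(refs, directory="export_dir/", transfer="copy")
#     )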
2803 if transfer == "auto" and directory is None:
2804 transfer = None
2806 if transfer is not None and directory is None:
2807 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2809 if transfer == "move":
2810 raise TypeError("Can not export by moving files out of datastore.")
2811 elif transfer == "direct": 2811 ↛ 2815 (condition was never true)
2812 # For an export, treat this as equivalent to None. We do not
2813 # want an import to risk using absolute URIs to datasets owned
2814 # by another datastore.
2815 log.info("Treating 'direct' transfer mode as in-place export.")
2816 transfer = None
2818 # Force the directory to be a URI object
2819 directoryUri: Optional[ResourcePath] = None
2820 if directory is not None:
2821 directoryUri = ResourcePath(directory, forceDirectory=True)
2823 if transfer is not None and directoryUri is not None:
2824 # mypy needs the second test
2825 if not directoryUri.exists(): 2825 ↛ 2826 (condition was never true)
2826 raise FileNotFoundError(f"Export location {directory} does not exist")
2828 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2829 for ref in progress.wrap(refs, "Exporting dataset files"):
2830 fileLocations = self._get_dataset_locations_info(ref)
2831 if not fileLocations:
2832 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2833 # For now we can not export disassembled datasets
2834 if len(fileLocations) > 1:
2835 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2836 location, storedFileInfo = fileLocations[0]
2838 pathInStore = location.pathInStore.path
2839 if transfer is None:
2840 # TODO: do we also need to return the readStorageClass somehow?
2841 # We will use the path in store directly. If this is an
2842 # absolute URI, preserve it.
2843 if location.pathInStore.isabs(): 2843 ↛ 2844 (condition was never true)
2844 pathInStore = str(location.uri)
2845 elif transfer == "direct": 2845 ↛ 2847 (condition was never true)
2846 # Use full URIs to the remote store in the export
2847 pathInStore = str(location.uri)
2848 else:
2849 # mypy needs help
2850 assert directoryUri is not None, "directoryUri must be defined to get here"
2851 storeUri = ResourcePath(location.uri)
2853 # if the datastore has an absolute URI to a resource, we
2854 # have two options:
2855 # 1. Keep the absolute URI in the exported YAML
2856 # 2. Allocate a new name in the local datastore and transfer
2857 # it.
2858 # For now go with option 2
2859 if location.pathInStore.isabs(): 2859 ↛ 2860 (condition was never true)
2860 template = self.templates.getTemplate(ref)
2861 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2862 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2864 exportUri = directoryUri.join(pathInStore)
2865 exportUri.transfer_from(storeUri, transfer=transfer)
2867 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2869 @staticmethod
2870 def computeChecksum(
2871 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192
2872 ) -> Optional[str]:
2873 """Compute the checksum of the supplied file.
2875 Parameters
2876 ----------
2877 uri : `lsst.resources.ResourcePath`
2878 Name of resource to calculate checksum from.
2879 algorithm : `str`, optional
2880 Name of algorithm to use. Must be one of the algorithms supported
2881 by :py:mod:`hashlib`.
2882 block_size : `int`
2883 Number of bytes to read from file at one time.
2885 Returns
2886 -------
2887 hexdigest : `str`
2888 Hex digest of the file.
2890 Notes
2891 -----
2892 Currently returns None if the URI is for a remote resource.
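Examples
--------
A sketch for a local file; the path and algorithm choice are assumptions:

>>> from lsst.resources import ResourcePath
>>> digest = FileDatastore.computeChecksum(
...     ResourcePath("file.fits"), algorithm="md5"
... )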
2893 """
2894 if algorithm not in hashlib.algorithms_guaranteed: 2894 ↛ 2895 (condition was never true)
2895 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2897 if not uri.isLocal: 2897 ↛ 2898 (condition was never true)
2898 return None
2900 hasher = hashlib.new(algorithm)
2902 with uri.as_local() as local_uri:
2903 with open(local_uri.ospath, "rb") as f:
2904 for chunk in iter(lambda: f.read(block_size), b""):
2905 hasher.update(chunk)
2907 return hasher.hexdigest()
2909 def needs_expanded_data_ids(
2910 self,
2911 transfer: Optional[str],
2912 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2913 ) -> bool:
2914 # Docstring inherited.
2915 # This _could_ also use entity to inspect whether the filename template
2916 # involves placeholders other than the required dimensions for its
2917 # dataset type, but that's not necessary for correctness; it just
2918 # enables more optimizations (perhaps only in theory).
2919 return transfer not in ("direct", None)
2921 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2922 # Docstring inherited from the base class.
2923 record_data = data.get(self.name)
2924 if not record_data: 2924 ↛ 2925 (condition was never true)
2925 return
2927 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys())
2929 # TODO: Verify that there are no unexpected table names in the dict?
2930 unpacked_records = []
2931 for dataset_data in record_data.records.values():
2932 records = dataset_data.get(self._table.name)
2933 if records: 2933 ↛ 2931 (condition was never false)
2934 for info in records:
2935 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2936 unpacked_records.append(info.to_record())
2937 if unpacked_records:
2938 self._table.insert(*unpacked_records, transaction=self._transaction)
2940 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2941 # Docstring inherited from the base class.
2942 exported_refs = list(self._bridge.check(refs))
2943 ids = {ref.getCheckedId() for ref in exported_refs}
2944 records: defaultdict[DatasetId, defaultdict[str, List[StoredDatastoreItemInfo]]] = defaultdict(
2945 lambda: defaultdict(list), {id: defaultdict(list) for id in ids}
2946 )
2947 for row in self._table.fetch(dataset_id=ids):
2948 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row)
2949 records[info.dataset_id][self._table.name].append(info)
2951 record_data = DatastoreRecordData(records=records)
2952 return {self.name: record_data}
2954 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None:
2955 # Docstring inherited from the base class.
2956 self._retrieve_dataset_method = method
2958 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef:
2959 """Update dataset reference to use the storage class from registry.
2961 This does nothing for regular datastores, and is only enabled for
2962 trusted mode, where we need to use the registry definition of the storage
2963 class for some datastore methods. `set_retrieve_dataset_type_method` has to
2964 be called beforehand.
2965 """
2966 if self.trustGetRequest:
2967 if self._retrieve_dataset_method is None:
2968 # We could raise an exception here but unit tests do not define
2969 # this method.
2970 return ref
2971 dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
2972 if dataset_type is not None: 2972 ↛ 2974 (condition was never false)
2973 ref = ref.overrideStorageClass(dataset_type.storageClass)
2974 return ref