Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 84%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 Any,
34 ClassVar,
35 Dict,
36 Iterable,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Tuple,
42 Type,
43 Union,
44)
46from lsst.daf.butler import (
47 CompositesMap,
48 Config,
49 DatasetId,
50 DatasetRef,
51 DatasetType,
52 DatasetTypeNotSupportedError,
53 Datastore,
54 DatastoreCacheManager,
55 DatastoreConfig,
56 DatastoreDisabledCacheManager,
57 DatastoreRecordData,
58 DatastoreValidationError,
59 FileDataset,
60 FileDescriptor,
61 FileTemplates,
62 FileTemplateValidationError,
63 Formatter,
64 FormatterFactory,
65 Location,
66 LocationFactory,
67 Progress,
68 StorageClass,
69 StoredDatastoreItemInfo,
70 StoredFileInfo,
71 ddl,
72)
73from lsst.daf.butler.core.repoRelocation import replaceRoot
74from lsst.daf.butler.core.utils import transactional
75from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
76from lsst.resources import ResourcePath, ResourcePathExpression
77from lsst.utils.introspection import get_class_of, get_instance_of
78from lsst.utils.iteration import chunk_iterable
80# For VERBOSE logging usage.
81from lsst.utils.logging import VERBOSE, getLogger
82from lsst.utils.timer import time_this
83from sqlalchemy import BigInteger, String
85from .genericDatastore import GenericBaseDatastore
87if TYPE_CHECKING: 87 ↛ 88: line 87 didn't jump to line 88, because the condition on line 87 was never true
88 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
89 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
91log = getLogger(__name__)
94class _IngestPrepData(Datastore.IngestPrepData):
95 """Helper class for FileDatastore ingest implementation.
97 Parameters
98 ----------
99 datasets : `list` of `FileDataset`
100 Files to be ingested by this datastore.
101 """
103 def __init__(self, datasets: List[FileDataset]):
104 super().__init__(ref for dataset in datasets for ref in dataset.refs)
105 self.datasets = datasets
108@dataclass(frozen=True)
109class DatastoreFileGetInformation:
110 """Collection of useful parameters needed to retrieve a file from
111 a Datastore.
112 """
114 location: Location
115 """The location from which to read the dataset."""
117 formatter: Formatter
118 """The `Formatter` to use to deserialize the dataset."""
120 info: StoredFileInfo
121 """Stored information about this file and its formatter."""
123 assemblerParams: Mapping[str, Any]
124 """Parameters to use for post-processing the retrieved dataset."""
126 formatterParams: Mapping[str, Any]
127 """Parameters that were understood by the associated formatter."""
129 component: Optional[str]
130 """The component to be retrieved (can be `None`)."""
132 readStorageClass: StorageClass
133 """The `StorageClass` of the dataset being read."""
136class FileDatastore(GenericBaseDatastore):
137 """Generic Datastore for file-based implementations.
139 Should always be sub-classed since key abstract methods are missing.
141 Parameters
142 ----------
143 config : `DatastoreConfig` or `str`
144 Configuration as either a `Config` object or URI to file.
145 bridgeManager : `DatastoreRegistryBridgeManager`
146 Object that manages the interface between `Registry` and datastores.
147 butlerRoot : `str`, optional
148 New datastore root to use to override the configuration value.
150 Raises
151 ------
152 ValueError
153 If root location does not exist and ``create`` is `False` in the
154 configuration.
155 """
157 defaultConfigFile: ClassVar[Optional[str]] = None
158 """Path to configuration defaults. Accessed within the ``config`` resource
159 or relative to a search path. Can be None if no defaults specified.
160 """
162 root: ResourcePath
163 """Root directory URI of this `Datastore`."""
165 locationFactory: LocationFactory
166 """Factory for creating locations relative to the datastore root."""
168 formatterFactory: FormatterFactory
169 """Factory for creating instances of formatters."""
171 templates: FileTemplates
172 """File templates that can be used by this `Datastore`."""
174 composites: CompositesMap
175 """Determines whether a dataset should be disassembled on put."""
177 defaultConfigFile = "datastores/fileDatastore.yaml"
178 """Path to configuration defaults. Accessed within the ``config`` resource
179 or relative to a search path. Can be None if no defaults specified.
180 """
182 @classmethod
183 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
184 """Set any filesystem-dependent config options for this Datastore to
185 be appropriate for a new empty repository with the given root.
187 Parameters
188 ----------
189 root : `str`
190 URI to the root of the data repository.
191 config : `Config`
192 A `Config` to update. Only the subset understood by
193 this component will be updated. Will not expand
194 defaults.
195 full : `Config`
196 A complete config with all defaults expanded that can be
197 converted to a `DatastoreConfig`. Read-only and will not be
198 modified by this method.
199 Repository-specific options that should not be obtained
200 from defaults when Butler instances are constructed
201 should be copied from ``full`` to ``config``.
202 overwrite : `bool`, optional
203 If `False`, do not modify a value in ``config`` if the value
204 already exists. Default is always to overwrite with the provided
205 ``root``.
207 Notes
208 -----
209 If a keyword is explicitly defined in the supplied ``config`` it
210 will not be overridden by this method if ``overwrite`` is `False`.
211 This allows explicit values set in external configs to be retained.
212 """
213 Config.updateParameters(
214 DatastoreConfig,
215 config,
216 full,
217 toUpdate={"root": root},
218 toCopy=("cls", ("records", "table")),
219 overwrite=overwrite,
220 )
222 @classmethod
223 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
224 return ddl.TableSpec(
225 fields=[
226 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
227 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
228 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
229 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
230 # Use empty string to indicate no component
231 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
232 # TODO: should checksum be Base64Bytes instead?
233 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
234 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
235 ],
236 unique=frozenset(),
237 indexes=[tuple(["path"])],
238 )
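# Illustrative sketch: one row of the opaque records table whose schema is
# defined by makeTableSpec() above. The field names come from the spec; every
# value below (dataset_id, path, formatter, storage class, size) is an
# assumption for illustration only, not taken from a real repository.
_example_datastore_record = {
    "dataset_id": "00000000-0000-0000-0000-000000000000",  # primary key; actual type depends on datasetIdColumnType
    "path": "run/datasetType/datasetType_run_v1.json",  # relative to the datastore root
    "formatter": "lsst.daf.butler.formatters.json.JsonFormatter",
    "storage_class": "StructuredDataDict",
    "component": "",  # empty string means "no component" (see the comment in the spec)
    "checksum": None,  # only populated when the "checksum" config option is enabled
    "file_size": 1024,
}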
240 def __init__(
241 self,
242 config: Union[DatastoreConfig, str],
243 bridgeManager: DatastoreRegistryBridgeManager,
244 butlerRoot: Optional[str] = None,
245 ):
246 super().__init__(config, bridgeManager)
247 if "root" not in self.config: 247 ↛ 248line 247 didn't jump to line 248, because the condition on line 247 was never true
248 raise ValueError("No root directory specified in configuration")
250 self._bridgeManager = bridgeManager
252 # Name ourselves either using an explicit name or a name
253 # derived from the (unexpanded) root
254 if "name" in self.config:
255 self.name = self.config["name"]
256 else:
257 # We use the unexpanded root in the name to indicate that this
258 # datastore can be moved without having to update registry.
259 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
261 # Support repository relocation in config
262 # Existence of self.root is checked in subclass
263 self.root = ResourcePath(
264 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
265 )
267 self.locationFactory = LocationFactory(self.root)
268 self.formatterFactory = FormatterFactory()
270 # Now associate formatters with storage classes
271 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
273 # Read the file naming templates
274 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
276 # See if composites should be disassembled
277 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
279 tableName = self.config["records", "table"]
280 try:
281 # Storage of paths and formatters, keyed by dataset_id
282 self._table = bridgeManager.opaque.register(
283 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
284 )
285 # Interface to Registry.
286 self._bridge = bridgeManager.register(self.name)
287 except ReadOnlyDatabaseError:
288 # If the database is read only and we just tried and failed to
289 # create a table, it means someone is trying to create a read-only
290 # butler client for an empty repo. That should be okay, as long
291 # as they then don't try to get any datasets before some other
292 # client creates the table. Chances are they're just validating
293 # configuration.
294 pass
296 # Determine whether checksums should be used - default to False
297 self.useChecksum = self.config.get("checksum", False)
299 # Determine whether we can fall back to configuration if a
300 # requested dataset is not known to registry
301 self.trustGetRequest = self.config.get("trust_get_request", False)
303 # Create a cache manager
304 self.cacheManager: AbstractDatastoreCacheManager
305 if "cached" in self.config: 305 ↛ 308line 305 didn't jump to line 308, because the condition on line 305 was never false
306 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
307 else:
308 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
310 # Check existence and create directory structure if necessary
311 if not self.root.exists():
312 if "create" not in self.config or not self.config["create"]: 312 ↛ 313line 312 didn't jump to line 313, because the condition on line 312 was never true
313 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
314 try:
315 self.root.mkdir()
316 except Exception as e:
317 raise ValueError(
318 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
319 ) from e
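# Illustrative sketch of the configuration keys read by __init__ above,
# expressed as a plain dict. The real defaults live in
# datastores/fileDatastore.yaml; all values here are assumptions, shown only
# to make the flow of __init__ easier to follow.
_example_config_keys = {
    "root": "<butlerRoot>/datastore",  # expanded via replaceRoot(); placeholder syntax assumed
    "name": None,  # optional explicit name; otherwise derived from the unexpanded root
    "create": True,  # allow the root directory to be created if missing
    "records": {"table": "file_datastore_records"},  # opaque table name (assumed)
    "formatters": {},  # storage class / dataset type -> formatter mappings
    "templates": {},  # file naming templates
    "composites": {},  # disassembly rules
    "checksum": False,  # record checksums on put/ingest
    "trust_get_request": False,  # fall back to predicted locations when registry has no record
    "cached": {},  # DatastoreCacheManager configuration
}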
321 def __str__(self) -> str:
322 return str(self.root)
324 @property
325 def bridge(self) -> DatastoreRegistryBridge:
326 return self._bridge
328 def _artifact_exists(self, location: Location) -> bool:
329 """Check that an artifact exists in this datastore at the specified
330 location.
332 Parameters
333 ----------
334 location : `Location`
335 Expected location of the artifact associated with this datastore.
337 Returns
338 -------
339 exists : `bool`
340 `True` if the location can be found, `False` otherwise.
341 """
342 log.debug("Checking if resource exists: %s", location.uri)
343 return location.uri.exists()
345 def _delete_artifact(self, location: Location) -> None:
346 """Delete the artifact from the datastore.
348 Parameters
349 ----------
350 location : `Location`
351 Location of the artifact associated with this datastore.
352 """
353 if location.pathInStore.isabs(): 353 ↛ 354: line 353 didn't jump to line 354, because the condition on line 353 was never true
354 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
356 try:
357 location.uri.remove()
358 except FileNotFoundError:
359 log.debug("File %s did not exist and so could not be deleted.", location.uri)
360 raise
361 except Exception as e:
362 log.critical("Failed to delete file: %s (%s)", location.uri, e)
363 raise
364 log.debug("Successfully deleted file: %s", location.uri)
366 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
367 # Docstring inherited from GenericBaseDatastore
368 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
369 self._table.insert(*records)
371 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
372 # Docstring inherited from GenericBaseDatastore
374 # Look for the dataset_id -- there might be multiple matches
375 # if we have disassembled the dataset.
376 records = self._table.fetch(dataset_id=ref.id)
377 return [StoredFileInfo.from_record(record) for record in records]
379 def _get_stored_records_associated_with_refs(
380 self, refs: Iterable[DatasetIdRef]
381 ) -> Dict[DatasetId, List[StoredFileInfo]]:
382 """Retrieve all records associated with the provided refs.
384 Parameters
385 ----------
386 refs : iterable of `DatasetIdRef`
387 The refs for which records are to be retrieved.
389 Returns
390 -------
391 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
392 The matching records indexed by the ref ID. The number of entries
393 in the dict can be smaller than the number of requested refs.
394 """
395 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
397 # Uniqueness is dataset_id + component so can have multiple records
398 # per ref.
399 records_by_ref = defaultdict(list)
400 for record in records:
401 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
402 return records_by_ref
404 def _refs_associated_with_artifacts(
405 self, paths: List[Union[str, ResourcePath]]
406 ) -> Dict[str, Set[DatasetId]]:
407 """Return paths and associated dataset refs.
409 Parameters
410 ----------
411 paths : `list` of `str` or `lsst.resources.ResourcePath`
412 All the paths to include in search.
414 Returns
415 -------
416 mapping : `dict` of [`str`, `set` [`DatasetId`]]
417 Mapping of each path to a set of associated database IDs.
418 """
419 records = self._table.fetch(path=[str(path) for path in paths])
420 result = defaultdict(set)
421 for row in records:
422 result[row["path"]].add(row["dataset_id"])
423 return result
425 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]:
426 """Return all dataset refs associated with the supplied path.
428 Parameters
429 ----------
430 pathInStore : `lsst.resources.ResourcePath`
431 Path of interest in the data store.
433 Returns
434 -------
435 ids : `set` of `DatasetId`
436 All `DatasetRef` IDs associated with this path.
437 """
438 records = list(self._table.fetch(path=str(pathInStore)))
439 ids = {r["dataset_id"] for r in records}
440 return ids
442 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
443 # Docstring inherited from GenericBaseDatastore
444 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
446 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
447 r"""Find all the `Location`\ s of the requested dataset in the
448 `Datastore` and the associated stored file information.
450 Parameters
451 ----------
452 ref : `DatasetRef`
453 Reference to the required `Dataset`.
455 Returns
456 -------
457 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
458 Location of the dataset within the datastore and
459 stored information about each file and its formatter.
460 """
461 # Get the file information (this will fail if no file)
462 records = self.getStoredItemsInfo(ref)
464 # Use the path to determine the location -- we need to take
465 # into account absolute URIs in the datastore record
466 return [(r.file_location(self.locationFactory), r) for r in records]
468 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
469 """Check that there is only one dataset associated with the
470 specified artifact.
472 Parameters
473 ----------
474 ref : `DatasetRef` or `FakeDatasetRef`
475 Dataset to be removed.
476 location : `Location`
477 The location of the artifact to be removed.
479 Returns
480 -------
481 can_remove : `bool`
482 `True` if the artifact can be safely removed.
483 """
484 # Can't ever delete absolute URIs.
485 if location.pathInStore.isabs():
486 return False
488 # Get all entries associated with this path
489 allRefs = self._registered_refs_per_artifact(location.pathInStore)
490 if not allRefs:
491 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
493 # Remove these refs from all the refs and if there is nothing left
494 # then we can delete
495 remainingRefs = allRefs - {ref.id}
497 if remainingRefs:
498 return False
499 return True
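# Minimal standalone sketch of the reference-counting decision made by
# _can_remove_dataset_artifact() above: an artifact may only be deleted when
# the ref being removed is its sole remaining owner. The helper name and
# argument types are hypothetical, not part of the class API.
def _artifact_deletable_sketch(all_ref_ids: set, removing_id) -> bool:
    if not all_ref_ids:
        # Mirrors the datastore-inconsistency error raised above.
        raise RuntimeError("Artifact is not known to the registry at all.")
    remaining = all_ref_ids - {removing_id}
    return not remaining

# e.g. _artifact_deletable_sketch({1, 2}, 1) -> False (another ref still uses the file)
#      _artifact_deletable_sketch({1}, 1)    -> True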
501 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]:
502 """Predict the location and related file information of the requested
503 dataset in this datastore.
505 Parameters
506 ----------
507 ref : `DatasetRef`
508 Reference to the required `Dataset`.
510 Returns
511 -------
512 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
513 Expected Location of the dataset within the datastore and
514 placeholder information about each file and its formatter.
516 Notes
517 -----
518 Uses the current configuration to determine how we would expect the
519 datastore files to have been written if we couldn't ask registry.
520 This is safe so long as there has been no change to datastore
521 configuration between writing the dataset and wanting to read it.
522 Will not work for files that have been ingested without using the
523 standard file template or default formatter.
524 """
526 # If we have a component ref we always need to ask the questions
527 # of the composite. If the composite is disassembled this routine
528 # should return all components. If the composite was not
529 # disassembled the composite is what is stored regardless of
530 # component request. Note that if the caller has disassembled
531 # a composite there is no way for this guess to know that
532 # without trying both the composite and component ref and seeing
533 # if there is something at the component Location even without
534 # disassembly being enabled.
535 if ref.datasetType.isComponent():
536 ref = ref.makeCompositeRef()
538 # See if the ref is a composite that should be disassembled
539 doDisassembly = self.composites.shouldBeDisassembled(ref)
541 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
543 if doDisassembly:
544 for component, componentStorage in ref.datasetType.storageClass.components.items():
545 compRef = ref.makeComponentRef(component)
546 location, formatter = self._determine_put_formatter_location(compRef)
547 all_info.append((location, formatter, componentStorage, component))
549 else:
550 # Always use the composite ref if no disassembly
551 location, formatter = self._determine_put_formatter_location(ref)
552 all_info.append((location, formatter, ref.datasetType.storageClass, None))
554 # Convert the list of tuples to have StoredFileInfo as second element
555 return [
556 (
557 location,
558 StoredFileInfo(
559 formatter=formatter,
560 path=location.pathInStore.path,
561 storageClass=storageClass,
562 component=component,
563 checksum=None,
564 file_size=-1,
565 dataset_id=ref.getCheckedId(),
566 ),
567 )
568 for location, formatter, storageClass, component in all_info
569 ]
571 def _prepare_for_get(
572 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
573 ) -> List[DatastoreFileGetInformation]:
574 """Check parameters for ``get`` and obtain formatter and
575 location.
577 Parameters
578 ----------
579 ref : `DatasetRef`
580 Reference to the required Dataset.
581 parameters : `dict`
582 `StorageClass`-specific parameters that specify, for example,
583 a slice of the dataset to be loaded.
585 Returns
586 -------
587 getInfo : `list` [`DatastoreFileGetInformation`]
588 Parameters needed to retrieve each file.
589 """
590 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
592 # Get file metadata and internal metadata
593 fileLocations = self._get_dataset_locations_info(ref)
594 if not fileLocations:
595 if not self.trustGetRequest:
596 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
597 # Assume the dataset is where we think it should be
598 fileLocations = self._get_expected_dataset_locations_info(ref)
600 # The storage class we want to use eventually
601 refStorageClass = ref.datasetType.storageClass
603 if len(fileLocations) > 1:
604 disassembled = True
606 # If trust is involved it is possible that there will be
607 # components listed here that do not exist in the datastore.
608 # Explicitly check for file artifact existence and filter out any
609 # that are missing.
610 if self.trustGetRequest:
611 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
613 # For now complain only if we have no components at all. Having
614 # just one component present is probably a problem but we can punt
615 # that to the assembler.
616 if not fileLocations: 616 ↛ 617: line 616 didn't jump to line 617, because the condition on line 616 was never true
617 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
619 else:
620 disassembled = False
622 # Is this a component request?
623 refComponent = ref.datasetType.component()
625 fileGetInfo = []
626 for location, storedFileInfo in fileLocations:
628 # The storage class used to write the file
629 writeStorageClass = storedFileInfo.storageClass
631 # If this has been disassembled we need read to match the write
632 if disassembled:
633 readStorageClass = writeStorageClass
634 else:
635 readStorageClass = refStorageClass
637 formatter = get_instance_of(
638 storedFileInfo.formatter,
639 FileDescriptor(
640 location,
641 readStorageClass=readStorageClass,
642 storageClass=writeStorageClass,
643 parameters=parameters,
644 ),
645 ref.dataId,
646 )
648 formatterParams, notFormatterParams = formatter.segregateParameters()
650 # Of the remaining parameters, extract the ones supported by
651 # this StorageClass (for components not all will be handled)
652 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
654 # The ref itself could be a component if the dataset was
655 # disassembled by butler, or we disassembled in datastore and
656 # components came from the datastore records
657 component = storedFileInfo.component if storedFileInfo.component else refComponent
659 fileGetInfo.append(
660 DatastoreFileGetInformation(
661 location,
662 formatter,
663 storedFileInfo,
664 assemblerParams,
665 formatterParams,
666 component,
667 readStorageClass,
668 )
669 )
671 return fileGetInfo
673 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
674 """Check the arguments for ``put`` and obtain formatter and
675 location.
677 Parameters
678 ----------
679 inMemoryDataset : `object`
680 The dataset to store.
681 ref : `DatasetRef`
682 Reference to the associated Dataset.
684 Returns
685 -------
686 location : `Location`
687 The location to write the dataset.
688 formatter : `Formatter`
689 The `Formatter` to use to write the dataset.
691 Raises
692 ------
693 TypeError
694 Supplied object and storage class are inconsistent.
695 DatasetTypeNotSupportedError
696 The associated `DatasetType` is not handled by this datastore.
697 """
698 self._validate_put_parameters(inMemoryDataset, ref)
699 return self._determine_put_formatter_location(ref)
701 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
702 """Calculate the formatter and output location to use for put.
704 Parameters
705 ----------
706 ref : `DatasetRef`
707 Reference to the associated Dataset.
709 Returns
710 -------
711 location : `Location`
712 The location to write the dataset.
713 formatter : `Formatter`
714 The `Formatter` to use to write the dataset.
715 """
716 # Work out output file name
717 try:
718 template = self.templates.getTemplate(ref)
719 except KeyError as e:
720 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
722 # Validate the template to protect against filenames from different
723 # dataIds returning the same and causing overwrite confusion.
724 template.validateTemplate(ref)
726 location = self.locationFactory.fromPath(template.format(ref))
728 # Get the formatter based on the storage class
729 storageClass = ref.datasetType.storageClass
730 try:
731 formatter = self.formatterFactory.getFormatter(
732 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
733 )
734 except KeyError as e:
735 raise DatasetTypeNotSupportedError(
736 f"Unable to find formatter for {ref} in datastore {self.name}"
737 ) from e
739 # Now that we know the formatter, update the location
740 location = formatter.makeUpdatedLocation(location)
742 return location, formatter
744 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
745 # Docstring inherited from base class
746 if transfer != "auto":
747 return transfer
749 # See if the paths are within the datastore or not
750 inside = [self._pathInStore(d.path) is not None for d in datasets]
752 if all(inside):
753 transfer = None
754 elif not any(inside): 754 ↛ 763: line 754 didn't jump to line 763, because the condition on line 754 was never false
755 # Allow ResourcePath to use its own knowledge
756 transfer = "auto"
757 else:
758 # This can happen when importing from a datastore that
759 # has had some datasets ingested using "direct" mode.
760 # Allow ResourcePath to sort it out but warn about it.
763 log.warning(
764 "Some datasets are inside the datastore and some are outside. Using 'split' "
765 "transfer mode. This assumes that the files outside the datastore are "
766 "still accessible to the new butler since they will not be copied into "
767 "the target datastore."
768 )
769 transfer = "split"
771 return transfer
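# Standalone sketch of how _overrideTransferMode() above resolves the "auto"
# transfer mode from whether each dataset path lies inside the datastore root.
# This mirrors the branches of the method; the helper name is hypothetical.
def _resolve_auto_transfer_sketch(inside: List[bool]) -> Optional[str]:
    if all(inside):
        return None  # everything already in the datastore: ingest in place
    if not any(inside):
        return "auto"  # all external: let ResourcePath choose the mechanism
    return "split"  # mixed: external files are referenced in place, not copied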
773 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]:
774 """Return path relative to datastore root
776 Parameters
777 ----------
778 path : `lsst.resources.ResourcePathExpression`
779 Path to the dataset. Can be an absolute URI. If relative, it is
780 assumed to be relative to the datastore root.
783 Returns
784 -------
785 inStore : `str`
786 Path relative to datastore root. Returns `None` if the file is
787 outside the root.
788 """
789 # Relative path will always be relative to datastore
790 pathUri = ResourcePath(path, forceAbsolute=False)
791 return pathUri.relative_to(self.root)
793 def _standardizeIngestPath(
794 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None
795 ) -> Union[str, ResourcePath]:
796 """Standardize the path of a to-be-ingested file.
798 Parameters
799 ----------
800 path : `str` or `lsst.resources.ResourcePath`
801 Path of a file to be ingested. This parameter is not expected
802 to be all the types that can be used to construct a
803 `~lsst.resources.ResourcePath`.
804 transfer : `str`, optional
805 How (and whether) the dataset should be added to the datastore.
806 See `ingest` for details of transfer modes.
807 This implementation is provided only so
808 `NotImplementedError` can be raised if the mode is not supported;
809 actual transfers are deferred to `_extractIngestInfo`.
811 Returns
812 -------
813 path : `str` or `lsst.resources.ResourcePath`
814 New path in what the datastore considers standard form. If an
815 absolute URI was given that will be returned unchanged.
817 Notes
818 -----
819 Subclasses of `FileDatastore` can implement this method instead
820 of `_prepIngest`. It should not modify the data repository or given
821 file in any way.
823 Raises
824 ------
825 NotImplementedError
826 Raised if the datastore does not support the given transfer mode
827 (including the case where ingest is not supported at all).
828 FileNotFoundError
829 Raised if one of the given files does not exist.
830 """
831 if transfer not in (None, "direct", "split") + self.root.transferModes: 831 ↛ 832: line 831 didn't jump to line 832, because the condition on line 831 was never true
832 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
834 # A relative URI indicates relative to datastore root
835 srcUri = ResourcePath(path, forceAbsolute=False)
836 if not srcUri.isabs():
837 srcUri = self.root.join(path)
839 if not srcUri.exists():
840 raise FileNotFoundError(
841 f"Resource at {srcUri} does not exist; note that paths to ingest "
842 f"are assumed to be relative to {self.root} unless they are absolute."
843 )
845 if transfer is None:
846 relpath = srcUri.relative_to(self.root)
847 if not relpath:
848 raise RuntimeError(
849 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
850 )
852 # Return the relative path within the datastore for internal
853 # transfer
854 path = relpath
856 return path
858 def _extractIngestInfo(
859 self,
860 path: ResourcePathExpression,
861 ref: DatasetRef,
862 *,
863 formatter: Union[Formatter, Type[Formatter]],
864 transfer: Optional[str] = None,
865 record_validation_info: bool = True,
866 ) -> StoredFileInfo:
867 """Relocate (if necessary) and extract `StoredFileInfo` from a
868 to-be-ingested file.
870 Parameters
871 ----------
872 path : `lsst.resources.ResourcePathExpression`
873 URI or path of a file to be ingested.
874 ref : `DatasetRef`
875 Reference for the dataset being ingested. Guaranteed to have
876 a ``dataset_id`` that is not `None`.
877 formatter : `type` or `Formatter`
878 `Formatter` subclass to use for this dataset or an instance.
879 transfer : `str`, optional
880 How (and whether) the dataset should be added to the datastore.
881 See `ingest` for details of transfer modes.
882 record_validation_info : `bool`, optional
883 If `True`, the default, the datastore can record validation
884 information associated with the file. If `False` the datastore
885 will not attempt to track any information such as checksums
886 or file sizes. This can be useful if such information is tracked
887 in an external system or if the file is to be compressed in place.
888 It is up to the datastore whether this parameter is relevant.
890 Returns
891 -------
892 info : `StoredFileInfo`
893 Internal datastore record for this file. This will be inserted by
894 the caller; the `_extractIngestInfo` is only responsible for
895 creating and populating the struct.
897 Raises
898 ------
899 FileNotFoundError
900 Raised if one of the given files does not exist.
901 FileExistsError
902 Raised if transfer is not `None` but the (internal) location the
903 file would be moved to is already occupied.
904 """
905 if self._transaction is None: 905 ↛ 906: line 905 didn't jump to line 906, because the condition on line 905 was never true
906 raise RuntimeError("Ingest called without transaction enabled")
908 # Create URI of the source path, do not need to force a relative
909 # path to absolute.
910 srcUri = ResourcePath(path, forceAbsolute=False)
912 # Track whether we have read the size of the source yet
913 have_sized = False
915 tgtLocation: Optional[Location]
916 if transfer is None or transfer == "split":
917 # A relative path is assumed to be relative to the datastore
918 # in this context
919 if not srcUri.isabs():
920 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
921 else:
922 # Work out the path in the datastore from an absolute URI
923 # This is required to be within the datastore.
924 pathInStore = srcUri.relative_to(self.root)
925 if pathInStore is None and transfer is None: 925 ↛ 926: line 925 didn't jump to line 926, because the condition on line 925 was never true
926 raise RuntimeError(
927 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
928 )
929 if pathInStore: 929 ↛ 931: line 929 didn't jump to line 931, because the condition on line 929 was never false
930 tgtLocation = self.locationFactory.fromPath(pathInStore)
931 elif transfer == "split":
932 # Outside the datastore but treat that as a direct ingest
933 # instead.
934 tgtLocation = None
935 else:
936 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
937 elif transfer == "direct": 937 ↛ 942: line 937 didn't jump to line 942, because the condition on line 937 was never true
938 # Want to store the full URI to the resource directly in
939 # datastore. This is useful for referring to permanent archive
940 # storage for raw data.
941 # Trust that people know what they are doing.
942 tgtLocation = None
943 else:
944 # Work out the name we want this ingested file to have
945 # inside the datastore
946 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
947 if not tgtLocation.uri.dirname().exists():
948 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
949 tgtLocation.uri.dirname().mkdir()
951 # if we are transferring from a local file to a remote location
952 # it may be more efficient to get the size and checksum of the
953 # local file rather than the transferred one
954 if record_validation_info and srcUri.isLocal:
955 size = srcUri.size()
956 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
957 have_sized = True
959 # Transfer the resource to the destination.
960 # Allow overwrite of an existing file. This matches the behavior
961 # of datastore.put() in that it trusts that registry would not
962 # be asking to overwrite unless registry thought that the
963 # overwrite was allowed.
964 tgtLocation.uri.transfer_from(
965 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
966 )
968 if tgtLocation is None: 968 ↛ 970: line 968 didn't jump to line 970, because the condition on line 968 was never true
969 # This means we are using direct mode
970 targetUri = srcUri
971 targetPath = str(srcUri)
972 else:
973 targetUri = tgtLocation.uri
974 targetPath = tgtLocation.pathInStore.path
976 # the file should exist in the datastore now
977 if record_validation_info:
978 if not have_sized:
979 size = targetUri.size()
980 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
981 else:
982 # Not recording any file information.
983 size = -1
984 checksum = None
986 return StoredFileInfo(
987 formatter=formatter,
988 path=targetPath,
989 storageClass=ref.datasetType.storageClass,
990 component=ref.datasetType.component(),
991 file_size=size,
992 checksum=checksum,
993 dataset_id=ref.getCheckedId(),
994 )
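# Summary of how _extractIngestInfo() above chooses the stored path for each
# transfer mode (a restatement of the branches in the method body, added for
# readability; not an API):
#   transfer=None     -> the file must already be inside the root; the path
#                        relative to the datastore root is recorded.
#   transfer="split"  -> inside the root behaves like None; outside the root
#                        the absolute URI is recorded, as for "direct".
#   transfer="direct" -> the absolute source URI is recorded; nothing is copied.
#   other modes       -> a new name is built from the file template and the
#                        resource is transferred into the datastore.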
996 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
997 # Docstring inherited from Datastore._prepIngest.
998 filtered = []
999 for dataset in datasets:
1000 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
1001 if not acceptable:
1002 continue
1003 else:
1004 dataset.refs = acceptable
1005 if dataset.formatter is None:
1006 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1007 else:
1008 assert isinstance(dataset.formatter, (type, str))
1009 formatter_class = get_class_of(dataset.formatter)
1010 if not issubclass(formatter_class, Formatter): 1010 ↛ 1011: line 1010 didn't jump to line 1011, because the condition on line 1010 was never true
1011 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1012 dataset.formatter = formatter_class
1013 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1014 filtered.append(dataset)
1015 return _IngestPrepData(filtered)
1017 @transactional
1018 def _finishIngest(
1019 self,
1020 prepData: Datastore.IngestPrepData,
1021 *,
1022 transfer: Optional[str] = None,
1023 record_validation_info: bool = True,
1024 ) -> None:
1025 # Docstring inherited from Datastore._finishIngest.
1026 refsAndInfos = []
1027 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1028 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1029 # Do ingest as if the first dataset ref is associated with the file
1030 info = self._extractIngestInfo(
1031 dataset.path,
1032 dataset.refs[0],
1033 formatter=dataset.formatter,
1034 transfer=transfer,
1035 record_validation_info=record_validation_info,
1036 )
1037 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1038 self._register_datasets(refsAndInfos)
1040 def _calculate_ingested_datastore_name(
1041 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]]
1042 ) -> Location:
1043 """Given a source URI and a DatasetRef, determine the name the
1044 dataset will have inside datastore.
1046 Parameters
1047 ----------
1048 srcUri : `lsst.resources.ResourcePath`
1049 URI to the source dataset file.
1050 ref : `DatasetRef`
1051 Ref associated with the newly-ingested dataset artifact. This
1052 is used to determine the name within the datastore.
1053 formatter : `Formatter` or Formatter class.
1054 Formatter to use for validation. Can be a class or an instance.
1056 Returns
1057 -------
1058 location : `Location`
1059 Target location for the newly-ingested dataset.
1060 """
1061 # Ingesting a file from outside the datastore.
1062 # This involves a new name.
1063 template = self.templates.getTemplate(ref)
1064 location = self.locationFactory.fromPath(template.format(ref))
1066 # Get the extension
1067 ext = srcUri.getExtension()
1069 # Update the destination to include that extension
1070 location.updateExtension(ext)
1072 # Ask the formatter to validate this extension
1073 formatter.validateExtension(location)
1075 return location
1077 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1078 """Write out in memory dataset to datastore.
1080 Parameters
1081 ----------
1082 inMemoryDataset : `object`
1083 Dataset to write to datastore.
1084 ref : `DatasetRef`
1085 Registry information associated with this dataset.
1087 Returns
1088 -------
1089 info : `StoredFileInfo`
1090 Information describing the artifact written to the datastore.
1091 """
1092 # May need to coerce the in memory dataset to the correct
1093 # python type.
1094 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1096 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1097 uri = location.uri
1099 if not uri.dirname().exists():
1100 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1101 uri.dirname().mkdir()
1103 if self._transaction is None: 1103 ↛ 1104: line 1103 didn't jump to line 1104, because the condition on line 1103 was never true
1104 raise RuntimeError("Attempting to write artifact without transaction enabled")
1106 def _removeFileExists(uri: ResourcePath) -> None:
1107 """Remove a file and do not complain if it is not there.
1109 This is important since a formatter might fail before the file
1110 is written and we should not confuse people by writing spurious
1111 error messages to the log.
1112 """
1113 try:
1114 uri.remove()
1115 except FileNotFoundError:
1116 pass
1118 # Register a callback to try to delete the uploaded data if
1119 # something fails below
1120 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1122 # For a local file, simply use the formatter directly
1123 if uri.isLocal:
1124 try:
1125 formatter.write(inMemoryDataset)
1126 except Exception as e:
1127 raise RuntimeError(
1128 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}"
1129 ) from e
1130 log.debug("Successfully wrote python object to local file at %s", uri)
1131 else:
1132 # This is a remote URI. Some datasets can be serialized directly
1133 # to bytes and sent to the remote datastore without writing a
1134 # file. If the dataset is intended to be saved to the cache
1135 # a file is always written and direct write to the remote
1136 # datastore is bypassed.
1137 data_written = False
1138 if not self.cacheManager.should_be_cached(ref):
1139 try:
1140 serializedDataset = formatter.toBytes(inMemoryDataset)
1141 except NotImplementedError:
1142 # Fallback to the file writing option.
1143 pass
1144 except Exception as e:
1145 raise RuntimeError(
1146 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1147 ) from e
1148 else:
1149 log.debug("Writing bytes directly to %s", uri)
1150 uri.write(serializedDataset, overwrite=True)
1151 log.debug("Successfully wrote bytes directly to %s", uri)
1152 data_written = True
1154 if not data_written:
1155 # Did not write the bytes directly to object store so instead
1156 # write to temporary file.
1157 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri:
1158 # Need to configure the formatter to write to a different
1159 # location and that needs us to overwrite internals
1160 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1161 with formatter._updateLocation(Location(None, temporary_uri)):
1162 try:
1163 formatter.write(inMemoryDataset)
1164 except Exception as e:
1165 raise RuntimeError(
1166 f"Failed to serialize dataset {ref} of type"
1167 f" {type(inMemoryDataset)} to "
1168 f"temporary location {temporary_uri}"
1169 ) from e
1170 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True)
1172 # Cache if required
1173 self.cacheManager.move_to_cache(temporary_uri, ref)
1175 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1177 # URI is needed to resolve which ingest case we are dealing with.
1178 return self._extractIngestInfo(uri, ref, formatter=formatter)
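# Standalone sketch of the remote-write strategy used by
# _write_in_memory_to_artifact() above: a direct bytes upload is attempted
# only when the dataset will not be cached and the formatter supports
# serialization to bytes; otherwise a temporary local file is written and
# transferred (and possibly cached). The helper name is hypothetical.
def _remote_write_strategy_sketch(formatter_supports_bytes: bool, should_cache: bool) -> str:
    if not should_cache and formatter_supports_bytes:
        return "write bytes directly to the remote URI"
    return "write a temporary local file, then transfer (and maybe cache) it"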
1180 def _read_artifact_into_memory(
1181 self,
1182 getInfo: DatastoreFileGetInformation,
1183 ref: DatasetRef,
1184 isComponent: bool = False,
1185 cache_ref: Optional[DatasetRef] = None,
1186 ) -> Any:
1187 """Read the artifact from datastore into in memory object.
1189 Parameters
1190 ----------
1191 getInfo : `DatastoreFileGetInformation`
1192 Information about the artifact within the datastore.
1193 ref : `DatasetRef`
1194 The registry information associated with this artifact.
1195 isComponent : `bool`
1196 Flag to indicate if a component is being read from this artifact.
1197 cache_ref : `DatasetRef`, optional
1198 The DatasetRef to use when looking up the file in the cache.
1199 This ref must have the same ID as the supplied ref but can
1200 be a parent ref or component ref to indicate to the cache whether
1201 a composite file is being requested from the cache or a component
1202 file. Without this the cache will default to the supplied ref but
1203 it can get confused with read-only derived components for
1204 disassembled composites.
1206 Returns
1207 -------
1208 inMemoryDataset : `object`
1209 The artifact as a python object.
1210 """
1211 location = getInfo.location
1212 uri = location.uri
1213 log.debug("Accessing data from %s", uri)
1215 if cache_ref is None:
1216 cache_ref = ref
1217 if cache_ref.id != ref.id: 1217 ↛ 1218: line 1217 didn't jump to line 1218, because the condition on line 1217 was never true
1218 raise ValueError(
1219 "The supplied cache dataset ref refers to a different dataset than expected:"
1220 f" {ref.id} != {cache_ref.id}"
1221 )
1223 # Cannot recalculate checksum but can compare size as a quick check
1224 # Do not do this if the size is negative since that indicates
1225 # we do not know.
1226 recorded_size = getInfo.info.file_size
1227 resource_size = uri.size()
1228 if recorded_size >= 0 and resource_size != recorded_size: 1228 ↛ 1229: line 1228 didn't jump to line 1229, because the condition on line 1228 was never true
1229 raise RuntimeError(
1230 "Integrity failure in Datastore. "
1231 f"Size of file {uri} ({resource_size}) "
1232 f"does not match size recorded in registry of {recorded_size}"
1233 )
1235 # For the general case we have choices for how to proceed.
1236 # 1. Always use a local file (downloading the remote resource to a
1237 # temporary file if needed).
1238 # 2. Use a threshold size and read into memory and use bytes.
1239 # Use both for now with an arbitrary hand off size.
1240 # This allows small datasets to be downloaded from remote object
1241 # stores without requiring a temporary file.
1243 formatter = getInfo.formatter
1244 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1245 if resource_size <= nbytes_max and formatter.can_read_bytes():
1246 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1247 if cached_file is not None:
1248 desired_uri = cached_file
1249 msg = f" (cached version of {uri})"
1250 else:
1251 desired_uri = uri
1252 msg = ""
1253 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1254 serializedDataset = desired_uri.read()
1255 log.debug(
1256 "Deserializing %s from %d bytes from location %s with formatter %s",
1257 f"component {getInfo.component}" if isComponent else "",
1258 len(serializedDataset),
1259 uri,
1260 formatter.name(),
1261 )
1262 try:
1263 result = formatter.fromBytes(
1264 serializedDataset, component=getInfo.component if isComponent else None
1265 )
1266 except Exception as e:
1267 raise ValueError(
1268 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1269 f" ({ref.datasetType.name} from {uri}): {e}"
1270 ) from e
1271 else:
1272 # Read from file.
1274 # Have to update the Location associated with the formatter
1275 # because formatter.read does not allow an override.
1276 # This could be improved.
1277 location_updated = False
1278 msg = ""
1280 # First check in cache for local version.
1281 # The cache will only be relevant for remote resources but
1282 # no harm in always asking. Context manager ensures that cache
1283 # file is not deleted during cache expiration.
1284 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1285 if cached_file is not None:
1286 msg = f"(via cache read of remote file {uri})"
1287 uri = cached_file
1288 location_updated = True
1290 with uri.as_local() as local_uri:
1292 can_be_cached = False
1293 if uri != local_uri: 1293 ↛ 1295: line 1293 didn't jump to line 1295, because the condition on line 1293 was never true
1294 # URI was remote and file was downloaded
1295 cache_msg = ""
1296 location_updated = True
1298 if self.cacheManager.should_be_cached(cache_ref):
1299 # In this scenario we want to ask if the downloaded
1300 # file should be cached but we should not cache
1301 # it until after we've used it (to ensure it can't
1302 # be expired whilst we are using it).
1303 can_be_cached = True
1305 # Say that it is "likely" to be cached because
1306 # if the formatter read fails we will not be
1307 # caching this file.
1308 cache_msg = " and likely cached"
1310 msg = f"(via download to local file{cache_msg})"
1312 # Calculate the (possibly) new location for the formatter
1313 # to use.
1314 newLocation = Location(*local_uri.split()) if location_updated else None
1316 log.debug(
1317 "Reading%s from location %s %s with formatter %s",
1318 f" component {getInfo.component}" if isComponent else "",
1319 uri,
1320 msg,
1321 formatter.name(),
1322 )
1323 try:
1324 with formatter._updateLocation(newLocation):
1325 with time_this(
1326 log,
1327 msg="Reading%s from location %s %s with formatter %s",
1328 args=(
1329 f" component {getInfo.component}" if isComponent else "",
1330 uri,
1331 msg,
1332 formatter.name(),
1333 ),
1334 ):
1335 result = formatter.read(component=getInfo.component if isComponent else None)
1336 except Exception as e:
1337 raise ValueError(
1338 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1339 f" ({ref.datasetType.name} from {uri}): {e}"
1340 ) from e
1342 # File was read successfully so can move to cache
1343 if can_be_cached: 1343 ↛ 1344: line 1343 didn't jump to line 1344, because the condition on line 1343 was never true
1344 self.cacheManager.move_to_cache(local_uri, cache_ref)
1346 return self._post_process_get(
1347 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent
1348 )
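# Standalone sketch of the read-path choice in _read_artifact_into_memory()
# above: small artifacts that the formatter can deserialize from bytes are
# read directly (no temporary file); everything else is read from a local
# file, preferring a cached copy when one exists. The 10 MB hand-off size is
# the nbytes_max value used in the method; the helper name is hypothetical.
def _read_strategy_sketch(resource_size: int, can_read_bytes: bool,
                          nbytes_max: int = 10_000_000) -> str:
    if resource_size <= nbytes_max and can_read_bytes:
        return "read bytes (from cache if present, else the remote URI)"
    return "read via a local file (cache hit, local path, or download)"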
1350 def knows(self, ref: DatasetRef) -> bool:
1351 """Check if the dataset is known to the datastore.
1353 Does not check for existence of any artifact.
1355 Parameters
1356 ----------
1357 ref : `DatasetRef`
1358 Reference to the required dataset.
1360 Returns
1361 -------
1362 exists : `bool`
1363 `True` if the dataset is known to the datastore.
1364 """
1365 fileLocations = self._get_dataset_locations_info(ref)
1366 if fileLocations:
1367 return True
1368 return False
1370 def _process_mexists_records(
1371 self,
1372 id_to_ref: Dict[DatasetId, DatasetRef],
1373 records: Dict[DatasetId, List[StoredFileInfo]],
1374 all_required: bool,
1375 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
1376 ) -> Dict[DatasetRef, bool]:
1377 """Helper function for mexists that checks the given records.
1379 Parameters
1380 ----------
1381 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1382 Mapping of the dataset ID to the dataset ref itself.
1383 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1384 Records as generally returned by
1385 ``_get_stored_records_associated_with_refs``.
1386 all_required : `bool`
1387 If `True`, a dataset only counts as existing when every artifact
1388 associated with its dataset ID exists; if `False`, one is enough.
1389 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1390 Optional mapping of datastore artifact to existence. Updated by
1391 this method with details of all artifacts tested. Can be `None`
1392 if the caller is not interested.
1394 Returns
1395 -------
1396 existence : `dict` of [`DatasetRef`, `bool`]
1397 Mapping from dataset to boolean indicating existence.
1398 """
1399 # The URIs to be checked and a mapping of those URIs to
1400 # the dataset ID.
1401 uris_to_check: List[ResourcePath] = []
1402 location_map: Dict[ResourcePath, DatasetId] = {}
1404 location_factory = self.locationFactory
1406 uri_existence: Dict[ResourcePath, bool] = {}
1407 for ref_id, infos in records.items():
1408 # Key is the dataset Id, value is list of StoredItemInfo
1409 uris = [info.file_location(location_factory).uri for info in infos]
1410 location_map.update({uri: ref_id for uri in uris})
1412 # Check the local cache directly for a dataset corresponding
1413 # to the remote URI.
1414 if self.cacheManager.file_count > 0:
1415 ref = id_to_ref[ref_id]
1416 for uri, storedFileInfo in zip(uris, infos):
1417 check_ref = ref
1418 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 1418 ↛ 1419: line 1418 didn't jump to line 1419, because the condition on line 1418 was never true
1419 check_ref = ref.makeComponentRef(component)
1420 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1421 # Proxy for URI existence.
1422 uri_existence[uri] = True
1423 else:
1424 uris_to_check.append(uri)
1425 else:
1426 # Check all of them.
1427 uris_to_check.extend(uris)
1429 if artifact_existence is not None:
1430 # If a URI has already been checked remove it from the list
1431 # and immediately add the status to the output dict.
1432 filtered_uris_to_check = []
1433 for uri in uris_to_check:
1434 if uri in artifact_existence:
1435 uri_existence[uri] = artifact_existence[uri]
1436 else:
1437 filtered_uris_to_check.append(uri)
1438 uris_to_check = filtered_uris_to_check
1440 # Results.
1441 dataset_existence: Dict[DatasetRef, bool] = {}
1443 uri_existence.update(ResourcePath.mexists(uris_to_check))
1444 for uri, exists in uri_existence.items():
1445 dataset_id = location_map[uri]
1446 ref = id_to_ref[dataset_id]
1448 # Disassembled composite needs to check all locations.
1449 # all_required indicates whether all need to exist or not.
1450 if ref in dataset_existence:
1451 if all_required:
1452 exists = dataset_existence[ref] and exists
1453 else:
1454 exists = dataset_existence[ref] or exists
1455 dataset_existence[ref] = exists
1457 if artifact_existence is not None:
1458 artifact_existence.update(uri_existence)
1460 return dataset_existence
1462 def mexists(
1463 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1464 ) -> Dict[DatasetRef, bool]:
1465 """Check the existence of multiple datasets at once.
1467 Parameters
1468 ----------
1469 refs : iterable of `DatasetRef`
1470 The datasets to be checked.
1471 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1472 Optional mapping of datastore artifact to existence. Updated by
1473 this method with details of all artifacts tested. Can be `None`
1474 if the caller is not interested.
1476 Returns
1477 -------
1478 existence : `dict` of [`DatasetRef`, `bool`]
1479 Mapping from dataset to boolean indicating existence.
1481 Notes
1482 -----
1483 To minimize potentially costly remote existence checks, the local
1484 cache is checked as a proxy for existence. If a cached file exists
1485 for this `DatasetRef`, no check is made against the actual URI. This
1486 could result in unexpected behavior if the dataset itself
1487 has been removed from the datastore by another process whilst it is
1488 still in the cache.
1489 """
1490 chunk_size = 10_000
1491 dataset_existence: Dict[DatasetRef, bool] = {}
1492 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1493 n_found_total = 0
1494 n_checked = 0
1495 n_chunks = 0
1496 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1497 chunk_result = self._mexists(chunk, artifact_existence)
1498 if log.isEnabledFor(VERBOSE):
1499 n_results = len(chunk_result)
1500 n_checked += n_results
1501 # Can treat the booleans as 0, 1 integers and sum them.
1502 n_found = sum(chunk_result.values())
1503 n_found_total += n_found
1504 log.verbose(
1505 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)",
1506 n_chunks,
1507 n_found,
1508 n_results,
1509 n_found_total,
1510 n_checked,
1511 )
1512 dataset_existence.update(chunk_result)
1513 n_chunks += 1
1515 return dataset_existence
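# Minimal runnable sketch of the chunking pattern used by mexists() above.
# chunk_iterable is already imported at the top of this module from
# lsst.utils.iteration; the helper below is illustrative only.
def _count_in_chunks_sketch(items: Iterable, chunk_size: int = 10_000) -> Tuple[int, int]:
    """Count items chunk by chunk, mirroring the bookkeeping in mexists()."""
    n_chunks = n_items = 0
    for chunk in chunk_iterable(items, chunk_size=chunk_size):
        n_items += len(list(chunk))
        n_chunks += 1
    return n_chunks, n_items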
1517 def _mexists(
1518 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1519 ) -> Dict[DatasetRef, bool]:
1520 """Check the existence of multiple datasets at once.
1522 Parameters
1523 ----------
1524 refs : iterable of `DatasetRef`
1525 The datasets to be checked.
1527 Returns
1528 -------
1529 existence : `dict` of [`DatasetRef`, `bool`]
1530 Mapping from dataset to boolean indicating existence.
1531 """
1532 # Need a mapping of dataset_id to dataset ref since the API
1533 # works with dataset_id
1534 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1536 # Set of all IDs we are checking for.
1537 requested_ids = set(id_to_ref.keys())
1539 # The records themselves. Could be missing some entries.
1540 records = self._get_stored_records_associated_with_refs(refs)
1542 dataset_existence = self._process_mexists_records(
1543 id_to_ref, records, True, artifact_existence=artifact_existence
1544 )
1546 # Set of IDs that have been handled.
1547 handled_ids = {ref.id for ref in dataset_existence.keys()}
1549 missing_ids = requested_ids - handled_ids
1550 if missing_ids:
1551 if not self.trustGetRequest:
1552 # Must assume these do not exist
1553 for missing in missing_ids:
1554 dataset_existence[id_to_ref[missing]] = False
1555 else:
1556 log.debug(
1557 "%d out of %d datasets were not known to datastore during initial existence check.",
1558 len(missing_ids),
1559 len(requested_ids),
1560 )
1562 # Construct data structure identical to that returned
1563 # by _get_stored_records_associated_with_refs() but using
1564 # guessed names.
1565 records = {}
1566 for missing in missing_ids:
1567 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1568 records[missing] = [info for _, info in expected]
1570 dataset_existence.update(
1571 self._process_mexists_records(
1572 id_to_ref, records, False, artifact_existence=artifact_existence
1573 )
1574 )
1576 return dataset_existence
1578 def exists(self, ref: DatasetRef) -> bool:
1579 """Check if the dataset exists in the datastore.
1581 Parameters
1582 ----------
1583 ref : `DatasetRef`
1584 Reference to the required dataset.
1586 Returns
1587 -------
1588 exists : `bool`
1589 `True` if the entity exists in the `Datastore`.
1591 Notes
1592 -----
1593 The local cache is checked as a proxy for existence in the remote
1594 object store. It is possible that another process on a different
1595 compute node could remove the file from the object store even
1596 though it is present in the local cache.
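Examples
--------
A minimal sketch, assuming ``datastore`` is a configured `FileDatastore`
and ``ref`` is a resolved `DatasetRef`::

    # ref is assumed to be a resolved DatasetRef known to the registry.
    if not datastore.exists(ref):
        raise FileNotFoundError(f"Dataset {ref} is not in this datastore")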
1597 """
1598 fileLocations = self._get_dataset_locations_info(ref)
1600 # if we are being asked to trust that registry might not be correct
1601 # we ask for the expected locations and check them explicitly
1602 if not fileLocations:
1603 if not self.trustGetRequest:
1604 return False
1606 # First check the cache. If it is not found we must check
1607 # the datastore itself. Assume that any component in the cache
1608 # means that the dataset does exist somewhere.
1609 if self.cacheManager.known_to_cache(ref): 1609 ↛ 1610line 1609 didn't jump to line 1610, because the condition on line 1609 was never true
1610 return True
1612 # When we are guessing a dataset location we can not check
1613 # for the existence of every component since we can not
1614 # know if every component was written. Instead we check
1615 # for the existence of any of the expected locations.
1616 for location, _ in self._get_expected_dataset_locations_info(ref): 1616 ↛ 1619line 1616 didn't jump to line 1619, because the loop on line 1616 didn't complete
1617 if self._artifact_exists(location): 1617 ↛ 1616line 1617 didn't jump to line 1616, because the condition on line 1617 was never false
1618 return True
1619 return False
1621 # All listed artifacts must exist.
1622 for location, storedFileInfo in fileLocations:
1623 # Checking in cache needs the component ref.
1624 check_ref = ref
1625 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1626 check_ref = ref.makeComponentRef(component)
1627 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1628 continue
1630 if not self._artifact_exists(location):
1631 return False
1633 return True
1635 def getURIs(
1636 self, ref: DatasetRef, predict: bool = False
1637 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
1638 """Return URIs associated with dataset.
1640 Parameters
1641 ----------
1642 ref : `DatasetRef`
1643 Reference to the required dataset.
1644 predict : `bool`, optional
1645 If the datastore does not know about the dataset, should it
1646 return a predicted URI or not?
1648 Returns
1649 -------
1650 primary : `lsst.resources.ResourcePath`
1651 The URI to the primary artifact associated with this dataset.
1652 If the dataset was disassembled within the datastore this
1653 may be `None`.
1654 components : `dict`
1655 URIs to any components associated with the dataset artifact.
1656 Can be empty if there are no components.
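Examples
--------
A minimal sketch, assuming ``datastore`` and a resolved ``ref``::

    # predict=True allows a "#predicted" URI for an unwritten dataset.
    primary, components = datastore.getURIs(ref, predict=True)
    if primary is not None:
        print("primary:", primary.geturl())
    for component, uri in components.items():
        print(component, uri.geturl())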
1657 """
1659 primary: Optional[ResourcePath] = None
1660 components: Dict[str, ResourcePath] = {}
1662 # if this has never been written then we have to guess
1663 if not self.exists(ref):
1664 if not predict:
1665 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1667 doDisassembly = self.composites.shouldBeDisassembled(ref)
1669 if doDisassembly:
1671 for component, componentStorage in ref.datasetType.storageClass.components.items():
1672 compRef = ref.makeComponentRef(component)
1673 compLocation, _ = self._determine_put_formatter_location(compRef)
1675 # Add a URI fragment to indicate this is a guess
1676 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted")
1678 else:
1680 location, _ = self._determine_put_formatter_location(ref)
1682 # Add a URI fragment to indicate this is a guess
1683 primary = ResourcePath(location.uri.geturl() + "#predicted")
1685 return primary, components
1687 # If this is a ref that we have written we can get the path.
1688 # Get file metadata and internal metadata
1689 fileLocations = self._get_dataset_locations_info(ref)
1691 guessing = False
1692 if not fileLocations:
1693 if not self.trustGetRequest: 1693 ↛ 1694line 1693 didn't jump to line 1694, because the condition on line 1693 was never true
1694 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1695 fileLocations = self._get_expected_dataset_locations_info(ref)
1696 guessing = True
1698 if len(fileLocations) == 1:
1699 # No disassembly so this is the primary URI
1700 uri = fileLocations[0][0].uri
1701 if guessing and not uri.exists(): 1701 ↛ 1702line 1701 didn't jump to line 1702, because the condition on line 1701 was never true
1702 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1703 primary = uri
1705 else:
1706 for location, storedFileInfo in fileLocations:
1707 if storedFileInfo.component is None: 1707 ↛ 1708line 1707 didn't jump to line 1708, because the condition on line 1707 was never true
1708 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1709 uri = location.uri
1710 if guessing and not uri.exists(): 1710 ↛ 1714line 1710 didn't jump to line 1714, because the condition on line 1710 was never true
1711 # If we are trusting then it is entirely possible for
1712 # some components to be missing. In that case we skip
1713 # to the next component.
1714 if self.trustGetRequest:
1715 continue
1716 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1717 components[storedFileInfo.component] = uri
1719 return primary, components
1721 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1722 """URI to the Dataset.
1724 Parameters
1725 ----------
1726 ref : `DatasetRef`
1727 Reference to the required Dataset.
1728 predict : `bool`
1729 If `True`, allow URIs to be returned for datasets that have not
1730 been written.
1732 Returns
1733 -------
1734 uri : `lsst.resources.ResourcePath`
1735 URI pointing to the dataset within the datastore. If the
1736 dataset does not exist in the datastore, and if ``predict`` is
1737 `True`, the URI will be a prediction and will include a URI
1738 fragment "#predicted".
1739 If the datastore does not have entities that relate well
1740 to the concept of a URI the returned URI will be
1741 descriptive. The returned URI is not guaranteed to be obtainable.
1743 Raises
1744 ------
1745 FileNotFoundError
1746 Raised if a URI has been requested for a dataset that does not
1747 exist and guessing is not allowed.
1748 RuntimeError
1749 Raised if a request is made for a single URI but multiple URIs
1750 are associated with this dataset.
1752 Notes
1753 -----
1754 When a predicted URI is requested an attempt will be made to form
1755 a reasonable URI based on file templates and the expected formatter.
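Examples
--------
A minimal sketch, assuming ``datastore`` and a resolved ``ref`` for a
dataset stored as a single file artifact::

    # Raises RuntimeError if the dataset was disassembled into components.
    uri = datastore.getURI(ref)
    with uri.as_local() as local_uri:
        print(local_uri.ospath)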
1756 """
1757 primary, components = self.getURIs(ref, predict)
1758 if primary is None or components: 1758 ↛ 1759line 1758 didn't jump to line 1759, because the condition on line 1758 was never true
1759 raise RuntimeError(
1760 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1761 )
1762 return primary
1764 def retrieveArtifacts(
1765 self,
1766 refs: Iterable[DatasetRef],
1767 destination: ResourcePath,
1768 transfer: str = "auto",
1769 preserve_path: bool = True,
1770 overwrite: bool = False,
1771 ) -> List[ResourcePath]:
1772 """Retrieve the file artifacts associated with the supplied refs.
1774 Parameters
1775 ----------
1776 refs : iterable of `DatasetRef`
1777 The datasets for which file artifacts are to be retrieved.
1778 A single ref can result in multiple files. The refs must
1779 be resolved.
1780 destination : `lsst.resources.ResourcePath`
1781 Location to write the file artifacts.
1782 transfer : `str`, optional
1783 Method to use to transfer the artifacts. Must be one of the options
1784 supported by `lsst.resources.ResourcePath.transfer_from()`.
1785 "move" is not allowed.
1786 preserve_path : `bool`, optional
1787 If `True` the full path of the file artifact within the datastore
1788 is preserved. If `False` the final file component of the path
1789 is used.
1790 overwrite : `bool`, optional
1791 If `True` allow transfers to overwrite existing files at the
1792 destination.
1794 Returns
1795 -------
1796 targets : `list` of `lsst.resources.ResourcePath`
1797 URIs of file artifacts in destination location. Order is not
1798 preserved.
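Examples
--------
A minimal sketch, assuming ``datastore``, resolved ``refs``, and a
writable local directory (the directory name here is illustrative)::

    from lsst.resources import ResourcePath

    # "artifact_dump/" is an assumed local destination directory.
    destination = ResourcePath("artifact_dump/", forceDirectory=True)
    targets = datastore.retrieveArtifacts(refs, destination, transfer="copy")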
1799 """
1800 if not destination.isdir(): 1800 ↛ 1801line 1800 didn't jump to line 1801, because the condition on line 1800 was never true
1801 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1803 if transfer == "move":
1804 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1806 # Source -> Destination
1807 # This also helps filter out duplicate DatasetRef in the request
1808 # that will map to the same underlying file transfer.
1809 to_transfer: Dict[ResourcePath, ResourcePath] = {}
1811 for ref in refs:
1812 locations = self._get_dataset_locations_info(ref)
1813 for location, _ in locations:
1814 source_uri = location.uri
1815 target_path: ResourcePathExpression
1816 if preserve_path:
1817 target_path = location.pathInStore
1818 if target_path.isabs(): 1818 ↛ 1821line 1818 didn't jump to line 1821, because the condition on line 1818 was never true
1819 # This is an absolute path to an external file.
1820 # Use the full path.
1821 target_path = target_path.relativeToPathRoot
1822 else:
1823 target_path = source_uri.basename()
1824 target_uri = destination.join(target_path)
1825 to_transfer[source_uri] = target_uri
1827 # In theory can now parallelize the transfer
1828 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1829 for source_uri, target_uri in to_transfer.items():
1830 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1832 return list(to_transfer.values())
1834 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1835 """Load an InMemoryDataset from the store.
1837 Parameters
1838 ----------
1839 ref : `DatasetRef`
1840 Reference to the required Dataset.
1841 parameters : `dict`
1842 `StorageClass`-specific parameters that specify, for example,
1843 a slice of the dataset to be loaded.
1845 Returns
1846 -------
1847 inMemoryDataset : `object`
1848 Requested dataset or slice thereof as an InMemoryDataset.
1850 Raises
1851 ------
1852 FileNotFoundError
1853 Requested dataset can not be retrieved.
1854 TypeError
1855 Return value from formatter has unexpected type.
1856 ValueError
1857 Formatter failed to process the dataset.
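Examples
--------
A minimal sketch, assuming ``datastore`` and a resolved ``ref``; any
``parameters`` supplied must be valid for the dataset's `StorageClass`::

    # Reads the artifact(s) and assembles the in-memory dataset.
    inMemoryDataset = datastore.get(ref)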
1858 """
1859 allGetInfo = self._prepare_for_get(ref, parameters)
1860 refComponent = ref.datasetType.component()
1862 # Supplied storage class for the component being read
1863 refStorageClass = ref.datasetType.storageClass
1865 # Create mapping from component name to related info
1866 allComponents = {i.component: i for i in allGetInfo}
1868 # By definition the dataset is disassembled if we have more
1869 # than one record for it.
1870 isDisassembled = len(allGetInfo) > 1
1872 # Look for the special case where we are disassembled but the
1873 # component is a derived component that was not written during
1874 # disassembly. For this scenario we need to check that the
1875 # component requested is listed as a derived component for the
1876 # composite storage class
1877 isDisassembledReadOnlyComponent = False
1878 if isDisassembled and refComponent:
1879 # The composite storage class should be accessible through
1880 # the component dataset type
1881 compositeStorageClass = ref.datasetType.parentStorageClass
1883 # In the unlikely scenario where the composite storage
1884 # class is not known, we can only assume that this is a
1885 # normal component. If that assumption is wrong then the
1886 # branch below that reads a persisted component will fail
1887 # so there is no need to complain here.
1888 if compositeStorageClass is not None: 1888 ↛ 1891line 1888 didn't jump to line 1891, because the condition on line 1888 was never false
1889 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1891 if isDisassembled and not refComponent:
1892 # This was a disassembled dataset spread over multiple files
1893 # and we need to put them all back together again.
1894 # Read into memory and then assemble
1896 # Check that the supplied parameters are suitable for the type read
1897 refStorageClass.validateParameters(parameters)
1899 # We want to keep track of all the parameters that were not used
1900 # by formatters. We assume that if any of the component formatters
1901 # use a parameter that we do not need to apply it again in the
1902 # assembler.
1903 usedParams = set()
1905 components: Dict[str, Any] = {}
1906 for getInfo in allGetInfo:
1907 # assemblerParams are parameters not understood by the
1908 # associated formatter.
1909 usedParams.update(set(getInfo.formatterParams))
1911 component = getInfo.component
1913 if component is None: 1913 ↛ 1914line 1913 didn't jump to line 1914, because the condition on line 1913 was never true
1914 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1916 # We do not want the formatter to think it's reading
1917 # a component though because it is really reading a
1918 # standalone dataset -- always tell reader it is not a
1919 # component.
1920 components[component] = self._read_artifact_into_memory(
1921 getInfo, ref.makeComponentRef(component), isComponent=False
1922 )
1924 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1926 # Any unused parameters will have to be passed to the assembler
1927 if parameters:
1928 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1929 else:
1930 unusedParams = {}
1932 # Process parameters
1933 return ref.datasetType.storageClass.delegate().handleParameters(
1934 inMemoryDataset, parameters=unusedParams
1935 )
1937 elif isDisassembledReadOnlyComponent:
1939 compositeStorageClass = ref.datasetType.parentStorageClass
1940 if compositeStorageClass is None: 1940 ↛ 1941line 1940 didn't jump to line 1941, because the condition on line 1940 was never true
1941 raise RuntimeError(
1942 f"Unable to retrieve derived component '{refComponent}' since"
1943 "no composite storage class is available."
1944 )
1946 if refComponent is None: 1946 ↛ 1948line 1946 didn't jump to line 1948, because the condition on line 1946 was never true
1947 # Mainly for mypy
1948 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1950 # Assume that every derived component can be calculated by
1951 # forwarding the request to a single read/write component.
1952 # Rather than guessing which rw component is the right one by
1953 # scanning each for a derived component of the same name,
1954 # we ask the storage class delegate directly which one is best to
1955 # use.
1956 compositeDelegate = compositeStorageClass.delegate()
1957 forwardedComponent = compositeDelegate.selectResponsibleComponent(
1958 refComponent, set(allComponents)
1959 )
1961 # Select the relevant component
1962 rwInfo = allComponents[forwardedComponent]
1964 # For now assume that read parameters are validated against
1965 # the real component and not the requested component
1966 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1967 forwardedStorageClass.validateParameters(parameters)
1969 # The reference to use for the caching must refer to the forwarded
1970 # component and not the derived component.
1971 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
1973 # Unfortunately the FileDescriptor inside the formatter will have
1974 # the wrong write storage class so we need to create a new one
1975 # given the immutability constraint.
1976 writeStorageClass = rwInfo.info.storageClass
1978 # We may need to put some thought into parameters for read
1979 # components but for now forward them on as is
1980 readFormatter = type(rwInfo.formatter)(
1981 FileDescriptor(
1982 rwInfo.location,
1983 readStorageClass=refStorageClass,
1984 storageClass=writeStorageClass,
1985 parameters=parameters,
1986 ),
1987 ref.dataId,
1988 )
1990 # The assembler can not receive any parameter requests for a
1991 # derived component at this time since the assembler will
1992 # see the storage class of the derived component and those
1993 # parameters will have to be handled by the formatter on the
1994 # forwarded storage class.
1995 assemblerParams: Dict[str, Any] = {}
1997 # Need to create a new info that specifies the derived
1998 # component and associated storage class
1999 readInfo = DatastoreFileGetInformation(
2000 rwInfo.location,
2001 readFormatter,
2002 rwInfo.info,
2003 assemblerParams,
2004 {},
2005 refComponent,
2006 refStorageClass,
2007 )
2009 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2011 else:
2012 # Single file request or component from that composite file
2013 for lookup in (refComponent, None): 2013 ↛ 2018line 2013 didn't jump to line 2018, because the loop on line 2013 didn't complete
2014 if lookup in allComponents: 2014 ↛ 2013line 2014 didn't jump to line 2013, because the condition on line 2014 was never false
2015 getInfo = allComponents[lookup]
2016 break
2017 else:
2018 raise FileNotFoundError(
2019 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2020 )
2022 # Do not need the component itself if already disassembled
2023 if isDisassembled:
2024 isComponent = False
2025 else:
2026 isComponent = getInfo.component is not None
2028 # For a component read of a composite we want the cache to
2029 # be looking at the composite ref itself.
2030 cache_ref = ref.makeCompositeRef() if isComponent else ref
2032 # For a disassembled component we can validate parameters against
2033 # the component storage class directly
2034 if isDisassembled:
2035 refStorageClass.validateParameters(parameters)
2036 else:
2037 # For an assembled composite this could be a derived
2038 # component derived from a real component. The validity
2039 # of the parameters is not clear. For now validate against
2040 # the composite storage class
2041 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2043 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
2045 @transactional
2046 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2047 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2049 Parameters
2050 ----------
2051 inMemoryDataset : `object`
2052 The dataset to store.
2053 ref : `DatasetRef`
2054 Reference to the associated Dataset.
2056 Raises
2057 ------
2058 TypeError
2059 Supplied object and storage class are inconsistent.
2060 DatasetTypeNotSupportedError
2061 The associated `DatasetType` is not handled by this datastore.
2063 Notes
2064 -----
2065 If the datastore is configured to reject certain dataset types it
2066 is possible that the put will fail and raise a
2067 `DatasetTypeNotSupportedError`. The main use case for this is to
2068 allow `ChainedDatastore` to put to multiple datastores without
2069 requiring that every datastore accepts the dataset.
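Examples
--------
A minimal sketch, assuming ``datastore``, a resolved ``ref``, and an
``inMemoryDataset`` compatible with the ref's `StorageClass`::

    # The dataset may be written as one file or disassembled into
    # per-component files depending on the composites configuration.
    datastore.put(inMemoryDataset, ref)
    assert datastore.exists(ref)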
2070 """
2072 doDisassembly = self.composites.shouldBeDisassembled(ref)
2073 # doDisassembly = True
2075 artifacts = []
2076 if doDisassembly:
2077 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2078 if components is None: 2078 ↛ 2079line 2078 didn't jump to line 2079, because the condition on line 2078 was never true
2079 raise RuntimeError(
2080 f"Inconsistent configuration: dataset type {ref.datasetType.name} "
2081 f"with storage class {ref.datasetType.storageClass.name} "
2082 "is configured to be disassembled, but cannot be."
2083 )
2084 for component, componentInfo in components.items():
2085 # Don't recurse because we want to take advantage of
2086 # bulk insert -- need a new DatasetRef that refers to the
2087 # same dataset_id but has the component DatasetType.
2088 # DatasetType does not refer to the types of components,
2089 # so we construct one ourselves.
2090 compRef = ref.makeComponentRef(component)
2091 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2092 artifacts.append((compRef, storedInfo))
2093 else:
2094 # Write the entire thing out
2095 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2096 artifacts.append((ref, storedInfo))
2098 self._register_datasets(artifacts)
2100 @transactional
2101 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
2102 # At this point can safely remove these datasets from the cache
2103 # to avoid confusion later on. If they are not trashed later
2104 # the cache will simply be refilled.
2105 self.cacheManager.remove_from_cache(ref)
2107 # If we are in trust mode there will be nothing to move to
2108 # the trash table and we will have to try to delete the file
2109 # immediately.
2110 if self.trustGetRequest:
2111 # Try to keep the logic below for a single file trash.
2112 if isinstance(ref, DatasetRef):
2113 refs = {ref}
2114 else:
2115 # Will recreate ref at the end of this branch.
2116 refs = set(ref)
2118 # Determine which datasets are known to datastore directly.
2119 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
2120 existing_ids = self._get_stored_records_associated_with_refs(refs)
2121 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2123 missing = refs - existing_refs
2124 if missing:
2125 # Do an explicit existence check on these refs.
2126 # We only care about the artifacts at this point and not
2127 # the dataset existence.
2128 artifact_existence: Dict[ResourcePath, bool] = {}
2129 _ = self.mexists(missing, artifact_existence)
2130 uris = [uri for uri, exists in artifact_existence.items() if exists]
2132 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2133 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2134 for uri in uris:
2135 try:
2136 uri.remove()
2137 except Exception as e:
2138 if ignore_errors:
2139 log.debug("Artifact %s could not be removed: %s", uri, e)
2140 continue
2141 raise
2143 # There is no point asking the code below to remove refs we
2144 # know are missing so update it with the list of existing
2145 # records. Try to retain one vs many logic.
2146 if not existing_refs:
2147 # Nothing more to do since none of the datasets were
2148 # known to the datastore record table.
2149 return
2150 ref = list(existing_refs)
2151 if len(ref) == 1:
2152 ref = ref[0]
2154 # Get file metadata and internal metadata
2155 if not isinstance(ref, DatasetRef):
2156 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2157 # Assumed to be an iterable of refs so bulk mode enabled.
2158 try:
2159 self.bridge.moveToTrash(ref)
2160 except Exception as e:
2161 if ignore_errors:
2162 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2163 else:
2164 raise
2165 return
2167 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2169 fileLocations = self._get_dataset_locations_info(ref)
2171 if not fileLocations:
2172 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2173 if ignore_errors:
2174 log.warning(err_msg)
2175 return
2176 else:
2177 raise FileNotFoundError(err_msg)
2179 for location, storedFileInfo in fileLocations:
2180 if not self._artifact_exists(location): 2180 ↛ 2181line 2180 didn't jump to line 2181
2181 err_msg = (
2182 f"Dataset is known to datastore {self.name} but "
2183 f"associated artifact ({location.uri}) is missing"
2184 )
2185 if ignore_errors:
2186 log.warning(err_msg)
2187 return
2188 else:
2189 raise FileNotFoundError(err_msg)
2191 # Mark dataset as trashed
2192 try:
2193 self.bridge.moveToTrash([ref])
2194 except Exception as e:
2195 if ignore_errors:
2196 log.warning(
2197 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2198 "but encountered an error: %s",
2199 ref,
2200 self.name,
2201 e,
2202 )
2203 pass
2204 else:
2205 raise
2207 @transactional
2208 def emptyTrash(self, ignore_errors: bool = True) -> None:
2209 """Remove all datasets from the trash.
2211 Parameters
2212 ----------
2213 ignore_errors : `bool`
2214 If `True` return without error even if something went wrong.
2215 Problems could occur if another process is simultaneously trying
2216 to delete.
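Examples
--------
A minimal sketch, assuming ``datastore`` and resolved ``refs`` whose
artifacts should be deleted::

    # trash() only marks the datasets; emptyTrash() removes the artifacts.
    datastore.trash(refs)
    datastore.emptyTrash()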
2217 """
2218 log.debug("Emptying trash in datastore %s", self.name)
2220 # Context manager will empty trash iff we finish it without raising.
2221 # It will also automatically delete the relevant rows from the
2222 # trash table and the records table.
2223 with self.bridge.emptyTrash(
2224 self._table, record_class=StoredFileInfo, record_column="path"
2225 ) as trash_data:
2226 # Removing the artifacts themselves requires that the files are
2227 # not also associated with refs that are not to be trashed.
2228 # Therefore need to do a query with the file paths themselves
2229 # and return all the refs associated with them. Can only delete
2230 # a file if the refs to be trashed are the only refs associated
2231 # with the file.
2232 # This requires multiple copies of the trashed items
2233 trashed, artifacts_to_keep = trash_data
2235 if artifacts_to_keep is None:
2236 # The bridge is not helping us so have to work it out
2237 # ourselves. This is not going to be as efficient.
2238 trashed = list(trashed)
2240 # The instance check is for mypy since up to this point it
2241 # does not know the type of info.
2242 path_map = self._refs_associated_with_artifacts(
2243 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2244 )
2246 for ref, info in trashed:
2248 # Mypy needs to know this is not the base class
2249 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2251 # Check for mypy
2252 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2254 path_map[info.path].remove(ref.id)
2255 if not path_map[info.path]: 2255 ↛ 2246line 2255 didn't jump to line 2246, because the condition on line 2255 was never false
2256 del path_map[info.path]
2258 artifacts_to_keep = set(path_map)
2260 for ref, info in trashed:
2262 # Should not happen for this implementation but need
2263 # to keep mypy happy.
2264 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2266 # Mypy needs to know this is not the base class
2267 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2269 # Check for mypy
2270 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2272 if info.path in artifacts_to_keep:
2273 # This is a multi-dataset artifact and we are not
2274 # removing all associated refs.
2275 continue
2277 # Only trashed refs still known to datastore will be returned.
2278 location = info.file_location(self.locationFactory)
2280 # Point of no return for this artifact
2281 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2282 try:
2283 self._delete_artifact(location)
2284 except FileNotFoundError:
2285 # If the file itself has been deleted there is nothing
2286 # we can do about it. It is possible that trash has
2287 # been run in parallel in another process or someone
2288 # decided to delete the file. It is unlikely to come
2289 # back and so we should still continue with the removal
2290 # of the entry from the trash table. It is also possible
2291 # we removed it in a previous iteration if it was
2292 # a multi-dataset artifact. The delete artifact method
2293 # will log a debug message in this scenario.
2294 # Distinguishing file missing before trash started and
2295 # file already removed previously as part of this trash
2296 # is not worth the distinction with regards to potential
2297 # memory cost.
2298 pass
2299 except Exception as e:
2300 if ignore_errors:
2301 # Use a debug message here even though it's not
2302 # a good situation. In some cases this can be
2303 # caused by a race between user A and user B
2304 # and neither of them has permissions for the
2305 # other's files. Butler does not know about users
2306 # and trash has no idea what collections these
2307 # files were in (without guessing from a path).
2308 log.debug(
2309 "Encountered error removing artifact %s from datastore %s: %s",
2310 location.uri,
2311 self.name,
2312 e,
2313 )
2314 else:
2315 raise
2317 @transactional
2318 def transfer_from(
2319 self,
2320 source_datastore: Datastore,
2321 refs: Iterable[DatasetRef],
2322 local_refs: Optional[Iterable[DatasetRef]] = None,
2323 transfer: str = "auto",
2324 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
2325 ) -> None:
2326 # Docstring inherited
2327 if type(self) is not type(source_datastore):
2328 raise TypeError(
2329 f"Datastore mismatch between this datastore ({type(self)}) and the "
2330 f"source datastore ({type(source_datastore)})."
2331 )
2333 # Be explicit for mypy
2334 if not isinstance(source_datastore, FileDatastore): 2334 ↛ 2335line 2334 didn't jump to line 2335, because the condition on line 2334 was never true
2335 raise TypeError(
2336 "Can only transfer to a FileDatastore from another FileDatastore, not"
2337 f" {type(source_datastore)}"
2338 )
2340 # Stop early if "direct" transfer mode is requested. That would
2341 # require that the URI inside the source datastore should be stored
2342 # directly in the target datastore, which seems unlikely to be useful
2343 # since at any moment the source datastore could delete the file.
2344 if transfer in ("direct", "split"):
2345 raise ValueError(
2346 f"Can not transfer from a source datastore using {transfer} mode since"
2347 " those files are controlled by the other datastore."
2348 )
2350 # Empty existence lookup if none given.
2351 if artifact_existence is None:
2352 artifact_existence = {}
2354 # We will go through the list multiple times so must convert
2355 # generators to lists.
2356 refs = list(refs)
2358 if local_refs is None:
2359 local_refs = refs
2360 else:
2361 local_refs = list(local_refs)
2363 # In order to handle disassembled composites the code works
2364 # at the records level since it can assume that internal APIs
2365 # can be used.
2366 # - If the record already exists in the destination this is assumed
2367 # to be okay.
2368 # - If there is no record but the source and destination URIs are
2369 # identical no transfer is done but the record is added.
2370 # - If the source record refers to an absolute URI currently assume
2371 # that that URI should remain absolute and will be visible to the
2372 # destination butler. May need to have a flag to indicate whether
2373 # the dataset should be transferred. This will only happen if
2374 # the detached Butler has had a local ingest.
2376 # What we really want is all the records in the source datastore
2377 # associated with these refs. Or derived ones if they don't exist
2378 # in the source.
2379 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2381 # The source dataset_ids are the keys in these records
2382 source_ids = set(source_records)
2383 log.debug("Number of datastore records found in source: %d", len(source_ids))
2385 # The not None check is to appease mypy
2386 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2387 missing_ids = requested_ids - source_ids
2389 # Missing IDs can be okay if that datastore has allowed
2390 # gets based on file existence. Should we transfer what we can
2391 # or complain about it and warn?
2392 if missing_ids and not source_datastore.trustGetRequest: 2392 ↛ 2393line 2392 didn't jump to line 2393, because the condition on line 2392 was never true
2393 raise ValueError(
2394 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2395 )
2397 # Need to map these missing IDs to a DatasetRef so we can guess
2398 # the details.
2399 if missing_ids:
2400 log.info(
2401 "Number of expected datasets missing from source datastore records: %d out of %d",
2402 len(missing_ids),
2403 len(requested_ids),
2404 )
2405 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2407 # This should be chunked in case we end up having to check
2408 # the file store since we need some log output to show
2409 # progress.
2410 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2411 records = {}
2412 for missing in missing_ids_chunk:
2413 # Ask the source datastore where the missing artifacts
2414 # should be. An execution butler might not know about the
2415 # artifacts even if they are there.
2416 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2417 records[missing] = [info for _, info in expected]
2419 # Call the mexist helper method in case we have not already
2420 # checked these artifacts such that artifact_existence is
2421 # empty. This allows us to benefit from parallelism.
2422 # datastore.mexists() itself does not give us access to the
2423 # derived datastore record.
2424 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2425 ref_exists = source_datastore._process_mexists_records(
2426 id_to_ref, records, False, artifact_existence=artifact_existence
2427 )
2429 # Now go through the records and propagate the ones that exist.
2430 location_factory = source_datastore.locationFactory
2431 for missing, record_list in records.items():
2432 # Skip completely if the ref does not exist.
2433 ref = id_to_ref[missing]
2434 if not ref_exists[ref]:
2435 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2436 continue
2437 # Check for file artifact to decide which parts of a
2438 # disassembled composite do exist. If there is only a
2439 # single record we don't even need to look because it can't
2440 # be a composite and must exist.
2441 if len(record_list) == 1:
2442 dataset_records = record_list
2443 else:
2444 dataset_records = [
2445 record
2446 for record in record_list
2447 if artifact_existence[record.file_location(location_factory).uri]
2448 ]
2449 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2451 # Rely on source_records being a defaultdict.
2452 source_records[missing].extend(dataset_records)
2454 # See if we already have these records
2455 target_records = self._get_stored_records_associated_with_refs(local_refs)
2457 # The artifacts to register
2458 artifacts = []
2460 # Refs that already exist
2461 already_present = []
2463 # Now can transfer the artifacts
2464 for source_ref, target_ref in zip(refs, local_refs):
2465 if target_ref.id in target_records:
2466 # Already have an artifact for this.
2467 already_present.append(target_ref)
2468 continue
2470 # mypy needs to know these are always resolved refs
2471 for info in source_records[source_ref.getCheckedId()]:
2472 source_location = info.file_location(source_datastore.locationFactory)
2473 target_location = info.file_location(self.locationFactory)
2474 if source_location == target_location: 2474 ↛ 2478line 2474 didn't jump to line 2478, because the condition on line 2474 was never true
2475 # Either the dataset is already in the target datastore
2476 # (which is how execution butler currently runs) or
2477 # it is an absolute URI.
2478 if source_location.pathInStore.isabs():
2479 # Just because we can see the artifact when running
2480 # the transfer doesn't mean it will be generally
2481 # accessible to a user of this butler. For now warn
2482 # but assume it will be accessible.
2483 log.warning(
2484 "Transfer request for an outside-datastore artifact has been found at %s",
2485 source_location,
2486 )
2487 else:
2488 # Need to transfer it to the new location.
2489 # Assume we should always overwrite. If the artifact
2490 # is there this might indicate that a previous transfer
2491 # was interrupted but was not able to be rolled back
2493 # completely (e.g. pre-emption) so follow Datastore default
2493 # and overwrite.
2494 target_location.uri.transfer_from(
2495 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2496 )
2498 artifacts.append((target_ref, info))
2500 self._register_datasets(artifacts)
2502 if already_present:
2503 n_skipped = len(already_present)
2504 log.info(
2505 "Skipped transfer of %d dataset%s already present in datastore",
2506 n_skipped,
2507 "" if n_skipped == 1 else "s",
2508 )
2510 @transactional
2511 def forget(self, refs: Iterable[DatasetRef]) -> None:
2512 # Docstring inherited.
2513 refs = list(refs)
2514 self.bridge.forget(refs)
2515 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2517 def validateConfiguration(
2518 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
2519 ) -> None:
2520 """Validate some of the configuration for this datastore.
2522 Parameters
2523 ----------
2524 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2525 Entities to test against this configuration. Can be differing
2526 types.
2527 logFailures : `bool`, optional
2528 If `True`, output a log message for every validation error
2529 detected.
2531 Raises
2532 ------
2533 DatastoreValidationError
2534 Raised if there is a validation problem with a configuration.
2535 All the problems are reported in a single exception.
2537 Notes
2538 -----
2539 This method checks that all the supplied entities have valid file
2540 templates and also have formatters defined.
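Examples
--------
A minimal sketch, assuming ``datastore`` and an iterable ``entities`` of
`DatasetType` instances::

    from lsst.daf.butler import DatastoreValidationError

    # All template and formatter problems are reported in one exception.
    try:
        datastore.validateConfiguration(entities, logFailures=True)
    except DatastoreValidationError as err:
        print(f"Datastore configuration problems: {err}")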
2541 """
2543 templateFailed = None
2544 try:
2545 self.templates.validateTemplates(entities, logFailures=logFailures)
2546 except FileTemplateValidationError as e:
2547 templateFailed = str(e)
2549 formatterFailed = []
2550 for entity in entities:
2551 try:
2552 self.formatterFactory.getFormatterClass(entity)
2553 except KeyError as e:
2554 formatterFailed.append(str(e))
2555 if logFailures: 2555 ↛ 2550line 2555 didn't jump to line 2550, because the condition on line 2555 was never false
2556 log.critical("Formatter failure: %s", e)
2558 if templateFailed or formatterFailed:
2559 messages = []
2560 if templateFailed: 2560 ↛ 2561line 2560 didn't jump to line 2561, because the condition on line 2560 was never true
2561 messages.append(templateFailed)
2562 if formatterFailed: 2562 ↛ 2564line 2562 didn't jump to line 2564, because the condition on line 2562 was never false
2563 messages.append(",".join(formatterFailed))
2564 msg = ";\n".join(messages)
2565 raise DatastoreValidationError(msg)
2567 def getLookupKeys(self) -> Set[LookupKey]:
2568 # Docstring is inherited from base class
2569 return (
2570 self.templates.getLookupKeys()
2571 | self.formatterFactory.getLookupKeys()
2572 | self.constraints.getLookupKeys()
2573 )
2575 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2576 # Docstring is inherited from base class
2577 # The key can be valid in either formatters or templates so we can
2578 # only check the template if it exists
2579 if lookupKey in self.templates:
2580 try:
2581 self.templates[lookupKey].validateTemplate(entity)
2582 except FileTemplateValidationError as e:
2583 raise DatastoreValidationError(e) from e
2585 def export(
2586 self,
2587 refs: Iterable[DatasetRef],
2588 *,
2589 directory: Optional[ResourcePathExpression] = None,
2590 transfer: Optional[str] = "auto",
2591 ) -> Iterable[FileDataset]:
2592 # Docstring inherited from Datastore.export.
2593 if transfer is not None and directory is None: 2593 ↛ 2594line 2593 didn't jump to line 2594, because the condition on line 2593 was never true
2594 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2596 # Force the directory to be a URI object
2597 directoryUri: Optional[ResourcePath] = None
2598 if directory is not None: 2598 ↛ 2601line 2598 didn't jump to line 2601, because the condition on line 2598 was never false
2599 directoryUri = ResourcePath(directory, forceDirectory=True)
2601 if transfer is not None and directoryUri is not None: 2601 ↛ 2606line 2601 didn't jump to line 2606, because the condition on line 2601 was never false
2602 # mypy needs the second test
2603 if not directoryUri.exists(): 2603 ↛ 2604line 2603 didn't jump to line 2604, because the condition on line 2603 was never true
2604 raise FileNotFoundError(f"Export location {directory} does not exist")
2606 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2607 for ref in progress.wrap(refs, "Exporting dataset files"):
2608 fileLocations = self._get_dataset_locations_info(ref)
2609 if not fileLocations: 2609 ↛ 2610line 2609 didn't jump to line 2610, because the condition on line 2609 was never true
2610 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2611 # For now we can not export disassembled datasets
2612 if len(fileLocations) > 1:
2613 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2614 location, storedFileInfo = fileLocations[0]
2616 pathInStore = location.pathInStore.path
2617 if transfer is None: 2617 ↛ 2621line 2617 didn't jump to line 2621, because the condition on line 2617 was never true
2618 # TODO: do we also need to return the readStorageClass somehow?
2619 # We will use the path in store directly. If this is an
2620 # absolute URI, preserve it.
2621 if location.pathInStore.isabs():
2622 pathInStore = str(location.uri)
2623 elif transfer == "direct": 2623 ↛ 2625line 2623 didn't jump to line 2625, because the condition on line 2623 was never true
2624 # Use full URIs to the remote store in the export
2625 pathInStore = str(location.uri)
2626 else:
2627 # mypy needs help
2628 assert directoryUri is not None, "directoryUri must be defined to get here"
2629 storeUri = ResourcePath(location.uri)
2631 # if the datastore has an absolute URI to a resource, we
2632 # have two options:
2633 # 1. Keep the absolute URI in the exported YAML
2634 # 2. Allocate a new name in the local datastore and transfer
2635 # it.
2636 # For now go with option 2
2637 if location.pathInStore.isabs(): 2637 ↛ 2638line 2637 didn't jump to line 2638, because the condition on line 2637 was never true
2638 template = self.templates.getTemplate(ref)
2639 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2640 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2642 exportUri = directoryUri.join(pathInStore)
2643 exportUri.transfer_from(storeUri, transfer=transfer)
2645 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2647 @staticmethod
2648 def computeChecksum(
2649 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192
2650 ) -> Optional[str]:
2651 """Compute the checksum of the supplied file.
2653 Parameters
2654 ----------
2655 uri : `lsst.resources.ResourcePath`
2656 Name of resource to calculate checksum from.
2657 algorithm : `str`, optional
2658 Name of algorithm to use. Must be one of the algorithms supported
2659 by :py:mod:`hashlib`.
2660 block_size : `int`
2661 Number of bytes to read from file at one time.
2663 Returns
2664 -------
2665 hexdigest : `str`
2666 Hex digest of the file.
2668 Notes
2669 -----
2670 Currently returns None if the URI is for a remote resource.
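Examples
--------
A minimal sketch, assuming a local file named ``data.fits`` exists (the
file name is illustrative)::

    from lsst.resources import ResourcePath

    # Any hashlib-guaranteed algorithm name is accepted, e.g. "md5".
    checksum = FileDatastore.computeChecksum(
        ResourcePath("data.fits"), algorithm="md5"
    )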
2671 """
2672 if algorithm not in hashlib.algorithms_guaranteed: 2672 ↛ 2673line 2672 didn't jump to line 2673, because the condition on line 2672 was never true
2673 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2675 if not uri.isLocal: 2675 ↛ 2676line 2675 didn't jump to line 2676, because the condition on line 2675 was never true
2676 return None
2678 hasher = hashlib.new(algorithm)
2680 with uri.as_local() as local_uri:
2681 with open(local_uri.ospath, "rb") as f:
2682 for chunk in iter(lambda: f.read(block_size), b""):
2683 hasher.update(chunk)
2685 return hasher.hexdigest()
2687 def needs_expanded_data_ids(
2688 self,
2689 transfer: Optional[str],
2690 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2691 ) -> bool:
2692 # Docstring inherited.
2693 # This _could_ also use entity to inspect whether the filename template
2694 # involves placeholders other than the required dimensions for its
2695 # dataset type, but that's not necessary for correctness; it just
2696 # enables more optimizations (perhaps only in theory).
2697 return transfer not in ("direct", None)
2699 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
2700 # Docstring inherited from the base class.
2701 record_data = data.get(self.name)
2702 if not record_data: 2702 ↛ 2703line 2702 didn't jump to line 2703, because the condition on line 2702 was never true
2703 return
2705 if record_data.refs: 2705 ↛ 2709line 2705 didn't jump to line 2709, because the condition on line 2705 was never false
2706 self._bridge.insert(record_data.refs)
2708 # TODO: Verify that there are no unexpected table names in the dict?
2709 records = record_data.records.get(self._table.name)
2710 if records: 2710 ↛ exitline 2710 didn't return from function 'import_records', because the condition on line 2710 was never false
2711 unpacked_records = []
2712 for info in records:
2713 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records"
2714 unpacked_records.append(info.to_record())
2715 self._table.insert(*unpacked_records)
2717 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
2718 # Docstring inherited from the base class.
2719 exported_refs = list(self._bridge.check(refs))
2721 id2ref = {ref.id: ref for ref in exported_refs}
2722 rows = self._table.fetch(dataset_id=list(id2ref.keys()))
2723 records: List[StoredDatastoreItemInfo] = [StoredFileInfo.from_record(row) for row in rows]
2725 record_data = DatastoreRecordData(refs=exported_refs, records={self._table.name: records})
2726 return {self.name: record_data}
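# Illustrative sketch (assumed usage, not taken from this module's callers):
# records exported from one FileDatastore can be imported into another
# datastore that uses the same datastore name and a compatible
# configuration, e.g.
#
#     record_data = source_datastore.export_records(refs)
#     target_datastore.import_records(record_data)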