Coverage for python/lsst/daf/butler/datastores/fileDatastore.py : 80%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from collections import defaultdict
35from dataclasses import dataclass
36from typing import (
37 TYPE_CHECKING,
38 Any,
39 ClassVar,
40 Dict,
41 Iterable,
42 List,
43 Mapping,
44 Optional,
45 Set,
46 Tuple,
47 Type,
48 Union,
49)
51from lsst.daf.butler import (
52 ButlerURI,
53 CompositesMap,
54 Config,
55 FileDataset,
56 DatasetId,
57 DatasetRef,
58 DatasetType,
59 DatasetTypeNotSupportedError,
60 Datastore,
61 DatastoreCacheManager,
62 DatastoreDisabledCacheManager,
63 DatastoreConfig,
64 DatastoreValidationError,
65 FileDescriptor,
66 FileTemplates,
67 FileTemplateValidationError,
68 Formatter,
69 FormatterFactory,
70 Location,
71 LocationFactory,
72 Progress,
73 StorageClass,
74 StoredFileInfo,
75)
77from lsst.daf.butler import ddl
78from lsst.daf.butler.registry.interfaces import (
79 ReadOnlyDatabaseError,
80 DatastoreRegistryBridge,
81)
83from lsst.daf.butler.core.repoRelocation import replaceRoot
84from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
85from .genericDatastore import GenericBaseDatastore
87if TYPE_CHECKING: 87 ↛ 88line 87 didn't jump to line 88, because the condition on line 87 was never true
88 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager
89 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
91log = logging.getLogger(__name__)
94class _IngestPrepData(Datastore.IngestPrepData):
95 """Helper class for FileDatastore ingest implementation.
97 Parameters
98 ----------
99 datasets : `list` of `FileDataset`
100 Files to be ingested by this datastore.
101 """
102 def __init__(self, datasets: List[FileDataset]):
103 super().__init__(ref for dataset in datasets for ref in dataset.refs)
104 self.datasets = datasets
107@dataclass(frozen=True)
108class DatastoreFileGetInformation:
109 """Collection of useful parameters needed to retrieve a file from
110 a Datastore.
111 """
113 location: Location
114 """The location from which to read the dataset."""
116 formatter: Formatter
117 """The `Formatter` to use to deserialize the dataset."""
119 info: StoredFileInfo
120 """Stored information about this file and its formatter."""
122 assemblerParams: Dict[str, Any]
123 """Parameters to use for post-processing the retrieved dataset."""
125 formatterParams: Dict[str, Any]
126 """Parameters that were understood by the associated formatter."""
128 component: Optional[str]
129 """The component to be retrieved (can be `None`)."""
131 readStorageClass: StorageClass
132 """The `StorageClass` of the dataset being read."""
135class FileDatastore(GenericBaseDatastore):
136 """Generic Datastore for file-based implementations.
138 Should always be sub-classed since key abstract methods are missing.
140 Parameters
141 ----------
142 config : `DatastoreConfig` or `str`
143 Configuration as either a `Config` object or URI to file.
144 bridgeManager : `DatastoreRegistryBridgeManager`
145 Object that manages the interface between `Registry` and datastores.
146 butlerRoot : `str`, optional
147 New datastore root to use to override the configuration value.
149 Raises
150 ------
151 ValueError
152 If root location does not exist and ``create`` is `False` in the
153 configuration.
154 """
156 defaultConfigFile: ClassVar[Optional[str]] = None
157 """Path to configuration defaults. Accessed within the ``config`` resource
158 or relative to a search path. Can be None if no defaults specified.
159 """
161 root: ButlerURI
162 """Root directory URI of this `Datastore`."""
164 locationFactory: LocationFactory
165 """Factory for creating locations relative to the datastore root."""
167 formatterFactory: FormatterFactory
168 """Factory for creating instances of formatters."""
170 templates: FileTemplates
171 """File templates that can be used by this `Datastore`."""
173 composites: CompositesMap
174 """Determines whether a dataset should be disassembled on put."""
176 defaultConfigFile = "datastores/fileDatastore.yaml"
177 """Path to configuration defaults. Accessed within the ``config`` resource
178 or relative to a search path. Can be None if no defaults specified.
179 """
181 @classmethod
182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
183 """Set any filesystem-dependent config options for this Datastore to
184 be appropriate for a new empty repository with the given root.
186 Parameters
187 ----------
188 root : `str`
189 URI to the root of the data repository.
190 config : `Config`
191 A `Config` to update. Only the subset understood by
192 this component will be updated. Will not expand
193 defaults.
194 full : `Config`
195 A complete config with all defaults expanded that can be
196 converted to a `DatastoreConfig`. Read-only and will not be
197 modified by this method.
198 Repository-specific options that should not be obtained
199 from defaults when Butler instances are constructed
200 should be copied from ``full`` to ``config``.
201 overwrite : `bool`, optional
202 If `False`, do not modify a value in ``config`` if the value
203 already exists. Default is always to overwrite with the provided
204 ``root``.
206 Notes
207 -----
208 If a keyword is explicitly defined in the supplied ``config`` it
209 will not be overridden by this method if ``overwrite`` is `False`.
210 This allows explicit values set in external configs to be retained.
211 """
212 Config.updateParameters(DatastoreConfig, config, full,
213 toUpdate={"root": root},
214 toCopy=("cls", ("records", "table")), overwrite=overwrite)
216 @classmethod
217 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
218 return ddl.TableSpec(
219 fields=[
220 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
221 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
222 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
223 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
224 # Use empty string to indicate no component
225 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
226 # TODO: should checksum be Base64Bytes instead?
227 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
228 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
229 ],
230 unique=frozenset(),
231 indexes=[tuple(["path"])],
232 )
234 def __init__(self, config: Union[DatastoreConfig, str],
235 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
236 super().__init__(config, bridgeManager)
237 if "root" not in self.config: 237 ↛ 238line 237 didn't jump to line 238, because the condition on line 237 was never true
238 raise ValueError("No root directory specified in configuration")
240 # Name ourselves either using an explicit name or a name
241 # derived from the (unexpanded) root
242 if "name" in self.config:
243 self.name = self.config["name"]
244 else:
245 # We use the unexpanded root in the name to indicate that this
246 # datastore can be moved without having to update registry.
247 self.name = "{}@{}".format(type(self).__name__,
248 self.config["root"])
250 # Support repository relocation in config
251 # Existence of self.root is checked in subclass
252 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
253 forceDirectory=True, forceAbsolute=True)
255 self.locationFactory = LocationFactory(self.root)
256 self.formatterFactory = FormatterFactory()
258 # Now associate formatters with storage classes
259 self.formatterFactory.registerFormatters(self.config["formatters"],
260 universe=bridgeManager.universe)
262 # Read the file naming templates
263 self.templates = FileTemplates(self.config["templates"],
264 universe=bridgeManager.universe)
266 # See if composites should be disassembled
267 self.composites = CompositesMap(self.config["composites"],
268 universe=bridgeManager.universe)
270 tableName = self.config["records", "table"]
271 try:
272 # Storage of paths and formatters, keyed by dataset_id
273 self._table = bridgeManager.opaque.register(
274 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType))
275 # Interface to Registry.
276 self._bridge = bridgeManager.register(self.name)
277 except ReadOnlyDatabaseError:
278 # If the database is read only and we just tried and failed to
279 # create a table, it means someone is trying to create a read-only
280 # butler client for an empty repo. That should be okay, as long
281 # as they then try to get any datasets before some other client
282 # creates the table. Chances are they'rejust validating
283 # configuration.
284 pass
286 # Determine whether checksums should be used - default to False
287 self.useChecksum = self.config.get("checksum", False)
289 # Determine whether we can fall back to configuration if a
290 # requested dataset is not known to registry
291 self.trustGetRequest = self.config.get("trust_get_request", False)
293 # Create a cache manager
294 self.cacheManager: AbstractDatastoreCacheManager
295 if "cached" in self.config: 295 ↛ 299line 295 didn't jump to line 299, because the condition on line 295 was never false
296 self.cacheManager = DatastoreCacheManager(self.config["cached"],
297 universe=bridgeManager.universe)
298 else:
299 self.cacheManager = DatastoreDisabledCacheManager("",
300 universe=bridgeManager.universe)
302 # Check existence and create directory structure if necessary
303 if not self.root.exists():
304 if "create" not in self.config or not self.config["create"]: 304 ↛ 305line 304 didn't jump to line 305, because the condition on line 304 was never true
305 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
306 try:
307 self.root.mkdir()
308 except Exception as e:
309 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
310 f" Got error: {e}") from e
312 def __str__(self) -> str:
313 return str(self.root)
315 @property
316 def bridge(self) -> DatastoreRegistryBridge:
317 return self._bridge
319 def _artifact_exists(self, location: Location) -> bool:
320 """Check that an artifact exists in this datastore at the specified
321 location.
323 Parameters
324 ----------
325 location : `Location`
326 Expected location of the artifact associated with this datastore.
328 Returns
329 -------
330 exists : `bool`
331 True if the location can be found, false otherwise.
332 """
333 log.debug("Checking if resource exists: %s", location.uri)
334 return location.uri.exists()
336 def _delete_artifact(self, location: Location) -> None:
337 """Delete the artifact from the datastore.
339 Parameters
340 ----------
341 location : `Location`
342 Location of the artifact associated with this datastore.
343 """
344 if location.pathInStore.isabs(): 344 ↛ 345line 344 didn't jump to line 345, because the condition on line 344 was never true
345 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
347 try:
348 location.uri.remove()
349 except FileNotFoundError:
350 log.debug("File %s did not exist and so could not be deleted.", location.uri)
351 raise
352 except Exception as e:
353 log.critical("Failed to delete file: %s (%s)", location.uri, e)
354 raise
355 log.debug("Successfully deleted file: %s", location.uri)
357 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
358 # Docstring inherited from GenericBaseDatastore
359 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
360 self._table.insert(*records)
362 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
363 # Docstring inherited from GenericBaseDatastore
365 # Look for the dataset_id -- there might be multiple matches
366 # if we have disassembled the dataset.
367 records = self._table.fetch(dataset_id=ref.id)
368 return [StoredFileInfo.from_record(record) for record in records]
370 def _get_stored_records_associated_with_refs(self,
371 refs: Iterable[DatasetIdRef]
372 ) -> Dict[DatasetId, List[StoredFileInfo]]:
373 """Retrieve all records associated with the provided refs.
375 Parameters
376 ----------
377 refs : iterable of `DatasetIdRef`
378 The refs for which records are to be retrieved.
380 Returns
381 -------
382 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
383 The matching records indexed by the ref ID. The number of entries
384 in the dict can be smaller than the number of requested refs.
385 """
386 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
388 # Uniqueness is dataset_id + component so can have multiple records
389 # per ref.
390 records_by_ref = defaultdict(list)
391 for record in records:
392 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
393 return records_by_ref
395 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str,
396 Set[DatasetId]]:
397 """Return paths and associated dataset refs.
399 Parameters
400 ----------
401 paths : `list` of `str` or `ButlerURI`
402 All the paths to include in search.
404 Returns
405 -------
406 mapping : `dict` of [`str`, `set` [`DatasetId`]]
407 Mapping of each path to a set of associated database IDs.
408 """
409 records = self._table.fetch(path=[str(path) for path in paths])
410 result = defaultdict(set)
411 for row in records:
412 result[row["path"]].add(row["dataset_id"])
413 return result
415 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]:
416 """Return all dataset refs associated with the supplied path.
418 Parameters
419 ----------
420 pathInStore : `ButlerURI`
421 Path of interest in the data store.
423 Returns
424 -------
425 ids : `set` of `int`
426 All `DatasetRef` IDs associated with this path.
427 """
428 records = list(self._table.fetch(path=str(pathInStore)))
429 ids = {r["dataset_id"] for r in records}
430 return ids
432 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
433 # Docstring inherited from GenericBaseDatastore
434 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
436 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
437 r"""Find all the `Location`\ s of the requested dataset in the
438 `Datastore` and the associated stored file information.
440 Parameters
441 ----------
442 ref : `DatasetRef`
443 Reference to the required `Dataset`.
445 Returns
446 -------
447 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
448 Location of the dataset within the datastore and
449 stored information about each file and its formatter.
450 """
451 # Get the file information (this will fail if no file)
452 records = self.getStoredItemsInfo(ref)
454 # Use the path to determine the location -- we need to take
455 # into account absolute URIs in the datastore record
456 return [(r.file_location(self.locationFactory), r) for r in records]
458 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
459 """Check that there is only one dataset associated with the
460 specified artifact.
462 Parameters
463 ----------
464 ref : `DatasetRef` or `FakeDatasetRef`
465 Dataset to be removed.
466 location : `Location`
467 The location of the artifact to be removed.
469 Returns
470 -------
471 can_remove : `Bool`
472 True if the artifact can be safely removed.
473 """
474 # Can't ever delete absolute URIs.
475 if location.pathInStore.isabs():
476 return False
478 # Get all entries associated with this path
479 allRefs = self._registered_refs_per_artifact(location.pathInStore)
480 if not allRefs:
481 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
483 # Remove these refs from all the refs and if there is nothing left
484 # then we can delete
485 remainingRefs = allRefs - {ref.id}
487 if remainingRefs:
488 return False
489 return True
491 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
492 StoredFileInfo]]:
493 """Predict the location and related file information of the requested
494 dataset in this datastore.
496 Parameters
497 ----------
498 ref : `DatasetRef`
499 Reference to the required `Dataset`.
501 Returns
502 -------
503 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
504 Expected Location of the dataset within the datastore and
505 placeholder information about each file and its formatter.
507 Notes
508 -----
509 Uses the current configuration to determine how we would expect the
510 datastore files to have been written if we couldn't ask registry.
511 This is safe so long as there has been no change to datastore
512 configuration between writing the dataset and wanting to read it.
513 Will not work for files that have been ingested without using the
514 standard file template or default formatter.
515 """
517 # If we have a component ref we always need to ask the questions
518 # of the composite. If the composite is disassembled this routine
519 # should return all components. If the composite was not
520 # disassembled the composite is what is stored regardless of
521 # component request. Note that if the caller has disassembled
522 # a composite there is no way for this guess to know that
523 # without trying both the composite and component ref and seeing
524 # if there is something at the component Location even without
525 # disassembly being enabled.
526 if ref.datasetType.isComponent():
527 ref = ref.makeCompositeRef()
529 # See if the ref is a composite that should be disassembled
530 doDisassembly = self.composites.shouldBeDisassembled(ref)
532 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
534 if doDisassembly:
535 for component, componentStorage in ref.datasetType.storageClass.components.items():
536 compRef = ref.makeComponentRef(component)
537 location, formatter = self._determine_put_formatter_location(compRef)
538 all_info.append((location, formatter, componentStorage, component))
540 else:
541 # Always use the composite ref if no disassembly
542 location, formatter = self._determine_put_formatter_location(ref)
543 all_info.append((location, formatter, ref.datasetType.storageClass, None))
545 # Convert the list of tuples to have StoredFileInfo as second element
546 return [(location, StoredFileInfo(formatter=formatter,
547 path=location.pathInStore.path,
548 storageClass=storageClass,
549 component=component,
550 checksum=None,
551 file_size=-1))
552 for location, formatter, storageClass, component in all_info]
554 def _prepare_for_get(self, ref: DatasetRef,
555 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
556 """Check parameters for ``get`` and obtain formatter and
557 location.
559 Parameters
560 ----------
561 ref : `DatasetRef`
562 Reference to the required Dataset.
563 parameters : `dict`
564 `StorageClass`-specific parameters that specify, for example,
565 a slice of the dataset to be loaded.
567 Returns
568 -------
569 getInfo : `list` [`DatastoreFileGetInformation`]
570 Parameters needed to retrieve each file.
571 """
572 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
574 # Get file metadata and internal metadata
575 fileLocations = self._get_dataset_locations_info(ref)
576 if not fileLocations:
577 if not self.trustGetRequest:
578 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
579 # Assume the dataset is where we think it should be
580 fileLocations = self._get_expected_dataset_locations_info(ref)
582 # The storage class we want to use eventually
583 refStorageClass = ref.datasetType.storageClass
585 if len(fileLocations) > 1:
586 disassembled = True
588 # If trust is involved it is possible that there will be
589 # components listed here that do not exist in the datastore.
590 # Explicitly check for file artifact existence and filter out any
591 # that are missing.
592 if self.trustGetRequest:
593 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
595 # For now complain only if we have no components at all. One
596 # component is probably a problem but we can punt that to the
597 # assembler.
598 if not fileLocations: 598 ↛ 599line 598 didn't jump to line 599, because the condition on line 598 was never true
599 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
601 else:
602 disassembled = False
604 # Is this a component request?
605 refComponent = ref.datasetType.component()
607 fileGetInfo = []
608 for location, storedFileInfo in fileLocations:
610 # The storage class used to write the file
611 writeStorageClass = storedFileInfo.storageClass
613 # If this has been disassembled we need read to match the write
614 if disassembled:
615 readStorageClass = writeStorageClass
616 else:
617 readStorageClass = refStorageClass
619 formatter = getInstanceOf(storedFileInfo.formatter,
620 FileDescriptor(location, readStorageClass=readStorageClass,
621 storageClass=writeStorageClass, parameters=parameters),
622 ref.dataId)
624 formatterParams, notFormatterParams = formatter.segregateParameters()
626 # Of the remaining parameters, extract the ones supported by
627 # this StorageClass (for components not all will be handled)
628 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
630 # The ref itself could be a component if the dataset was
631 # disassembled by butler, or we disassembled in datastore and
632 # components came from the datastore records
633 component = storedFileInfo.component if storedFileInfo.component else refComponent
635 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
636 assemblerParams, formatterParams,
637 component, readStorageClass))
639 return fileGetInfo
641 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
642 """Check the arguments for ``put`` and obtain formatter and
643 location.
645 Parameters
646 ----------
647 inMemoryDataset : `object`
648 The dataset to store.
649 ref : `DatasetRef`
650 Reference to the associated Dataset.
652 Returns
653 -------
654 location : `Location`
655 The location to write the dataset.
656 formatter : `Formatter`
657 The `Formatter` to use to write the dataset.
659 Raises
660 ------
661 TypeError
662 Supplied object and storage class are inconsistent.
663 DatasetTypeNotSupportedError
664 The associated `DatasetType` is not handled by this datastore.
665 """
666 self._validate_put_parameters(inMemoryDataset, ref)
667 return self._determine_put_formatter_location(ref)
669 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
670 """Calculate the formatter and output location to use for put.
672 Parameters
673 ----------
674 ref : `DatasetRef`
675 Reference to the associated Dataset.
677 Returns
678 -------
679 location : `Location`
680 The location to write the dataset.
681 formatter : `Formatter`
682 The `Formatter` to use to write the dataset.
683 """
684 # Work out output file name
685 try:
686 template = self.templates.getTemplate(ref)
687 except KeyError as e:
688 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
690 # Validate the template to protect against filenames from different
691 # dataIds returning the same and causing overwrite confusion.
692 template.validateTemplate(ref)
694 location = self.locationFactory.fromPath(template.format(ref))
696 # Get the formatter based on the storage class
697 storageClass = ref.datasetType.storageClass
698 try:
699 formatter = self.formatterFactory.getFormatter(ref,
700 FileDescriptor(location,
701 storageClass=storageClass),
702 ref.dataId)
703 except KeyError as e:
704 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
705 f"{self.name}") from e
707 # Now that we know the formatter, update the location
708 location = formatter.makeUpdatedLocation(location)
710 return location, formatter
712 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
713 # Docstring inherited from base class
714 if transfer != "auto":
715 return transfer
717 # See if the paths are within the datastore or not
718 inside = [self._pathInStore(d.path) is not None for d in datasets]
720 if all(inside):
721 transfer = None
722 elif not any(inside): 722 ↛ 726line 722 didn't jump to line 726, because the condition on line 722 was never false
723 # Allow ButlerURI to use its own knowledge
724 transfer = "auto"
725 else:
726 raise ValueError("Some datasets are inside the datastore and some are outside."
727 " Please use an explicit transfer mode and not 'auto'.")
729 return transfer
731 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
732 """Return path relative to datastore root
734 Parameters
735 ----------
736 path : `str` or `ButlerURI`
737 Path to dataset. Can be absolute URI. If relative assumed to
738 be relative to the datastore. Returns path in datastore
739 or raises an exception if the path it outside.
741 Returns
742 -------
743 inStore : `str`
744 Path relative to datastore root. Returns `None` if the file is
745 outside the root.
746 """
747 # Relative path will always be relative to datastore
748 pathUri = ButlerURI(path, forceAbsolute=False)
749 return pathUri.relative_to(self.root)
751 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
752 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
753 """Standardize the path of a to-be-ingested file.
755 Parameters
756 ----------
757 path : `str` or `ButlerURI`
758 Path of a file to be ingested.
759 transfer : `str`, optional
760 How (and whether) the dataset should be added to the datastore.
761 See `ingest` for details of transfer modes.
762 This implementation is provided only so
763 `NotImplementedError` can be raised if the mode is not supported;
764 actual transfers are deferred to `_extractIngestInfo`.
766 Returns
767 -------
768 path : `str` or `ButlerURI`
769 New path in what the datastore considers standard form. If an
770 absolute URI was given that will be returned unchanged.
772 Notes
773 -----
774 Subclasses of `FileDatastore` can implement this method instead
775 of `_prepIngest`. It should not modify the data repository or given
776 file in any way.
778 Raises
779 ------
780 NotImplementedError
781 Raised if the datastore does not support the given transfer mode
782 (including the case where ingest is not supported at all).
783 FileNotFoundError
784 Raised if one of the given files does not exist.
785 """
786 if transfer not in (None, "direct") + self.root.transferModes: 786 ↛ 787line 786 didn't jump to line 787, because the condition on line 786 was never true
787 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
789 # A relative URI indicates relative to datastore root
790 srcUri = ButlerURI(path, forceAbsolute=False)
791 if not srcUri.isabs():
792 srcUri = self.root.join(path)
794 if not srcUri.exists():
795 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
796 f"are assumed to be relative to {self.root} unless they are absolute.")
798 if transfer is None:
799 relpath = srcUri.relative_to(self.root)
800 if not relpath:
801 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
802 f"within datastore ({self.root})")
804 # Return the relative path within the datastore for internal
805 # transfer
806 path = relpath
808 return path
810 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
811 formatter: Union[Formatter, Type[Formatter]],
812 transfer: Optional[str] = None) -> StoredFileInfo:
813 """Relocate (if necessary) and extract `StoredFileInfo` from a
814 to-be-ingested file.
816 Parameters
817 ----------
818 path : `str` or `ButlerURI`
819 URI or path of a file to be ingested.
820 ref : `DatasetRef`
821 Reference for the dataset being ingested. Guaranteed to have
822 ``dataset_id not None`.
823 formatter : `type` or `Formatter`
824 `Formatter` subclass to use for this dataset or an instance.
825 transfer : `str`, optional
826 How (and whether) the dataset should be added to the datastore.
827 See `ingest` for details of transfer modes.
829 Returns
830 -------
831 info : `StoredFileInfo`
832 Internal datastore record for this file. This will be inserted by
833 the caller; the `_extractIngestInfo` is only resposible for
834 creating and populating the struct.
836 Raises
837 ------
838 FileNotFoundError
839 Raised if one of the given files does not exist.
840 FileExistsError
841 Raised if transfer is not `None` but the (internal) location the
842 file would be moved to is already occupied.
843 """
844 if self._transaction is None: 844 ↛ 845line 844 didn't jump to line 845, because the condition on line 844 was never true
845 raise RuntimeError("Ingest called without transaction enabled")
847 # Create URI of the source path, do not need to force a relative
848 # path to absolute.
849 srcUri = ButlerURI(path, forceAbsolute=False)
851 # Track whether we have read the size of the source yet
852 have_sized = False
854 tgtLocation: Optional[Location]
855 if transfer is None:
856 # A relative path is assumed to be relative to the datastore
857 # in this context
858 if not srcUri.isabs():
859 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
860 else:
861 # Work out the path in the datastore from an absolute URI
862 # This is required to be within the datastore.
863 pathInStore = srcUri.relative_to(self.root)
864 if pathInStore is None: 864 ↛ 865line 864 didn't jump to line 865, because the condition on line 864 was never true
865 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
866 f"not within datastore {self.root}")
867 tgtLocation = self.locationFactory.fromPath(pathInStore)
868 elif transfer == "direct": 868 ↛ 873line 868 didn't jump to line 873, because the condition on line 868 was never true
869 # Want to store the full URI to the resource directly in
870 # datastore. This is useful for referring to permanent archive
871 # storage for raw data.
872 # Trust that people know what they are doing.
873 tgtLocation = None
874 else:
875 # Work out the name we want this ingested file to have
876 # inside the datastore
877 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
878 if not tgtLocation.uri.dirname().exists():
879 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
880 tgtLocation.uri.dirname().mkdir()
882 # if we are transferring from a local file to a remote location
883 # it may be more efficient to get the size and checksum of the
884 # local file rather than the transferred one
885 if not srcUri.scheme or srcUri.scheme == "file": 885 ↛ 891line 885 didn't jump to line 891, because the condition on line 885 was never false
886 size = srcUri.size()
887 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
888 have_sized = True
890 # transfer the resource to the destination
891 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
893 if tgtLocation is None: 893 ↛ 895line 893 didn't jump to line 895, because the condition on line 893 was never true
894 # This means we are using direct mode
895 targetUri = srcUri
896 targetPath = str(srcUri)
897 else:
898 targetUri = tgtLocation.uri
899 targetPath = tgtLocation.pathInStore.path
901 # the file should exist in the datastore now
902 if not have_sized:
903 size = targetUri.size()
904 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
906 return StoredFileInfo(formatter=formatter, path=targetPath,
907 storageClass=ref.datasetType.storageClass,
908 component=ref.datasetType.component(),
909 file_size=size, checksum=checksum)
911 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
912 # Docstring inherited from Datastore._prepIngest.
913 filtered = []
914 for dataset in datasets:
915 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
916 if not acceptable:
917 continue
918 else:
919 dataset.refs = acceptable
920 if dataset.formatter is None:
921 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
922 else:
923 assert isinstance(dataset.formatter, (type, str))
924 dataset.formatter = getClassOf(dataset.formatter)
925 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
926 filtered.append(dataset)
927 return _IngestPrepData(filtered)
929 @transactional
930 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
931 # Docstring inherited from Datastore._finishIngest.
932 refsAndInfos = []
933 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
934 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
935 # Do ingest as if the first dataset ref is associated with the file
936 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
937 transfer=transfer)
938 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
939 self._register_datasets(refsAndInfos)
941 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
942 formatter: Union[Formatter, Type[Formatter]]) -> Location:
943 """Given a source URI and a DatasetRef, determine the name the
944 dataset will have inside datastore.
946 Parameters
947 ----------
948 srcUri : `ButlerURI`
949 URI to the source dataset file.
950 ref : `DatasetRef`
951 Ref associated with the newly-ingested dataset artifact. This
952 is used to determine the name within the datastore.
953 formatter : `Formatter` or Formatter class.
954 Formatter to use for validation. Can be a class or an instance.
956 Returns
957 -------
958 location : `Location`
959 Target location for the newly-ingested dataset.
960 """
961 # Ingesting a file from outside the datastore.
962 # This involves a new name.
963 template = self.templates.getTemplate(ref)
964 location = self.locationFactory.fromPath(template.format(ref))
966 # Get the extension
967 ext = srcUri.getExtension()
969 # Update the destination to include that extension
970 location.updateExtension(ext)
972 # Ask the formatter to validate this extension
973 formatter.validateExtension(location)
975 return location
977 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
978 """Write out in memory dataset to datastore.
980 Parameters
981 ----------
982 inMemoryDataset : `object`
983 Dataset to write to datastore.
984 ref : `DatasetRef`
985 Registry information associated with this dataset.
987 Returns
988 -------
989 info : `StoredFileInfo`
990 Information describin the artifact written to the datastore.
991 """
992 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
993 uri = location.uri
995 if not uri.dirname().exists():
996 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
997 uri.dirname().mkdir()
999 if self._transaction is None: 999 ↛ 1000line 999 didn't jump to line 1000, because the condition on line 999 was never true
1000 raise RuntimeError("Attempting to write artifact without transaction enabled")
1002 def _removeFileExists(uri: ButlerURI) -> None:
1003 """Remove a file and do not complain if it is not there.
1005 This is important since a formatter might fail before the file
1006 is written and we should not confuse people by writing spurious
1007 error messages to the log.
1008 """
1009 try:
1010 uri.remove()
1011 except FileNotFoundError:
1012 pass
1014 # Register a callback to try to delete the uploaded data if
1015 # something fails below
1016 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1018 # For a local file, simply use the formatter directly
1019 if uri.isLocal:
1020 try:
1021 formatter.write(inMemoryDataset)
1022 except Exception as e:
1023 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} "
1024 f"to location {uri}") from e
1025 log.debug("Successfully wrote python object to local file at %s", uri)
1026 else:
1027 # This is a remote URI, so first try bytes and write directly else
1028 # fallback to a temporary file
1029 try:
1030 serializedDataset = formatter.toBytes(inMemoryDataset)
1031 except NotImplementedError: 1031 ↛ 1050line 1031 didn't jump to line 1050
1032 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
1033 # Need to configure the formatter to write to a different
1034 # location and that needs us to overwrite internals
1035 tmpLocation = Location(*os.path.split(tmpFile.name))
1036 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
1037 with formatter._updateLocation(tmpLocation):
1038 try:
1039 formatter.write(inMemoryDataset)
1040 except Exception as e:
1041 raise RuntimeError(f"Failed to serialize dataset {ref} of type"
1042 f" {type(inMemoryDataset)} to "
1043 f"temporary location {tmpLocation.uri}") from e
1044 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
1046 # Cache if required
1047 self.cacheManager.move_to_cache(tmpLocation.uri, ref)
1049 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1050 except Exception as e:
1051 raise RuntimeError(f"Failed to serialize dataset {ref} to bytes.") from e
1052 else:
1053 log.debug("Writing bytes directly to %s", uri)
1054 uri.write(serializedDataset, overwrite=True)
1055 log.debug("Successfully wrote bytes directly to %s", uri)
1057 # URI is needed to resolve what ingest case are we dealing with
1058 return self._extractIngestInfo(uri, ref, formatter=formatter)
1060 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1061 ref: DatasetRef, isComponent: bool = False) -> Any:
1062 """Read the artifact from datastore into in memory object.
1064 Parameters
1065 ----------
1066 getInfo : `DatastoreFileGetInformation`
1067 Information about the artifact within the datastore.
1068 ref : `DatasetRef`
1069 The registry information associated with this artifact.
1070 isComponent : `bool`
1071 Flag to indicate if a component is being read from this artifact.
1073 Returns
1074 -------
1075 inMemoryDataset : `object`
1076 The artifact as a python object.
1077 """
1078 location = getInfo.location
1079 uri = location.uri
1080 log.debug("Accessing data from %s", uri)
1082 # Cannot recalculate checksum but can compare size as a quick check
1083 # Do not do this if the size is negative since that indicates
1084 # we do not know.
1085 recorded_size = getInfo.info.file_size
1086 resource_size = uri.size()
1087 if recorded_size >= 0 and resource_size != recorded_size: 1087 ↛ 1088line 1087 didn't jump to line 1088, because the condition on line 1087 was never true
1088 raise RuntimeError("Integrity failure in Datastore. "
1089 f"Size of file {uri} ({resource_size}) "
1090 f"does not match size recorded in registry of {recorded_size}")
1092 # For the general case we have choices for how to proceed.
1093 # 1. Always use a local file (downloading the remote resource to a
1094 # temporary file if needed).
1095 # 2. Use a threshold size and read into memory and use bytes.
1096 # Use both for now with an arbitrary hand off size.
1097 # This allows small datasets to be downloaded from remote object
1098 # stores without requiring a temporary file.
1100 formatter = getInfo.formatter
1101 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1102 if resource_size <= nbytes_max and formatter.can_read_bytes():
1103 serializedDataset = uri.read()
1104 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1105 f"component {getInfo.component}" if isComponent else "",
1106 len(serializedDataset), uri, formatter.name())
1107 try:
1108 result = formatter.fromBytes(serializedDataset,
1109 component=getInfo.component if isComponent else None)
1110 except Exception as e:
1111 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1112 f" ({ref.datasetType.name} from {uri}): {e}") from e
1113 else:
1114 # Read from file.
1116 # Have to update the Location associated with the formatter
1117 # because formatter.read does not allow an override.
1118 # This could be improved.
1119 location_updated = False
1120 msg = ""
1122 # First check in cache for local version.
1123 # The cache will only be relevant for remote resources.
1124 if not uri.isLocal:
1125 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension())
1126 if cached_file is not None: 1126 ↛ 1127line 1126 didn't jump to line 1127, because the condition on line 1126 was never true
1127 msg = f"(via cache read of remote file {uri})"
1128 uri = cached_file
1129 location_updated = True
1131 with uri.as_local() as local_uri:
1133 # URI was remote and file was downloaded
1134 if uri != local_uri:
1135 cache_msg = ""
1136 location_updated = True
1138 # Cache the downloaded file if needed.
1139 cached_uri = self.cacheManager.move_to_cache(local_uri, ref)
1140 if cached_uri is not None: 1140 ↛ 1141line 1140 didn't jump to line 1141, because the condition on line 1140 was never true
1141 local_uri = cached_uri
1142 cache_msg = " and cached"
1144 msg = f"(via download to local file{cache_msg})"
1146 # Calculate the (possibly) new location for the formatter
1147 # to use.
1148 newLocation = Location(*local_uri.split()) if location_updated else None
1150 log.debug("Reading%s from location %s %s with formatter %s",
1151 f" component {getInfo.component}" if isComponent else "",
1152 uri, msg, formatter.name())
1153 try:
1154 with formatter._updateLocation(newLocation):
1155 result = formatter.read(component=getInfo.component if isComponent else None)
1156 except Exception as e:
1157 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1158 f" ({ref.datasetType.name} from {uri}): {e}") from e
1160 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1161 isComponent=isComponent)
1163 def knows(self, ref: DatasetRef) -> bool:
1164 """Check if the dataset is known to the datastore.
1166 Does not check for existence of any artifact.
1168 Parameters
1169 ----------
1170 ref : `DatasetRef`
1171 Reference to the required dataset.
1173 Returns
1174 -------
1175 exists : `bool`
1176 `True` if the dataset is known to the datastore.
1177 """
1178 fileLocations = self._get_dataset_locations_info(ref)
1179 if fileLocations:
1180 return True
1181 return False
1183 def exists(self, ref: DatasetRef) -> bool:
1184 """Check if the dataset exists in the datastore.
1186 Parameters
1187 ----------
1188 ref : `DatasetRef`
1189 Reference to the required dataset.
1191 Returns
1192 -------
1193 exists : `bool`
1194 `True` if the entity exists in the `Datastore`.
1195 """
1196 fileLocations = self._get_dataset_locations_info(ref)
1198 # if we are being asked to trust that registry might not be correct
1199 # we ask for the expected locations and check them explicitly
1200 if not fileLocations:
1201 if not self.trustGetRequest:
1202 return False
1203 fileLocations = self._get_expected_dataset_locations_info(ref)
1204 for location, _ in fileLocations:
1205 if not self._artifact_exists(location):
1206 return False
1208 return True
1210 def getURIs(self, ref: DatasetRef,
1211 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1212 """Return URIs associated with dataset.
1214 Parameters
1215 ----------
1216 ref : `DatasetRef`
1217 Reference to the required dataset.
1218 predict : `bool`, optional
1219 If the datastore does not know about the dataset, should it
1220 return a predicted URI or not?
1222 Returns
1223 -------
1224 primary : `ButlerURI`
1225 The URI to the primary artifact associated with this dataset.
1226 If the dataset was disassembled within the datastore this
1227 may be `None`.
1228 components : `dict`
1229 URIs to any components associated with the dataset artifact.
1230 Can be empty if there are no components.
1231 """
1233 primary: Optional[ButlerURI] = None
1234 components: Dict[str, ButlerURI] = {}
1236 # if this has never been written then we have to guess
1237 if not self.exists(ref):
1238 if not predict:
1239 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1241 doDisassembly = self.composites.shouldBeDisassembled(ref)
1243 if doDisassembly:
1245 for component, componentStorage in ref.datasetType.storageClass.components.items():
1246 compRef = ref.makeComponentRef(component)
1247 compLocation, _ = self._determine_put_formatter_location(compRef)
1249 # Add a URI fragment to indicate this is a guess
1250 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1252 else:
1254 location, _ = self._determine_put_formatter_location(ref)
1256 # Add a URI fragment to indicate this is a guess
1257 primary = ButlerURI(location.uri.geturl() + "#predicted")
1259 return primary, components
1261 # If this is a ref that we have written we can get the path.
1262 # Get file metadata and internal metadata
1263 fileLocations = self._get_dataset_locations_info(ref)
1265 guessing = False
1266 if not fileLocations:
1267 if not self.trustGetRequest: 1267 ↛ 1268line 1267 didn't jump to line 1268, because the condition on line 1267 was never true
1268 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1269 fileLocations = self._get_expected_dataset_locations_info(ref)
1270 guessing = True
1272 if len(fileLocations) == 1:
1273 # No disassembly so this is the primary URI
1274 uri = fileLocations[0][0].uri
1275 if guessing and not uri.exists(): 1275 ↛ 1276line 1275 didn't jump to line 1276, because the condition on line 1275 was never true
1276 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1277 primary = uri
1279 else:
1280 for location, storedFileInfo in fileLocations:
1281 if storedFileInfo.component is None: 1281 ↛ 1282line 1281 didn't jump to line 1282, because the condition on line 1281 was never true
1282 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1283 uri = location.uri
1284 if guessing and not uri.exists(): 1284 ↛ 1285line 1284 didn't jump to line 1285, because the condition on line 1284 was never true
1285 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1286 components[storedFileInfo.component] = uri
1288 return primary, components
1290 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1291 """URI to the Dataset.
1293 Parameters
1294 ----------
1295 ref : `DatasetRef`
1296 Reference to the required Dataset.
1297 predict : `bool`
1298 If `True`, allow URIs to be returned of datasets that have not
1299 been written.
1301 Returns
1302 -------
1303 uri : `str`
1304 URI pointing to the dataset within the datastore. If the
1305 dataset does not exist in the datastore, and if ``predict`` is
1306 `True`, the URI will be a prediction and will include a URI
1307 fragment "#predicted".
1308 If the datastore does not have entities that relate well
1309 to the concept of a URI the returned URI will be
1310 descriptive. The returned URI is not guaranteed to be obtainable.
1312 Raises
1313 ------
1314 FileNotFoundError
1315 Raised if a URI has been requested for a dataset that does not
1316 exist and guessing is not allowed.
1317 RuntimeError
1318 Raised if a request is made for a single URI but multiple URIs
1319 are associated with this dataset.
1321 Notes
1322 -----
1323 When a predicted URI is requested an attempt will be made to form
1324 a reasonable URI based on file templates and the expected formatter.
1325 """
1326 primary, components = self.getURIs(ref, predict)
1327 if primary is None or components: 1327 ↛ 1328line 1327 didn't jump to line 1328, because the condition on line 1327 was never true
1328 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1329 "Use Dataastore.getURIs() instead.")
1330 return primary
1332 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1333 destination: ButlerURI, transfer: str = "auto",
1334 preserve_path: bool = True,
1335 overwrite: bool = False) -> List[ButlerURI]:
1336 """Retrieve the file artifacts associated with the supplied refs.
1338 Parameters
1339 ----------
1340 refs : iterable of `DatasetRef`
1341 The datasets for which file artifacts are to be retrieved.
1342 A single ref can result in multiple files. The refs must
1343 be resolved.
1344 destination : `ButlerURI`
1345 Location to write the file artifacts.
1346 transfer : `str`, optional
1347 Method to use to transfer the artifacts. Must be one of the options
1348 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1349 preserve_path : `bool`, optional
1350 If `True` the full path of the file artifact within the datastore
1351 is preserved. If `False` the final file component of the path
1352 is used.
1353 overwrite : `bool`, optional
1354 If `True` allow transfers to overwrite existing files at the
1355 destination.
1357 Returns
1358 -------
1359 targets : `list` of `ButlerURI`
1360 URIs of file artifacts in destination location. Order is not
1361 preserved.
1362 """
1363 if not destination.isdir(): 1363 ↛ 1364line 1363 didn't jump to line 1364, because the condition on line 1363 was never true
1364 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1366 if transfer == "move":
1367 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1369 # Source -> Destination
1370 # This also helps filter out duplicate DatasetRef in the request
1371 # that will map to the same underlying file transfer.
1372 to_transfer: Dict[ButlerURI, ButlerURI] = {}
1374 for ref in refs:
1375 locations = self._get_dataset_locations_info(ref)
1376 for location, _ in locations:
1377 source_uri = location.uri
1378 target_path: Union[str, ButlerURI]
1379 if preserve_path:
1380 target_path = location.pathInStore
1381 if target_path.isabs(): 1381 ↛ 1384line 1381 didn't jump to line 1384, because the condition on line 1381 was never true
1382 # This is an absolute path to an external file.
1383 # Use the full path.
1384 target_path = target_path.relativeToPathRoot
1385 else:
1386 target_path = source_uri.basename()
1387 target_uri = destination.join(target_path)
1388 to_transfer[source_uri] = target_uri
1390 # In theory can now parallelize the transfer
1391 log.debug("Number of artifacts to transfer to %s: %d",
1392 str(destination), len(to_transfer))
1393 for source_uri, target_uri in to_transfer.items():
1394 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1396 return list(to_transfer.values())
1398 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1399 """Load an InMemoryDataset from the store.
1401 Parameters
1402 ----------
1403 ref : `DatasetRef`
1404 Reference to the required Dataset.
1405 parameters : `dict`
1406 `StorageClass`-specific parameters that specify, for example,
1407 a slice of the dataset to be loaded.
1409 Returns
1410 -------
1411 inMemoryDataset : `object`
1412 Requested dataset or slice thereof as an InMemoryDataset.
1414 Raises
1415 ------
1416 FileNotFoundError
1417 Requested dataset can not be retrieved.
1418 TypeError
1419 Return value from formatter has unexpected type.
1420 ValueError
1421 Formatter failed to process the dataset.
1422 """
1423 allGetInfo = self._prepare_for_get(ref, parameters)
1424 refComponent = ref.datasetType.component()
1426 # Supplied storage class for the component being read
1427 refStorageClass = ref.datasetType.storageClass
1429 # Create mapping from component name to related info
1430 allComponents = {i.component: i for i in allGetInfo}
1432 # By definition the dataset is disassembled if we have more
1433 # than one record for it.
1434 isDisassembled = len(allGetInfo) > 1
1436 # Look for the special case where we are disassembled but the
1437 # component is a derived component that was not written during
1438 # disassembly. For this scenario we need to check that the
1439 # component requested is listed as a derived component for the
1440 # composite storage class
1441 isDisassembledReadOnlyComponent = False
1442 if isDisassembled and refComponent:
1443 # The composite storage class should be accessible through
1444 # the component dataset type
1445 compositeStorageClass = ref.datasetType.parentStorageClass
1447 # In the unlikely scenario where the composite storage
1448 # class is not known, we can only assume that this is a
1449 # normal component. If that assumption is wrong then the
1450 # branch below that reads a persisted component will fail
1451 # so there is no need to complain here.
1452 if compositeStorageClass is not None: 1452 ↛ 1455line 1452 didn't jump to line 1455, because the condition on line 1452 was never false
1453 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1455 if isDisassembled and not refComponent:
1456 # This was a disassembled dataset spread over multiple files
1457 # and we need to put them all back together again.
1458 # Read into memory and then assemble
1460 # Check that the supplied parameters are suitable for the type read
1461 refStorageClass.validateParameters(parameters)
1463 # We want to keep track of all the parameters that were not used
1464 # by formatters. We assume that if any of the component formatters
1465 # use a parameter that we do not need to apply it again in the
1466 # assembler.
1467 usedParams = set()
1469 components: Dict[str, Any] = {}
1470 for getInfo in allGetInfo:
1471 # assemblerParams are parameters not understood by the
1472 # associated formatter.
1473 usedParams.update(set(getInfo.formatterParams))
1475 component = getInfo.component
1477 if component is None: 1477 ↛ 1478line 1477 didn't jump to line 1478, because the condition on line 1477 was never true
1478 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1480 # We do not want the formatter to think it's reading
1481 # a component though because it is really reading a
1482 # standalone dataset -- always tell reader it is not a
1483 # component.
1484 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1486 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1488 # Any unused parameters will have to be passed to the assembler
1489 if parameters:
1490 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1491 else:
1492 unusedParams = {}
1494 # Process parameters
1495 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1496 parameters=unusedParams)
1498 elif isDisassembledReadOnlyComponent:
1500 compositeStorageClass = ref.datasetType.parentStorageClass
1501 if compositeStorageClass is None: 1501 ↛ 1502line 1501 didn't jump to line 1502, because the condition on line 1501 was never true
1502 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1503 "no composite storage class is available.")
1505 if refComponent is None: 1505 ↛ 1507line 1505 didn't jump to line 1507, because the condition on line 1505 was never true
1506 # Mainly for mypy
1507 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1509 # Assume that every derived component can be calculated by
1510 # forwarding the request to a single read/write component.
1511 # Rather than guessing which rw component is the right one by
1512 # scanning each for a derived component of the same name,
1513 # we ask the storage class delegate directly which one is best to
1514 # use.
1515 compositeDelegate = compositeStorageClass.delegate()
1516 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1517 set(allComponents))
1519 # Select the relevant component
1520 rwInfo = allComponents[forwardedComponent]
1522 # For now assume that read parameters are validated against
1523 # the real component and not the requested component
1524 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1525 forwardedStorageClass.validateParameters(parameters)
1527 # Unfortunately the FileDescriptor inside the formatter will have
1528 # the wrong write storage class so we need to create a new one
1529 # given the immutability constraint.
1530 writeStorageClass = rwInfo.info.storageClass
1532 # We may need to put some thought into parameters for read
1533 # components but for now forward them on as is
1534 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1535 readStorageClass=refStorageClass,
1536 storageClass=writeStorageClass,
1537 parameters=parameters),
1538 ref.dataId)
1540 # The assembler can not receive any parameter requests for a
1541 # derived component at this time since the assembler will
1542 # see the storage class of the derived component and those
1543 # parameters will have to be handled by the formatter on the
1544 # forwarded storage class.
1545 assemblerParams: Dict[str, Any] = {}
1547 # Need to created a new info that specifies the derived
1548 # component and associated storage class
1549 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1550 rwInfo.info, assemblerParams, {},
1551 refComponent, refStorageClass)
1553 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1555 else:
1556 # Single file request or component from that composite file
1557 for lookup in (refComponent, None): 1557 ↛ 1562line 1557 didn't jump to line 1562, because the loop on line 1557 didn't complete
1558 if lookup in allComponents: 1558 ↛ 1557line 1558 didn't jump to line 1557, because the condition on line 1558 was never false
1559 getInfo = allComponents[lookup]
1560 break
1561 else:
1562 raise FileNotFoundError(f"Component {refComponent} not found "
1563 f"for ref {ref} in datastore {self.name}")
1565 # Do not need the component itself if already disassembled
1566 if isDisassembled:
1567 isComponent = False
1568 else:
1569 isComponent = getInfo.component is not None
1571 # For a disassembled component we can validate parametersagainst
1572 # the component storage class directly
1573 if isDisassembled:
1574 refStorageClass.validateParameters(parameters)
1575 else:
1576 # For an assembled composite this could be a derived
1577 # component derived from a real component. The validity
1578 # of the parameters is not clear. For now validate against
1579 # the composite storage class
1580 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1582 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
1584 @transactional
1585 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1586 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1588 Parameters
1589 ----------
1590 inMemoryDataset : `object`
1591 The dataset to store.
1592 ref : `DatasetRef`
1593 Reference to the associated Dataset.
1595 Raises
1596 ------
1597 TypeError
1598 Supplied object and storage class are inconsistent.
1599 DatasetTypeNotSupportedError
1600 The associated `DatasetType` is not handled by this datastore.
1602 Notes
1603 -----
1604 If the datastore is configured to reject certain dataset types it
1605 is possible that the put will fail and raise a
1606 `DatasetTypeNotSupportedError`. The main use case for this is to
1607 allow `ChainedDatastore` to put to multiple datastores without
1608 requiring that every datastore accepts the dataset.
1609 """
1611 doDisassembly = self.composites.shouldBeDisassembled(ref)
1612 # doDisassembly = True
1614 artifacts = []
1615 if doDisassembly:
1616 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1617 for component, componentInfo in components.items():
1618 # Don't recurse because we want to take advantage of
1619 # bulk insert -- need a new DatasetRef that refers to the
1620 # same dataset_id but has the component DatasetType
1621 # DatasetType does not refer to the types of components
1622 # So we construct one ourselves.
1623 compRef = ref.makeComponentRef(component)
1624 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1625 artifacts.append((compRef, storedInfo))
1626 else:
1627 # Write the entire thing out
1628 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1629 artifacts.append((ref, storedInfo))
1631 self._register_datasets(artifacts)
1633 @transactional
1634 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
1635 # Get file metadata and internal metadata
1636 if not isinstance(ref, DatasetRef):
1637 log.debug("Doing multi-dataset trash in datastore %s", self.name)
1638 # Assumed to be an iterable of refs so bulk mode enabled.
1639 try:
1640 self.bridge.moveToTrash(ref)
1641 except Exception as e:
1642 if ignore_errors:
1643 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
1644 else:
1645 raise
1646 return
1648 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
1650 fileLocations = self._get_dataset_locations_info(ref)
1652 if not fileLocations:
1653 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1654 if ignore_errors: 1654 ↛ 1655line 1654 didn't jump to line 1655, because the condition on line 1654 was never true
1655 log.warning(err_msg)
1656 return
1657 else:
1658 raise FileNotFoundError(err_msg)
1660 for location, storedFileInfo in fileLocations:
1661 if not self._artifact_exists(location): 1661 ↛ 1662line 1661 didn't jump to line 1662, because the condition on line 1661 was never true
1662 err_msg = f"Dataset is known to datastore {self.name} but " \
1663 f"associated artifact ({location.uri}) is missing"
1664 if ignore_errors:
1665 log.warning(err_msg)
1666 return
1667 else:
1668 raise FileNotFoundError(err_msg)
1670 # Mark dataset as trashed
1671 try:
1672 self.bridge.moveToTrash([ref])
1673 except Exception as e:
1674 if ignore_errors:
1675 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1676 f"but encountered an error: {e}")
1677 pass
1678 else:
1679 raise
1681 @transactional
1682 def emptyTrash(self, ignore_errors: bool = True) -> None:
1683 """Remove all datasets from the trash.
1685 Parameters
1686 ----------
1687 ignore_errors : `bool`
1688 If `True` return without error even if something went wrong.
1689 Problems could occur if another process is simultaneously trying
1690 to delete.
1691 """
1692 log.debug("Emptying trash in datastore %s", self.name)
1694 # Context manager will empty trash iff we finish it without raising.
1695 # It will also automatically delete the relevant rows from the
1696 # trash table and the records table.
1697 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo,
1698 record_column="path") as trash_data:
1699 # Removing the artifacts themselves requires that the files are
1700 # not also associated with refs that are not to be trashed.
1701 # Therefore need to do a query with the file paths themselves
1702 # and return all the refs associated with them. Can only delete
1703 # a file if the refs to be trashed are the only refs associated
1704 # with the file.
1705 # This requires multiple copies of the trashed items
1706 trashed, artifacts_to_keep = trash_data
1708 if artifacts_to_keep is None:
1709 # The bridge is not helping us so have to work it out
1710 # ourselves. This is not going to be as efficient.
1711 trashed = list(trashed)
1713 # The instance check is for mypy since up to this point it
1714 # does not know the type of info.
1715 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed
1716 if isinstance(info, StoredFileInfo)])
1718 for ref, info in trashed:
1720 # Mypy needs to know this is not the base class
1721 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
1723 # Check for mypy
1724 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
1726 path_map[info.path].remove(ref.id)
1727 if not path_map[info.path]: 1727 ↛ 1718line 1727 didn't jump to line 1718, because the condition on line 1727 was never false
1728 del path_map[info.path]
1730 artifacts_to_keep = set(path_map)
1732 for ref, info in trashed:
1734 # Should not happen for this implementation but need
1735 # to keep mypy happy.
1736 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
1738 # Mypy needs to know this is not the base class
1739 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
1741 # Check for mypy
1742 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
1744 if info.path in artifacts_to_keep:
1745 # This is a multi-dataset artifact and we are not
1746 # removing all associated refs.
1747 continue
1749 # Only trashed refs still known to datastore will be returned.
1750 location = info.file_location(self.locationFactory)
1752 # Point of no return for this artifact
1753 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1754 try:
1755 self._delete_artifact(location)
1756 except FileNotFoundError:
1757 # If the file itself has been deleted there is nothing
1758 # we can do about it. It is possible that trash has
1759 # been run in parallel in another process or someone
1760 # decided to delete the file. It is unlikely to come
1761 # back and so we should still continue with the removal
1762 # of the entry from the trash table. It is also possible
1763 # we removed it in a previous iteration if it was
1764 # a multi-dataset artifact. The delete artifact method
1765 # will log a debug message in this scenario.
1766 # Distinguishing file missing before trash started and
1767 # file already removed previously as part of this trash
1768 # is not worth the distinction with regards to potential
1769 # memory cost.
1770 pass
1771 except Exception as e:
1772 if ignore_errors:
1773 # Use a debug message here even though it's not
1774 # a good situation. In some cases this can be
1775 # caused by a race between user A and user B
1776 # and neither of them has permissions for the
1777 # other's files. Butler does not know about users
1778 # and trash has no idea what collections these
1779 # files were in (without guessing from a path).
1780 log.debug("Encountered error removing artifact %s from datastore %s: %s",
1781 location.uri, self.name, e)
1782 else:
1783 raise
1785 @transactional
1786 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef],
1787 local_refs: Optional[Iterable[DatasetRef]] = None,
1788 transfer: str = "auto") -> None:
1789 # Docstring inherited
1790 if type(self) is not type(source_datastore): 1790 ↛ 1791line 1790 didn't jump to line 1791, because the condition on line 1790 was never true
1791 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the "
1792 f"source datastore ({type(source_datastore)}).")
1794 # Be explicit for mypy
1795 if not isinstance(source_datastore, FileDatastore): 1795 ↛ 1796line 1795 didn't jump to line 1796, because the condition on line 1795 was never true
1796 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not"
1797 f" {type(source_datastore)}")
1799 # Stop early if "direct" transfer mode is requested. That would
1800 # require that the URI inside the source datastore should be stored
1801 # directly in the target datastore, which seems unlikely to be useful
1802 # since at any moment the source datastore could delete the file.
1803 if transfer == "direct": 1803 ↛ 1804line 1803 didn't jump to line 1804, because the condition on line 1803 was never true
1804 raise ValueError("Can not transfer from a source datastore using direct mode since"
1805 " those files are controlled by the other datastore.")
1807 # We will go through the list multiple times so must convert
1808 # generators to lists.
1809 refs = list(refs)
1811 if local_refs is None: 1811 ↛ 1812line 1811 didn't jump to line 1812, because the condition on line 1811 was never true
1812 local_refs = refs
1813 else:
1814 local_refs = list(local_refs)
1816 # In order to handle disassembled composites the code works
1817 # at the records level since it can assume that internal APIs
1818 # can be used.
1819 # - If the record already exists in the destination this is assumed
1820 # to be okay.
1821 # - If there is no record but the source and destination URIs are
1822 # identical no transfer is done but the record is added.
1823 # - If the source record refers to an absolute URI currently assume
1824 # that that URI should remain absolute and will be visible to the
1825 # destination butler. May need to have a flag to indicate whether
1826 # the dataset should be transferred. This will only happen if
1827 # the detached Butler has had a local ingest.
1829 # What we really want is all the records in the source datastore
1830 # associated with these refs. Or derived ones if they don't exist
1831 # in the source.
1832 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
1834 # The source dataset_ids are the keys in these records
1835 source_ids = set(source_records)
1836 log.debug("Number of datastore records found in source: %d", len(source_ids))
1838 # The not None check is to appease mypy
1839 requested_ids = set(ref.id for ref in refs if ref.id is not None)
1840 missing_ids = requested_ids - source_ids
1842 # Missing IDs can be okay if that datastore has allowed
1843 # gets based on file existence. Should we transfer what we can
1844 # or complain about it and warn?
1845 if missing_ids and not source_datastore.trustGetRequest: 1845 ↛ 1846line 1845 didn't jump to line 1846, because the condition on line 1845 was never true
1846 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:"
1847 f" {missing_ids}")
1849 # Need to map these missing IDs to a DatasetRef so we can guess
1850 # the details.
1851 if missing_ids: 1851 ↛ 1852line 1851 didn't jump to line 1852, because the condition on line 1851 was never true
1852 log.info("Number of expected datasets missing from source datastore records: %d",
1853 len(missing_ids))
1854 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
1856 for missing in missing_ids:
1857 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1858 source_records[missing].extend(info for _, info in expected)
1860 # See if we already have these records
1861 target_records = self._get_stored_records_associated_with_refs(local_refs)
1863 # The artifacts to register
1864 artifacts = []
1866 # Refs that already exist
1867 already_present = []
1869 # Now can transfer the artifacts
1870 for source_ref, target_ref in zip(refs, local_refs):
1871 if target_ref.id in target_records: 1871 ↛ 1873line 1871 didn't jump to line 1873, because the condition on line 1871 was never true
1872 # Already have an artifact for this.
1873 already_present.append(target_ref)
1874 continue
1876 # mypy needs to know these are always resolved refs
1877 for info in source_records[source_ref.getCheckedId()]:
1878 source_location = info.file_location(source_datastore.locationFactory)
1879 target_location = info.file_location(self.locationFactory)
1880 if source_location == target_location: 1880 ↛ 1884line 1880 didn't jump to line 1884, because the condition on line 1880 was never true
1881 # Either the dataset is already in the target datastore
1882 # (which is how execution butler currently runs) or
1883 # it is an absolute URI.
1884 if source_location.pathInStore.isabs():
1885 # Just because we can see the artifact when running
1886 # the transfer doesn't mean it will be generally
1887 # accessible to a user of this butler. For now warn
1888 # but assume it will be accessible.
1889 log.warning("Transfer request for an outside-datastore artifact has been found at %s",
1890 source_location)
1891 else:
1892 # Need to transfer it to the new location.
1893 # Assume we should always overwrite. If the artifact
1894 # is there this might indicate that a previous transfer
1895 # was interrupted but was not able to be rolled back
1896 # completely (eg pre-emption) so follow Datastore default
1897 # and overwrite.
1898 target_location.uri.transfer_from(source_location.uri, transfer=transfer,
1899 overwrite=True, transaction=self._transaction)
1901 artifacts.append((target_ref, info))
1903 self._register_datasets(artifacts)
1905 if already_present: 1905 ↛ 1906line 1905 didn't jump to line 1906, because the condition on line 1905 was never true
1906 n_skipped = len(already_present)
1907 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped,
1908 "" if n_skipped == 1 else "s")
1910 @transactional
1911 def forget(self, refs: Iterable[DatasetRef]) -> None:
1912 # Docstring inherited.
1913 refs = list(refs)
1914 self.bridge.forget(refs)
1915 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
1917 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1918 logFailures: bool = False) -> None:
1919 """Validate some of the configuration for this datastore.
1921 Parameters
1922 ----------
1923 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1924 Entities to test against this configuration. Can be differing
1925 types.
1926 logFailures : `bool`, optional
1927 If `True`, output a log message for every validation error
1928 detected.
1930 Raises
1931 ------
1932 DatastoreValidationError
1933 Raised if there is a validation problem with a configuration.
1934 All the problems are reported in a single exception.
1936 Notes
1937 -----
1938 This method checks that all the supplied entities have valid file
1939 templates and also have formatters defined.
1940 """
1942 templateFailed = None
1943 try:
1944 self.templates.validateTemplates(entities, logFailures=logFailures)
1945 except FileTemplateValidationError as e:
1946 templateFailed = str(e)
1948 formatterFailed = []
1949 for entity in entities:
1950 try:
1951 self.formatterFactory.getFormatterClass(entity)
1952 except KeyError as e:
1953 formatterFailed.append(str(e))
1954 if logFailures: 1954 ↛ 1949line 1954 didn't jump to line 1949, because the condition on line 1954 was never false
1955 log.critical("Formatter failure: %s", e)
1957 if templateFailed or formatterFailed:
1958 messages = []
1959 if templateFailed: 1959 ↛ 1960line 1959 didn't jump to line 1960, because the condition on line 1959 was never true
1960 messages.append(templateFailed)
1961 if formatterFailed: 1961 ↛ 1963line 1961 didn't jump to line 1963, because the condition on line 1961 was never false
1962 messages.append(",".join(formatterFailed))
1963 msg = ";\n".join(messages)
1964 raise DatastoreValidationError(msg)
1966 def getLookupKeys(self) -> Set[LookupKey]:
1967 # Docstring is inherited from base class
1968 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1969 self.constraints.getLookupKeys()
1971 def validateKey(self, lookupKey: LookupKey,
1972 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1973 # Docstring is inherited from base class
1974 # The key can be valid in either formatters or templates so we can
1975 # only check the template if it exists
1976 if lookupKey in self.templates:
1977 try:
1978 self.templates[lookupKey].validateTemplate(entity)
1979 except FileTemplateValidationError as e:
1980 raise DatastoreValidationError(e) from e
1982 def export(self, refs: Iterable[DatasetRef], *,
1983 directory: Optional[Union[ButlerURI, str]] = None,
1984 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1985 # Docstring inherited from Datastore.export.
1986 if transfer is not None and directory is None: 1986 ↛ 1987line 1986 didn't jump to line 1987, because the condition on line 1986 was never true
1987 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1988 "export directory given")
1990 # Force the directory to be a URI object
1991 directoryUri: Optional[ButlerURI] = None
1992 if directory is not None: 1992 ↛ 1995line 1992 didn't jump to line 1995, because the condition on line 1992 was never false
1993 directoryUri = ButlerURI(directory, forceDirectory=True)
1995 if transfer is not None and directoryUri is not None: 1995 ↛ 2000line 1995 didn't jump to line 2000, because the condition on line 1995 was never false
1996 # mypy needs the second test
1997 if not directoryUri.exists(): 1997 ↛ 1998line 1997 didn't jump to line 1998, because the condition on line 1997 was never true
1998 raise FileNotFoundError(f"Export location {directory} does not exist")
2000 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2001 for ref in progress.wrap(refs, "Exporting dataset files"):
2002 fileLocations = self._get_dataset_locations_info(ref)
2003 if not fileLocations: 2003 ↛ 2004line 2003 didn't jump to line 2004, because the condition on line 2003 was never true
2004 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2005 # For now we can not export disassembled datasets
2006 if len(fileLocations) > 1: 2006 ↛ 2007line 2006 didn't jump to line 2007, because the condition on line 2006 was never true
2007 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2008 location, storedFileInfo = fileLocations[0]
2010 pathInStore = location.pathInStore.path
2011 if transfer is None: 2011 ↛ 2014line 2011 didn't jump to line 2014, because the condition on line 2011 was never true
2012 # TODO: do we also need to return the readStorageClass somehow?
2013 # We will use the path in store directly
2014 pass
2015 elif transfer == "direct": 2015 ↛ 2017line 2015 didn't jump to line 2017, because the condition on line 2015 was never true
2016 # Use full URIs to the remote store in the export
2017 pathInStore = str(location.uri)
2018 else:
2019 # mypy needs help
2020 assert directoryUri is not None, "directoryUri must be defined to get here"
2021 storeUri = ButlerURI(location.uri)
2023 # if the datastore has an absolute URI to a resource, we
2024 # have two options:
2025 # 1. Keep the absolute URI in the exported YAML
2026 # 2. Allocate a new name in the local datastore and transfer
2027 # it.
2028 # For now go with option 2
2029 if location.pathInStore.isabs(): 2029 ↛ 2030line 2029 didn't jump to line 2030, because the condition on line 2029 was never true
2030 template = self.templates.getTemplate(ref)
2031 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
2032 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2034 exportUri = directoryUri.join(pathInStore)
2035 exportUri.transfer_from(storeUri, transfer=transfer)
2037 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2039 @staticmethod
2040 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
2041 """Compute the checksum of the supplied file.
2043 Parameters
2044 ----------
2045 uri : `ButlerURI`
2046 Name of resource to calculate checksum from.
2047 algorithm : `str`, optional
2048 Name of algorithm to use. Must be one of the algorithms supported
2049 by :py:class`hashlib`.
2050 block_size : `int`
2051 Number of bytes to read from file at one time.
2053 Returns
2054 -------
2055 hexdigest : `str`
2056 Hex digest of the file.
2058 Notes
2059 -----
2060 Currently returns None if the URI is for a remote resource.
2061 """
2062 if algorithm not in hashlib.algorithms_guaranteed: 2062 ↛ 2063line 2062 didn't jump to line 2063, because the condition on line 2062 was never true
2063 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2065 if not uri.isLocal: 2065 ↛ 2066line 2065 didn't jump to line 2066, because the condition on line 2065 was never true
2066 return None
2068 hasher = hashlib.new(algorithm)
2070 with uri.as_local() as local_uri:
2071 with open(local_uri.ospath, "rb") as f:
2072 for chunk in iter(lambda: f.read(block_size), b""):
2073 hasher.update(chunk)
2075 return hasher.hexdigest()
2077 def needs_expanded_data_ids(
2078 self,
2079 transfer: Optional[str],
2080 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2081 ) -> bool:
2082 # Docstring inherited.
2083 # This _could_ also use entity to inspect whether the filename template
2084 # involves placeholders other than the required dimensions for its
2085 # dataset type, but that's not necessary for correctness; it just
2086 # enables more optimizations (perhaps only in theory).
2087 return transfer not in ("direct", None)