Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 83%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileLikeDatastore", )
27import hashlib
28import logging
29import os
30from abc import abstractmethod
32from sqlalchemy import BigInteger, String
34from dataclasses import dataclass
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreConfig,
60 DatastoreValidationError,
61 FileDescriptor,
62 FileTemplates,
63 FileTemplateValidationError,
64 Formatter,
65 FormatterFactory,
66 Location,
67 LocationFactory,
68 StorageClass,
69 StoredFileInfo,
70)
72from lsst.daf.butler import ddl
73from lsst.daf.butler.registry.interfaces import (
74 ReadOnlyDatabaseError,
75 DatastoreRegistryBridge,
76)
78from lsst.daf.butler.core.repoRelocation import replaceRoot
79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
80from .genericDatastore import GenericBaseDatastore
82if TYPE_CHECKING: 82 ↛ 83
83 from lsst.daf.butler import LookupKey
84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
86log = logging.getLogger(__name__)
88# String to use when a Python None is encountered
89NULLSTR = "__NULL_STRING__"
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileLikeDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
100 def __init__(self, datasets: List[FileDataset]):
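# Flatten the refs from every FileDataset into a single sequence for the base class.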
101 super().__init__(ref for dataset in datasets for ref in dataset.refs)
102 self.datasets = datasets
105@dataclass(frozen=True)
106class DatastoreFileGetInformation:
107 """Collection of useful parameters needed to retrieve a file from
108 a Datastore.
109 """
111 location: Location
112 """The location from which to read the dataset."""
114 formatter: Formatter
115 """The `Formatter` to use to deserialize the dataset."""
117 info: StoredFileInfo
118 """Stored information about this file and its formatter."""
120 assemblerParams: Dict[str, Any]
121 """Parameters to use for post-processing the retrieved dataset."""
123 formatterParams: Dict[str, Any]
124 """Parameters that were understood by the associated formatter."""
126 component: Optional[str]
127 """The component to be retrieved (can be `None`)."""
129 readStorageClass: StorageClass
130 """The `StorageClass` of the dataset being read."""
133class FileLikeDatastore(GenericBaseDatastore):
134 """Generic Datastore for file-based implementations.
136 Should always be sub-classed since key abstract methods are missing.
138 Parameters
139 ----------
140 config : `DatastoreConfig` or `str`
141 Configuration as either a `Config` object or URI to file.
142 bridgeManager : `DatastoreRegistryBridgeManager`
143 Object that manages the interface between `Registry` and datastores.
144 butlerRoot : `str`, optional
145 New datastore root to use to override the configuration value.
147 Raises
148 ------
149 ValueError
150 If root location does not exist and ``create`` is `False` in the
151 configuration.
152 """
154 defaultConfigFile: ClassVar[Optional[str]] = None
155 """Path to configuration defaults. Accessed within the ``config`` resource
156 or relative to a search path. Can be None if no defaults specified.
157 """
159 root: ButlerURI
160 """Root directory URI of this `Datastore`."""
162 locationFactory: LocationFactory
163 """Factory for creating locations relative to the datastore root."""
165 formatterFactory: FormatterFactory
166 """Factory for creating instances of formatters."""
168 templates: FileTemplates
169 """File templates that can be used by this `Datastore`."""
171 composites: CompositesMap
172 """Determines whether a dataset should be disassembled on put."""
174 @classmethod
175 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
176 """Set any filesystem-dependent config options for this Datastore to
177 be appropriate for a new empty repository with the given root.
179 Parameters
180 ----------
181 root : `str`
182 URI to the root of the data repository.
183 config : `Config`
184 A `Config` to update. Only the subset understood by
185 this component will be updated. Will not expand
186 defaults.
187 full : `Config`
188 A complete config with all defaults expanded that can be
189 converted to a `DatastoreConfig`. Read-only and will not be
190 modified by this method.
191 Repository-specific options that should not be obtained
192 from defaults when Butler instances are constructed
193 should be copied from ``full`` to ``config``.
194 overwrite : `bool`, optional
195 If `False`, do not modify a value in ``config`` if the value
196 already exists. Default is always to overwrite with the provided
197 ``root``.
199 Notes
200 -----
201 If a keyword is explicitly defined in the supplied ``config`` it
202 will not be overridden by this method if ``overwrite`` is `False`.
203 This allows explicit values set in external configs to be retained.
204 """
205 Config.updateParameters(DatastoreConfig, config, full,
206 toUpdate={"root": root},
207 toCopy=("cls", ("records", "table")), overwrite=overwrite)
209 @classmethod
210 def makeTableSpec(cls) -> ddl.TableSpec:
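"""Define the table schema used by this datastore to record stored file information (path, formatter, storage class, component, checksum, and size), keyed by dataset_id and component."""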
211 return ddl.TableSpec(
212 fields=[
213 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
214 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
215 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
216 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
217 # Use empty string to indicate no component
218 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
219 # TODO: should checksum be Base64Bytes instead?
220 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
221 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
222 ],
223 unique=frozenset(),
224 )
226 def __init__(self, config: Union[DatastoreConfig, str],
227 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
228 super().__init__(config, bridgeManager)
229 if "root" not in self.config: 229 ↛ 230
230 raise ValueError("No root directory specified in configuration")
232 # Name ourselves either using an explicit name or a name
233 # derived from the (unexpanded) root
234 if "name" in self.config:
235 self.name = self.config["name"]
236 else:
237 # We use the unexpanded root in the name to indicate that this
238 # datastore can be moved without having to update registry.
239 self.name = "{}@{}".format(type(self).__name__,
240 self.config["root"])
242 # Support repository relocation in config
243 # Existence of self.root is checked in subclass
244 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
245 forceDirectory=True, forceAbsolute=True)
247 self.locationFactory = LocationFactory(self.root)
248 self.formatterFactory = FormatterFactory()
250 # Now associate formatters with storage classes
251 self.formatterFactory.registerFormatters(self.config["formatters"],
252 universe=bridgeManager.universe)
254 # Read the file naming templates
255 self.templates = FileTemplates(self.config["templates"],
256 universe=bridgeManager.universe)
258 # See if composites should be disassembled
259 self.composites = CompositesMap(self.config["composites"],
260 universe=bridgeManager.universe)
262 tableName = self.config["records", "table"]
263 try:
264 # Storage of paths and formatters, keyed by dataset_id
265 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
266 # Interface to Registry.
267 self._bridge = bridgeManager.register(self.name)
268 except ReadOnlyDatabaseError:
269 # If the database is read only and we just tried and failed to
270 # create a table, it means someone is trying to create a read-only
271 # butler client for an empty repo. That should be okay, as long
272 # as they then try to get any datasets before some other client
273 # creates the table. Chances are they're just validating
274 # configuration.
275 pass
277 # Determine whether checksums should be used - default to False
278 self.useChecksum = self.config.get("checksum", False)
280 def __str__(self) -> str:
281 return str(self.root)
283 @property
284 def bridge(self) -> DatastoreRegistryBridge:
285 return self._bridge
287 def _artifact_exists(self, location: Location) -> bool:
288 """Check that an artifact exists in this datastore at the specified
289 location.
291 Parameters
292 ----------
293 location : `Location`
294 Expected location of the artifact associated with this datastore.
296 Returns
297 -------
298 exists : `bool`
299 `True` if the location can be found, `False` otherwise.
300 """
301 log.debug("Checking if resource exists: %s", location.uri)
302 return location.uri.exists()
304 def _delete_artifact(self, location: Location) -> None:
305 """Delete the artifact from the datastore.
307 Parameters
308 ----------
309 location : `Location`
310 Location of the artifact associated with this datastore.
311 """
312 log.debug("Deleting file: %s", location.uri)
313 location.uri.remove()
314 log.debug("Successfully deleted file: %s", location.uri)
316 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
317 # Docstring inherited from GenericBaseDatastore
318 records = []
319 for ref, info in zip(refs, infos):
320 # Component should come from ref and fall back on info
321 component = ref.datasetType.component()
322 if component is None and info.component is not None: 322 ↛ 323
323 component = info.component
324 if component is None:
325 # Use empty string since we want this to be part of the
326 # primary key.
327 component = NULLSTR
328 records.append(
329 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
330 storage_class=info.storageClass.name, component=component,
331 checksum=info.checksum, file_size=info.file_size)
332 )
333 self._table.insert(*records)
335 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
336 # Docstring inherited from GenericBaseDatastore
338 # Look for the dataset_id -- there might be multiple matches
339 # if we have disassembled the dataset.
340 records = list(self._table.fetch(dataset_id=ref.id))
342 results = []
343 for record in records:
344 # Convert name of StorageClass to instance
345 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
346 component = record["component"] if (record["component"]
347 and record["component"] != NULLSTR) else None
349 info = StoredFileInfo(formatter=record["formatter"],
350 path=record["path"],
351 storageClass=storageClass,
352 component=component,
353 checksum=record["checksum"],
354 file_size=record["file_size"])
355 results.append(info)
357 return results
359 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]:
360 """Return all dataset refs associated with the supplied path.
362 Parameters
363 ----------
364 pathInStore : `str`
365 Path of interest in the data store.
367 Returns
368 -------
369 ids : `set` of `int`
370 All `DatasetRef` IDs associated with this path.
371 """
372 records = list(self._table.fetch(path=pathInStore))
373 ids = {r["dataset_id"] for r in records}
374 return ids
376 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
377 # Docstring inherited from GenericBaseDatastore
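# There may be multiple rows for this dataset_id if the dataset was disassembled; delete them all.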
378 self._table.delete(dataset_id=ref.id)
380 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
381 r"""Find all the `Location`\ s of the requested dataset in the
382 `Datastore` and the associated stored file information.
384 Parameters
385 ----------
386 ref : `DatasetRef`
387 Reference to the required `Dataset`.
389 Returns
390 -------
391 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
392 Location of the dataset within the datastore and
393 stored information about each file and its formatter.
394 """
395 # Get the file information (this will fail if no file)
396 records = self.getStoredItemsInfo(ref)
398 # Use the path to determine the location
399 return [(self.locationFactory.fromPath(r.path), r) for r in records]
401 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
402 """Check that there is only one dataset associated with the
403 specified artifact.
405 Parameters
406 ----------
407 ref : `DatasetRef` or `FakeDatasetRef`
408 Dataset to be removed.
409 location : `Location`
410 The location of the artifact to be removed.
412 Returns
413 -------
414 can_remove : `bool`
415 True if the artifact can be safely removed.
416 """
418 # Get all entries associated with this path
419 allRefs = self._registered_refs_per_artifact(location.pathInStore)
420 if not allRefs: 420 ↛ 421
421 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
423 # Remove this ref from the set of all refs; if nothing is left
424 # then we can delete the artifact
425 remainingRefs = allRefs - {ref.id}
427 if remainingRefs:
428 return False
429 return True
431 def _prepare_for_get(self, ref: DatasetRef,
432 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
433 """Check parameters for ``get`` and obtain formatter and
434 location.
436 Parameters
437 ----------
438 ref : `DatasetRef`
439 Reference to the required Dataset.
440 parameters : `dict`
441 `StorageClass`-specific parameters that specify, for example,
442 a slice of the dataset to be loaded.
444 Returns
445 -------
446 getInfo : `list` [`DatastoreFileGetInformation`]
447 Parameters needed to retrieve each file.
448 """
449 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
451 # Get file metadata and internal metadata
452 fileLocations = self._get_dataset_locations_info(ref)
453 if not fileLocations:
454 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
456 # The storage class we want to use eventually
457 refStorageClass = ref.datasetType.storageClass
459 if len(fileLocations) > 1:
460 disassembled = True
461 else:
462 disassembled = False
464 # Is this a component request?
465 refComponent = ref.datasetType.component()
467 fileGetInfo = []
468 for location, storedFileInfo in fileLocations:
470 # The storage class used to write the file
471 writeStorageClass = storedFileInfo.storageClass
473 # If this has been disassembled we need read to match the write
474 if disassembled:
475 readStorageClass = writeStorageClass
476 else:
477 readStorageClass = refStorageClass
479 formatter = getInstanceOf(storedFileInfo.formatter,
480 FileDescriptor(location, readStorageClass=readStorageClass,
481 storageClass=writeStorageClass, parameters=parameters),
482 ref.dataId)
484 formatterParams, notFormatterParams = formatter.segregateParameters()
486 # Of the remaining parameters, extract the ones supported by
487 # this StorageClass (for components not all will be handled)
488 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
490 # The ref itself could be a component if the dataset was
491 # disassembled by butler, or we disassembled in datastore and
492 # components came from the datastore records
493 component = storedFileInfo.component if storedFileInfo.component else refComponent
495 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
496 assemblerParams, formatterParams,
497 component, readStorageClass))
499 return fileGetInfo
501 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
502 """Check the arguments for ``put`` and obtain formatter and
503 location.
505 Parameters
506 ----------
507 inMemoryDataset : `object`
508 The dataset to store.
509 ref : `DatasetRef`
510 Reference to the associated Dataset.
512 Returns
513 -------
514 location : `Location`
515 The location to write the dataset.
516 formatter : `Formatter`
517 The `Formatter` to use to write the dataset.
519 Raises
520 ------
521 TypeError
522 Supplied object and storage class are inconsistent.
523 DatasetTypeNotSupportedError
524 The associated `DatasetType` is not handled by this datastore.
525 """
526 self._validate_put_parameters(inMemoryDataset, ref)
528 # Work out output file name
529 try:
530 template = self.templates.getTemplate(ref)
531 except KeyError as e:
532 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
534 # Validate the template to protect against filenames from different
535 # dataIds mapping to the same filename and causing overwrite confusion.
536 template.validateTemplate(ref)
538 location = self.locationFactory.fromPath(template.format(ref))
540 # Get the formatter based on the storage class
541 storageClass = ref.datasetType.storageClass
542 try:
543 formatter = self.formatterFactory.getFormatter(ref,
544 FileDescriptor(location,
545 storageClass=storageClass),
546 ref.dataId)
547 except KeyError as e:
548 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
549 f"{self.name}") from e
551 # Now that we know the formatter, update the location
552 location = formatter.makeUpdatedLocation(location)
554 return location, formatter
556 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
557 # Docstring inherited from base class
558 if transfer != "auto":
559 return transfer
561 # See if the paths are within the datastore or not
562 inside = [self._pathInStore(d.path) is not None for d in datasets]
564 if all(inside):
565 transfer = None
566 elif not any(inside): 566 ↛ 570
567 # Allow ButlerURI to use its own knowledge
568 transfer = "auto"
569 else:
570 raise ValueError("Some datasets are inside the datastore and some are outside."
571 " Please use an explicit transfer mode and not 'auto'.")
573 return transfer
575 def _pathInStore(self, path: str) -> Optional[str]:
576 """Return path relative to datastore root
578 Parameters
579 ----------
580 path : `str`
581 Path to dataset. Can be absolute. If relative, it is assumed to
582 be relative to the datastore. Returns the path in the datastore,
583 or `None` if the path is outside the datastore.
585 Returns
586 -------
587 inStore : `str`
588 Path relative to datastore root. Returns `None` if the file is
589 outside the root.
590 """
591 # Relative path will always be relative to datastore
592 pathUri = ButlerURI(path, forceAbsolute=False)
593 return pathUri.relative_to(self.root)
595 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
596 """Standardize the path of a to-be-ingested file.
598 Parameters
599 ----------
600 path : `str`
601 Path of a file to be ingested.
602 transfer : `str`, optional
603 How (and whether) the dataset should be added to the datastore.
604 See `ingest` for details of transfer modes.
605 This implementation is provided only so
606 `NotImplementedError` can be raised if the mode is not supported;
607 actual transfers are deferred to `_extractIngestInfo`.
609 Returns
610 -------
611 path : `str`
612 New path in what the datastore considers standard form.
614 Notes
615 -----
616 Subclasses of `FileLikeDatastore` can implement this method instead
617 of `_prepIngest`. It should not modify the data repository or given
618 file in any way.
620 Raises
621 ------
622 NotImplementedError
623 Raised if the datastore does not support the given transfer mode
624 (including the case where ingest is not supported at all).
625 FileNotFoundError
626 Raised if one of the given files does not exist.
627 """
628 if transfer not in (None,) + self.root.transferModes: 628 ↛ 629
629 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
631 # A relative URI indicates relative to datastore root
632 srcUri = ButlerURI(path, forceAbsolute=False)
633 if not srcUri.isabs():
634 srcUri = self.root.join(path)
636 if not srcUri.exists():
637 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
638 f"are assumed to be relative to {self.root} unless they are absolute.")
640 if transfer is None:
641 relpath = srcUri.relative_to(self.root)
642 if not relpath:
643 raise RuntimeError(f"Transfer mode is None but source file ({srcUri}) is not "
644 f"within datastore ({self.root})")
646 # Return the relative path within the datastore for internal
647 # transfer
648 path = relpath
650 return path
652 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
653 formatter: Union[Formatter, Type[Formatter]],
654 transfer: Optional[str] = None) -> StoredFileInfo:
655 """Relocate (if necessary) and extract `StoredFileInfo` from a
656 to-be-ingested file.
658 Parameters
659 ----------
660 path : `str` or `ButlerURI`
661 URI or path of a file to be ingested.
662 ref : `DatasetRef`
663 Reference for the dataset being ingested. Guaranteed to have
664 ``dataset_id`` not `None`.
665 formatter : `type` or `Formatter`
666 `Formatter` subclass to use for this dataset or an instance.
667 transfer : `str`, optional
668 How (and whether) the dataset should be added to the datastore.
669 See `ingest` for details of transfer modes.
671 Returns
672 -------
673 info : `StoredFileInfo`
674 Internal datastore record for this file. This will be inserted by
675 the caller; `_extractIngestInfo` is only responsible for
676 creating and populating the struct.
678 Raises
679 ------
680 FileNotFoundError
681 Raised if one of the given files does not exist.
682 FileExistsError
683 Raised if transfer is not `None` but the (internal) location the
684 file would be moved to is already occupied.
685 """
686 if self._transaction is None: 686 ↛ 687
687 raise RuntimeError("Ingest called without transaction enabled")
689 # Create URI of the source path, do not need to force a relative
690 # path to absolute.
691 srcUri = ButlerURI(path, forceAbsolute=False)
693 # Track whether we have read the size of the source yet
694 have_sized = False
696 if transfer is None:
697 # A relative path is assumed to be relative to the datastore
698 # in this context
699 if not srcUri.isabs():
700 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
701 else:
702 # Work out the path in the datastore from an absolute URI
703 # This is required to be within the datastore.
704 pathInStore = srcUri.relative_to(self.root)
705 if pathInStore is None: 705 ↛ 706
706 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
707 f"not within datastore {self.root}")
708 tgtLocation = self.locationFactory.fromPath(pathInStore)
709 else:
710 # Work out the name we want this ingested file to have
711 # inside the datastore
712 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
713 if not tgtLocation.uri.dirname().exists():
714 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
715 tgtLocation.uri.dirname().mkdir()
717 # if we are transferring from a local file to a remote location
718 # it may be more efficient to get the size and checksum of the
719 # local file rather than the transferred one
720 if not srcUri.scheme or srcUri.scheme == "file": 720 ↛ 726
721 size = srcUri.size()
722 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
723 have_sized = True
725 # transfer the resource to the destination
726 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
728 # the file should exist in the datastore now
729 if not have_sized:
730 size = tgtLocation.uri.size()
731 checksum = self.computeChecksum(tgtLocation.uri) if self.useChecksum else None
733 return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
734 storageClass=ref.datasetType.storageClass,
735 component=ref.datasetType.component(),
736 file_size=size, checksum=checksum)
738 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
739 # Docstring inherited from Datastore._prepIngest.
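# Keep only datasets with at least one ref accepted by the constraints; resolve formatter classes and standardize paths for the rest.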
740 filtered = []
741 for dataset in datasets:
742 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
743 if not acceptable:
744 continue
745 else:
746 dataset.refs = acceptable
747 if dataset.formatter is None:
748 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
749 else:
750 assert isinstance(dataset.formatter, (type, str))
751 dataset.formatter = getClassOf(dataset.formatter)
752 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
753 filtered.append(dataset)
754 return _IngestPrepData(filtered)
756 @transactional
757 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
758 # Docstring inherited from Datastore._finishIngest.
759 refsAndInfos = []
760 for dataset in prepData.datasets:
761 # Do ingest as if the first dataset ref is associated with the file
762 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
763 transfer=transfer)
764 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
765 self._register_datasets(refsAndInfos)
767 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
768 formatter: Union[Formatter, Type[Formatter]]) -> Location:
769 """Given a source URI and a DatasetRef, determine the name the
770 dataset will have inside datastore.
772 Parameters
773 ----------
774 srcUri : `ButlerURI`
775 URI to the source dataset file.
776 ref : `DatasetRef`
777 Ref associated with the newly-ingested dataset artifact. This
778 is used to determine the name within the datastore.
779 formatter : `Formatter` or Formatter class.
780 Formatter to use for validation. Can be a class or an instance.
782 Returns
783 -------
784 location : `Location`
785 Target location for the newly-ingested dataset.
786 """
787 # Ingesting a file from outside the datastore.
788 # This involves a new name.
789 template = self.templates.getTemplate(ref)
790 location = self.locationFactory.fromPath(template.format(ref))
792 # Get the extension
793 ext = srcUri.getExtension()
795 # Update the destination to include that extension
796 location.updateExtension(ext)
798 # Ask the formatter to validate this extension
799 formatter.validateExtension(location)
801 return location
803 @abstractmethod
804 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
805 """Write out in memory dataset to datastore.
807 Parameters
808 ----------
809 inMemoryDataset : `object`
810 Dataset to write to datastore.
811 ref : `DatasetRef`
812 Registry information associated with this dataset.
814 Returns
815 -------
816 info : `StoredFileInfo`
817 Information describing the artifact written to the datastore.
818 """
819 raise NotImplementedError()
821 @abstractmethod
822 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
823 ref: DatasetRef, isComponent: bool = False) -> Any:
824 """Read the artifact from datastore into in memory object.
826 Parameters
827 ----------
828 getInfo : `DatastoreFileGetInformation`
829 Information about the artifact within the datastore.
830 ref : `DatasetRef`
831 The registry information associated with this artifact.
832 isComponent : `bool`
833 Flag to indicate if a component is being read from this artifact.
835 Returns
836 -------
837 inMemoryDataset : `object`
838 The artifact as a python object.
839 """
840 raise NotImplementedError()
842 def exists(self, ref: DatasetRef) -> bool:
843 """Check if the dataset exists in the datastore.
845 Parameters
846 ----------
847 ref : `DatasetRef`
848 Reference to the required dataset.
850 Returns
851 -------
852 exists : `bool`
853 `True` if the entity exists in the `Datastore`.
854 """
855 fileLocations = self._get_dataset_locations_info(ref)
856 if not fileLocations:
857 return False
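# Every artifact associated with the dataset must exist for it to be considered stored.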
858 for location, _ in fileLocations:
859 if not self._artifact_exists(location):
860 return False
862 return True
864 def getURIs(self, ref: DatasetRef,
865 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
866 """Return URIs associated with dataset.
868 Parameters
869 ----------
870 ref : `DatasetRef`
871 Reference to the required dataset.
872 predict : `bool`, optional
873 If the datastore does not know about the dataset, should it
874 return a predicted URI or not?
876 Returns
877 -------
878 primary : `ButlerURI`
879 The URI to the primary artifact associated with this dataset.
880 If the dataset was disassembled within the datastore this
881 may be `None`.
882 components : `dict`
883 URIs to any components associated with the dataset artifact.
884 Can be empty if there are no components.
885 """
887 primary: Optional[ButlerURI] = None
888 components: Dict[str, ButlerURI] = {}
890 # if this has never been written then we have to guess
891 if not self.exists(ref):
892 if not predict:
893 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
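# Predict a location from the file template, letting the formatter adjust the extension when it can.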
895 def predictLocation(thisRef: DatasetRef) -> Location:
896 template = self.templates.getTemplate(thisRef)
897 location = self.locationFactory.fromPath(template.format(thisRef))
898 storageClass = ref.datasetType.storageClass
899 formatter = self.formatterFactory.getFormatter(thisRef,
900 FileDescriptor(location,
901 storageClass=storageClass))
902 # Try to use the extension attribute but ignore problems if the
903 # formatter does not define one.
904 try:
905 location = formatter.makeUpdatedLocation(location)
906 except Exception:
907 # Use the default extension
908 pass
909 return location
911 doDisassembly = self.composites.shouldBeDisassembled(ref)
913 if doDisassembly:
915 for component, componentStorage in ref.datasetType.storageClass.components.items():
916 compRef = ref.makeComponentRef(component)
917 compLocation = predictLocation(compRef)
919 # Add a URI fragment to indicate this is a guess
920 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
922 else:
924 location = predictLocation(ref)
926 # Add a URI fragment to indicate this is a guess
927 primary = ButlerURI(location.uri.geturl() + "#predicted")
929 return primary, components
931 # If this is a ref that we have written we can get the path.
932 # Get file metadata and internal metadata
933 fileLocations = self._get_dataset_locations_info(ref)
935 if not fileLocations: 935 ↛ 936
936 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
938 if len(fileLocations) == 1:
939 # No disassembly so this is the primary URI
940 primary = ButlerURI(fileLocations[0][0].uri)
942 else:
943 for location, storedFileInfo in fileLocations:
944 if storedFileInfo.component is None: 944 ↛ 945
945 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
946 components[storedFileInfo.component] = ButlerURI(location.uri)
948 return primary, components
950 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
951 """URI to the Dataset.
953 Parameters
954 ----------
955 ref : `DatasetRef`
956 Reference to the required Dataset.
957 predict : `bool`
958 If `True`, allow URIs to be returned of datasets that have not
959 been written.
961 Returns
962 -------
963 uri : `ButlerURI`
964 URI pointing to the dataset within the datastore. If the
965 dataset does not exist in the datastore, and if ``predict`` is
966 `True`, the URI will be a prediction and will include a URI
967 fragment "#predicted".
968 If the datastore does not have entities that relate well
969 to the concept of a URI the returned URI will be
970 descriptive. The returned URI is not guaranteed to be obtainable.
972 Raises
973 ------
974 FileNotFoundError
975 Raised if a URI has been requested for a dataset that does not
976 exist and guessing is not allowed.
977 RuntimeError
978 Raised if a request is made for a single URI but multiple URIs
979 are associated with this dataset.
981 Notes
982 -----
983 When a predicted URI is requested an attempt will be made to form
984 a reasonable URI based on file templates and the expected formatter.
985 """
986 primary, components = self.getURIs(ref, predict)
987 if primary is None or components: 987 ↛ 988
988 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
989 "Use Dataastore.getURIs() instead.")
990 return primary
992 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
993 """Load an InMemoryDataset from the store.
995 Parameters
996 ----------
997 ref : `DatasetRef`
998 Reference to the required Dataset.
999 parameters : `dict`
1000 `StorageClass`-specific parameters that specify, for example,
1001 a slice of the dataset to be loaded.
1003 Returns
1004 -------
1005 inMemoryDataset : `object`
1006 Requested dataset or slice thereof as an InMemoryDataset.
1008 Raises
1009 ------
1010 FileNotFoundError
1011 Requested dataset can not be retrieved.
1012 TypeError
1013 Return value from formatter has unexpected type.
1014 ValueError
1015 Formatter failed to process the dataset.
1016 """
1017 allGetInfo = self._prepare_for_get(ref, parameters)
1018 refComponent = ref.datasetType.component()
1020 # Supplied storage class for the component being read
1021 refStorageClass = ref.datasetType.storageClass
1023 # Create mapping from component name to related info
1024 allComponents = {i.component: i for i in allGetInfo}
1026 # By definition the dataset is disassembled if we have more
1027 # than one record for it.
1028 isDisassembled = len(allGetInfo) > 1
1030 # Look for the special case where we are disassembled but the
1031 # component is a derived component that was not written during
1032 # disassembly. For this scenario we need to check that the
1033 # component requested is listed as a derived component for the
1034 # composite storage class
1035 isDisassembledReadOnlyComponent = False
1036 if isDisassembled and refComponent:
1037 # The composite storage class should be accessible through
1038 # the component dataset type
1039 compositeStorageClass = ref.datasetType.parentStorageClass
1041 # In the unlikely scenario where the composite storage
1042 # class is not known, we can only assume that this is a
1043 # normal component. If that assumption is wrong then the
1044 # branch below that reads a persisted component will fail
1045 # so there is no need to complain here.
1046 if compositeStorageClass is not None: 1046 ↛ 1049
1047 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1049 if isDisassembled and not refComponent:
1050 # This was a disassembled dataset spread over multiple files
1051 # and we need to put them all back together again.
1052 # Read into memory and then assemble
1054 # Check that the supplied parameters are suitable for the type read
1055 refStorageClass.validateParameters(parameters)
1057 # We want to keep track of all the parameters that were not used
1058 # by formatters. We assume that if any of the component formatters
1059 # use a parameter that we do not need to apply it again in the
1060 # assembler.
1061 usedParams = set()
1063 components: Dict[str, Any] = {}
1064 for getInfo in allGetInfo:
1065 # assemblerParams are parameters not understood by the
1066 # associated formatter.
1067 usedParams.update(set(getInfo.formatterParams))
1069 component = getInfo.component
1071 if component is None: 1071 ↛ 1072
1072 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1074 # We do not want the formatter to think it's reading
1075 # a component though because it is really reading a
1076 # standalone dataset -- always tell reader it is not a
1077 # component.
1078 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1080 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1082 # Any unused parameters will have to be passed to the assembler
1083 if parameters:
1084 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1085 else:
1086 unusedParams = {}
1088 # Process parameters
1089 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1090 parameters=unusedParams)
1092 elif isDisassembledReadOnlyComponent:
1094 compositeStorageClass = ref.datasetType.parentStorageClass
1095 if compositeStorageClass is None: 1095 ↛ 1096
1096 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1097 "no composite storage class is available.")
1099 if refComponent is None: 1099 ↛ 1101
1100 # Mainly for mypy
1101 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1103 # Assume that every derived component can be calculated by
1104 # forwarding the request to a single read/write component.
1105 # Rather than guessing which rw component is the right one by
1106 # scanning each for a derived component of the same name,
1107 # we ask the storage class delegate directly which one is best to
1108 # use.
1109 compositeDelegate = compositeStorageClass.delegate()
1110 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1111 set(allComponents))
1113 # Select the relevant component
1114 rwInfo = allComponents[forwardedComponent]
1116 # For now assume that read parameters are validated against
1117 # the real component and not the requested component
1118 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1119 forwardedStorageClass.validateParameters(parameters)
1121 # Unfortunately the FileDescriptor inside the formatter will have
1122 # the wrong write storage class so we need to create a new one
1123 # given the immutability constraint.
1124 writeStorageClass = rwInfo.info.storageClass
1126 # We may need to put some thought into parameters for read
1127 # components but for now forward them on as is
1128 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1129 readStorageClass=refStorageClass,
1130 storageClass=writeStorageClass,
1131 parameters=parameters),
1132 ref.dataId)
1134 # The assembler can not receive any parameter requests for a
1135 # derived component at this time since the assembler will
1136 # see the storage class of the derived component and those
1137 # parameters will have to be handled by the formatter on the
1138 # forwarded storage class.
1139 assemblerParams: Dict[str, Any] = {}
1141 # Need to create a new info that specifies the derived
1142 # component and associated storage class
1143 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1144 rwInfo.info, assemblerParams, {},
1145 refComponent, refStorageClass)
1147 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1149 else:
1150 # Single file request or component from that composite file
1151 for lookup in (refComponent, None): 1151 ↛ 1156
1152 if lookup in allComponents: 1152 ↛ 1151
1153 getInfo = allComponents[lookup]
1154 break
1155 else:
1156 raise FileNotFoundError(f"Component {refComponent} not found "
1157 f"for ref {ref} in datastore {self.name}")
1159 # Do not need the component itself if already disassembled
1160 if isDisassembled:
1161 isComponent = False
1162 else:
1163 isComponent = getInfo.component is not None
1165 # For a disassembled component we can validate parameters against
1166 # the component storage class directly
1167 if isDisassembled:
1168 refStorageClass.validateParameters(parameters)
1169 else:
1170 # For an assembled composite this could be a derived
1171 # component derived from a real component. The validity
1172 # of the parameters is not clear. For now validate against
1173 # the composite storage class
1174 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1176 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
1178 @transactional
1179 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1180 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1182 Parameters
1183 ----------
1184 inMemoryDataset : `object`
1185 The dataset to store.
1186 ref : `DatasetRef`
1187 Reference to the associated Dataset.
1189 Raises
1190 ------
1191 TypeError
1192 Supplied object and storage class are inconsistent.
1193 DatasetTypeNotSupportedError
1194 The associated `DatasetType` is not handled by this datastore.
1196 Notes
1197 -----
1198 If the datastore is configured to reject certain dataset types it
1199 is possible that the put will fail and raise a
1200 `DatasetTypeNotSupportedError`. The main use case for this is to
1201 allow `ChainedDatastore` to put to multiple datastores without
1202 requiring that every datastore accepts the dataset.
1203 """
1205 doDisassembly = self.composites.shouldBeDisassembled(ref)
1206 # doDisassembly = True
1208 artifacts = []
1209 if doDisassembly:
1210 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1211 for component, componentInfo in components.items():
1212 # Don't recurse because we want to take advantage of
1213 # bulk insert -- need a new DatasetRef that refers to the
1214 # same dataset_id but has the component DatasetType
1215 # DatasetType does not refer to the types of components
1216 # So we construct one ourselves.
1217 compRef = ref.makeComponentRef(component)
1218 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1219 artifacts.append((compRef, storedInfo))
1220 else:
1221 # Write the entire thing out
1222 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1223 artifacts.append((ref, storedInfo))
1225 self._register_datasets(artifacts)
1227 @transactional
1228 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1229 """Indicate to the datastore that a dataset can be removed.
1231 Parameters
1232 ----------
1233 ref : `DatasetRef`
1234 Reference to the required Dataset.
1235 ignore_errors : `bool`
1236 If `True` return without error even if something went wrong.
1237 Problems could occur if another process is simultaneously trying
1238 to delete.
1240 Raises
1241 ------
1242 FileNotFoundError
1243 Attempt to remove a dataset that does not exist.
1244 """
1245 # Get file metadata and internal metadata
1246 log.debug("Trashing %s in datastore %s", ref, self.name)
1248 fileLocations = self._get_dataset_locations_info(ref)
1250 if not fileLocations:
1251 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1252 if ignore_errors:
1253 log.warning(err_msg)
1254 return
1255 else:
1256 raise FileNotFoundError(err_msg)
1258 for location, storedFileInfo in fileLocations:
1259 if not self._artifact_exists(location): 1259 ↛ 1260
1260 err_msg = f"Dataset is known to datastore {self.name} but " \
1261 f"associated artifact ({location.uri}) is missing"
1262 if ignore_errors:
1263 log.warning(err_msg)
1264 return
1265 else:
1266 raise FileNotFoundError(err_msg)
1268 # Mark dataset as trashed
1269 try:
1270 self._move_to_trash_in_registry(ref)
1271 except Exception as e:
1272 if ignore_errors:
1273 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1274 f"but encountered an error: {e}")
1275 pass
1276 else:
1277 raise
1279 @transactional
1280 def emptyTrash(self, ignore_errors: bool = True) -> None:
1281 """Remove all datasets from the trash.
1283 Parameters
1284 ----------
1285 ignore_errors : `bool`
1286 If `True` return without error even if something went wrong.
1287 Problems could occur if another process is simultaneously trying
1288 to delete.
1289 """
1290 log.debug("Emptying trash in datastore %s", self.name)
1291 # Context manager will empty trash iff we finish it without raising.
1292 with self.bridge.emptyTrash() as trashed:
1293 for ref in trashed:
1294 fileLocations = self._get_dataset_locations_info(ref)
1296 if not fileLocations: 1296 ↛ 1297
1297 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1298 if ignore_errors:
1299 log.warning(err_msg)
1300 continue
1301 else:
1302 raise FileNotFoundError(err_msg)
1304 for location, _ in fileLocations:
1306 if not self._artifact_exists(location): 1306 ↛ 1307
1307 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1308 if ignore_errors:
1309 log.warning(err_msg)
1310 continue
1311 else:
1312 raise FileNotFoundError(err_msg)
1314 # Can only delete the artifact if there are no references
1315 # to the file from untrashed dataset refs.
1316 if self._can_remove_dataset_artifact(ref, location):
1317 # Point of no return for this artifact
1318 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1319 try:
1320 self._delete_artifact(location)
1321 except Exception as e:
1322 if ignore_errors:
1323 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1324 location.uri, self.name, e)
1325 else:
1326 raise
1328 # Now must remove the entry from the internal registry even if
1329 # the artifact removal failed and was ignored,
1330 # otherwise the removal check above will never be true
1331 try:
1332 # There may be multiple rows associated with this ref
1333 # depending on disassembly
1334 self.removeStoredItemInfo(ref)
1335 except Exception as e:
1336 if ignore_errors:
1337 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1338 ref.id, location.uri, self.name, e)
1339 continue
1340 else:
1341 raise FileNotFoundError(err_msg)
1343 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1344 logFailures: bool = False) -> None:
1345 """Validate some of the configuration for this datastore.
1347 Parameters
1348 ----------
1349 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1350 Entities to test against this configuration. Can be differing
1351 types.
1352 logFailures : `bool`, optional
1353 If `True`, output a log message for every validation error
1354 detected.
1356 Raises
1357 ------
1358 DatastoreValidationError
1359 Raised if there is a validation problem with a configuration.
1360 All the problems are reported in a single exception.
1362 Notes
1363 -----
1364 This method checks that all the supplied entities have valid file
1365 templates and also have formatters defined.
1366 """
1368 templateFailed = None
1369 try:
1370 self.templates.validateTemplates(entities, logFailures=logFailures)
1371 except FileTemplateValidationError as e:
1372 templateFailed = str(e)
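# Check that a formatter can be found for every entity; collect all failures so they can be reported together.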
1374 formatterFailed = []
1375 for entity in entities:
1376 try:
1377 self.formatterFactory.getFormatterClass(entity)
1378 except KeyError as e:
1379 formatterFailed.append(str(e))
1380 if logFailures: 1380 ↛ 1375
1381 log.fatal("Formatter failure: %s", e)
1383 if templateFailed or formatterFailed:
1384 messages = []
1385 if templateFailed: 1385 ↛ 1386
1386 messages.append(templateFailed)
1387 if formatterFailed: 1387 ↛ 1389
1388 messages.append(",".join(formatterFailed))
1389 msg = ";\n".join(messages)
1390 raise DatastoreValidationError(msg)
1392 def getLookupKeys(self) -> Set[LookupKey]:
1393 # Docstring is inherited from base class
1394 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1395 self.constraints.getLookupKeys()
1397 def validateKey(self, lookupKey: LookupKey,
1398 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1399 # Docstring is inherited from base class
1400 # The key can be valid in either formatters or templates so we can
1401 # only check the template if it exists
1402 if lookupKey in self.templates:
1403 try:
1404 self.templates[lookupKey].validateTemplate(entity)
1405 except FileTemplateValidationError as e:
1406 raise DatastoreValidationError(e) from e
1408 def export(self, refs: Iterable[DatasetRef], *,
1409 directory: Optional[Union[ButlerURI, str]] = None,
1410 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1411 # Docstring inherited from Datastore.export.
1412 if transfer is not None and directory is None: 1412 ↛ 1413
1413 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1414 "export directory given")
1416 # Force the directory to be a URI object
1417 directoryUri: Optional[ButlerURI] = None
1418 if directory is not None: 1418 ↛ 1421
1419 directoryUri = ButlerURI(directory, forceDirectory=True)
1421 if transfer is not None and directoryUri is not None: 1421 ↛ 1426
1422 # mypy needs the second test
1423 if not directoryUri.exists(): 1423 ↛ 1424
1424 raise FileNotFoundError(f"Export location {directory} does not exist")
1426 for ref in refs:
1427 fileLocations = self._get_dataset_locations_info(ref)
1428 if not fileLocations: 1428 ↛ 1429
1429 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1430 # For now we can not export disassembled datasets
1431 if len(fileLocations) > 1:
1432 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
1433 location, storedFileInfo = fileLocations[0]
1434 if transfer is None: 1434 ↛ 1437
1435 # TODO: do we also need to return the readStorageClass somehow?
1436 # We will use the path in store directly
1437 pass
1438 else:
1439 # mypy needs help
1440 assert directoryUri is not None, "directoryUri must be defined to get here"
1441 storeUri = ButlerURI(location.uri)
1442 exportUri = directoryUri.join(location.pathInStore)
1443 exportUri.transfer_from(storeUri, transfer=transfer)
1445 yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
1447 @staticmethod
1448 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1449 """Compute the checksum of the supplied file.
1451 Parameters
1452 ----------
1453 uri : `ButlerURI`
1454 Name of resource to calculate checksum from.
1455 algorithm : `str`, optional
1456 Name of algorithm to use. Must be one of the algorithms supported
1457 by :py:class:`hashlib`.
1458 block_size : `int`
1459 Number of bytes to read from file at one time.
1461 Returns
1462 -------
1463 hexdigest : `str`
1464 Hex digest of the file.
1466 Notes
1467 -----
1468 Currently returns None if the URI is for a remote resource.
1469 """
1470 if algorithm not in hashlib.algorithms_guaranteed: 1470 ↛ 1471
1471 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1473 if uri.scheme and uri.scheme != "file": 1473 ↛ 1474
1474 return None
1476 hasher = hashlib.new(algorithm)
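# as_local() provides a filesystem path to the resource; any temporary copy it makes is removed after hashing.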
1478 filename, is_temp = uri.as_local()
1480 with open(filename, "rb") as f:
1481 for chunk in iter(lambda: f.read(block_size), b""):
1482 hasher.update(chunk)
1484 if is_temp: 1484 ↛ 1485
1485 os.remove(filename)
1487 return hasher.hexdigest()