Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 83%

1 # This file is part of daf_butler.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
23 """Generic file-based datastore code."""
25 __all__ = ("FileLikeDatastore", )
27 import hashlib
28 import logging
29 import os
30 from abc import abstractmethod
32 from sqlalchemy import BigInteger, String
34 from dataclasses import dataclass
35 from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48 )
50 from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreConfig,
60 DatastoreValidationError,
61 FileDescriptor,
62 FileTemplates,
63 FileTemplateValidationError,
64 Formatter,
65 FormatterFactory,
66 Location,
67 LocationFactory,
68 StorageClass,
69 StoredFileInfo,
70 )
72 from lsst.daf.butler import ddl
73 from lsst.daf.butler.registry.interfaces import (
74 ReadOnlyDatabaseError,
75 DatastoreRegistryBridge,
76 )
78 from lsst.daf.butler.core.repoRelocation import replaceRoot
79 from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
80 from .genericDatastore import GenericBaseDatastore
82 if TYPE_CHECKING:  82 ↛ 83 (line 82 didn't jump to line 83, because the condition on line 82 was never true)
83 from lsst.daf.butler import LookupKey
84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
86 log = logging.getLogger(__name__)
88 # String to use when a Python None is encountered
89 NULLSTR = "__NULL_STRING__"
92 class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileLikeDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
100 def __init__(self, datasets: List[FileDataset]):
101 super().__init__(ref for dataset in datasets for ref in dataset.refs)
102 self.datasets = datasets
105 @dataclass(frozen=True)
106 class DatastoreFileGetInformation:
107 """Collection of useful parameters needed to retrieve a file from
108 a Datastore.
109 """
111 location: Location
112 """The location from which to read the dataset."""
114 formatter: Formatter
115 """The `Formatter` to use to deserialize the dataset."""
117 info: StoredFileInfo
118 """Stored information about this file and its formatter."""
120 assemblerParams: Dict[str, Any]
121 """Parameters to use for post-processing the retrieved dataset."""
123 formatterParams: Dict[str, Any]
124 """Parameters that were understood by the associated formatter."""
126 component: Optional[str]
127 """The component to be retrieved (can be `None`)."""
129 readStorageClass: StorageClass
130 """The `StorageClass` of the dataset being read."""
133 class FileLikeDatastore(GenericBaseDatastore):
134 """Generic Datastore for file-based implementations.
136 Should always be sub-classed since key abstract methods are missing.
138 Parameters
139 ----------
140 config : `DatastoreConfig` or `str`
141 Configuration as either a `Config` object or URI to file.
142 bridgeManager : `DatastoreRegistryBridgeManager`
143 Object that manages the interface between `Registry` and datastores.
144 butlerRoot : `str`, optional
145 New datastore root to use to override the configuration value.
147 Raises
148 ------
149 ValueError
150 If root location does not exist and ``create`` is `False` in the
151 configuration.
152 """
154 defaultConfigFile: ClassVar[Optional[str]] = None
155 """Path to configuration defaults. Accessed within the ``config`` resource
156 or relative to a search path. Can be None if no defaults specified.
157 """
159 root: ButlerURI
160 """Root directory URI of this `Datastore`."""
162 locationFactory: LocationFactory
163 """Factory for creating locations relative to the datastore root."""
165 formatterFactory: FormatterFactory
166 """Factory for creating instances of formatters."""
168 templates: FileTemplates
169 """File templates that can be used by this `Datastore`."""
171 composites: CompositesMap
172 """Determines whether a dataset should be disassembled on put."""
174 @classmethod
175 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
176 """Set any filesystem-dependent config options for this Datastore to
177 be appropriate for a new empty repository with the given root.
179 Parameters
180 ----------
181 root : `str`
182 URI to the root of the data repository.
183 config : `Config`
184 A `Config` to update. Only the subset understood by
185 this component will be updated. Will not expand
186 defaults.
187 full : `Config`
188 A complete config with all defaults expanded that can be
189 converted to a `DatastoreConfig`. Read-only and will not be
190 modified by this method.
191 Repository-specific options that should not be obtained
192 from defaults when Butler instances are constructed
193 should be copied from ``full`` to ``config``.
194 overwrite : `bool`, optional
195 If `False`, do not modify a value in ``config`` if the value
196 already exists. Default is always to overwrite with the provided
197 ``root``.
199 Notes
200 -----
201 If a keyword is explicitly defined in the supplied ``config`` it
202 will not be overridden by this method if ``overwrite`` is `False`.
203 This allows explicit values set in external configs to be retained.
204 """
205 Config.updateParameters(DatastoreConfig, config, full,
206 toUpdate={"root": root},
207 toCopy=("cls", ("records", "table")), overwrite=overwrite)
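# Illustrative sketch (hypothetical paths; not taken from the original source):
# when a new repository is created, the call above writes the new root into the
# datastore config and copies the repo-specific "cls" and ("records", "table")
# entries from the fully-expanded defaults.
#
#     FileLikeDatastore.setConfigRoot("/repo/new", config, full)
#     # config["root"] is now "/repo/new" (unless overwrite=False and it was already set)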
209 @classmethod
210 def makeTableSpec(cls) -> ddl.TableSpec:
211 return ddl.TableSpec(
212 fields=[
213 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
214 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
215 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
216 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
217 # Use empty string to indicate no component
218 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
219 # TODO: should checksum be Base64Bytes instead?
220 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
221 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
222 ],
223 unique=frozenset(),
224 )
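# Illustrative sketch: one row of the opaque records table defined above, as
# addStoredItemInfo() constructs it further down.  All values shown here are
# hypothetical; NULLSTR stands in for "no component" so the column can be part
# of the primary key.
#
#     record = dict(dataset_id=42,
#                   path="calexp/r/calexp_v42.fits",
#                   formatter="lsst.daf.butler.formatters.file.FileFormatter",
#                   storage_class="ExposureF",
#                   component=NULLSTR,
#                   checksum=None,
#                   file_size=8640)
#     self._table.insert(record)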
226 def __init__(self, config: Union[DatastoreConfig, str],
227 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
228 super().__init__(config, bridgeManager)
229 if "root" not in self.config:  229 ↛ 230 (line 229 didn't jump to line 230, because the condition on line 229 was never true)
230 raise ValueError("No root directory specified in configuration")
232 # Name ourselves either using an explicit name or a name
233 # derived from the (unexpanded) root
234 if "name" in self.config:
235 self.name = self.config["name"]
236 else:
237 # We use the unexpanded root in the name to indicate that this
238 # datastore can be moved without having to update registry.
239 self.name = "{}@{}".format(type(self).__name__,
240 self.config["root"])
242 # Support repository relocation in config
243 # Existence of self.root is checked in subclass
244 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
245 forceDirectory=True, forceAbsolute=True)
247 self.locationFactory = LocationFactory(self.root)
248 self.formatterFactory = FormatterFactory()
250 # Now associate formatters with storage classes
251 self.formatterFactory.registerFormatters(self.config["formatters"],
252 universe=bridgeManager.universe)
254 # Read the file naming templates
255 self.templates = FileTemplates(self.config["templates"],
256 universe=bridgeManager.universe)
258 # See if composites should be disassembled
259 self.composites = CompositesMap(self.config["composites"],
260 universe=bridgeManager.universe)
262 tableName = self.config["records", "table"]
263 try:
264 # Storage of paths and formatters, keyed by dataset_id
265 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
266 # Interface to Registry.
267 self._bridge = bridgeManager.register(self.name)
268 except ReadOnlyDatabaseError:
269 # If the database is read only and we just tried and failed to
270 # create a table, it means someone is trying to create a read-only
271 # butler client for an empty repo. That should be okay, as long
272 # as they then try to get any datasets before some other client
273 # creates the table. Chances are they're just validating
274 # configuration.
275 pass
277 # Determine whether checksums should be used
278 self.useChecksum = self.config.get("checksum", True)
280 def __str__(self) -> str:
281 return str(self.root)
283 @property
284 def bridge(self) -> DatastoreRegistryBridge:
285 return self._bridge
287 def _artifact_exists(self, location: Location) -> bool:
288 """Check that an artifact exists in this datastore at the specified
289 location.
291 Parameters
292 ----------
293 location : `Location`
294 Expected location of the artifact associated with this datastore.
296 Returns
297 -------
298 exists : `bool`
299 `True` if the location can be found, `False` otherwise.
300 """
301 log.debug("Checking if resource exists: %s", location.uri)
302 return location.uri.exists()
304 def _delete_artifact(self, location: Location) -> None:
305 """Delete the artifact from the datastore.
307 Parameters
308 ----------
309 location : `Location`
310 Location of the artifact associated with this datastore.
311 """
312 log.debug("Deleting file: %s", location.uri)
313 location.uri.remove()
314 log.debug("Successfully deleted file: %s", location.uri)
316 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
317 # Docstring inherited from GenericBaseDatastore
318 records = []
319 for ref, info in zip(refs, infos):
320 # Component should come from ref and fall back on info
321 component = ref.datasetType.component()
322 if component is None and info.component is not None:  322 ↛ 323 (line 322 didn't jump to line 323, because the condition on line 322 was never true)
323 component = info.component
324 if component is None:
325 # Use empty string since we want this to be part of the
326 # primary key.
327 component = NULLSTR
328 records.append(
329 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
330 storage_class=info.storageClass.name, component=component,
331 checksum=info.checksum, file_size=info.file_size)
332 )
333 self._table.insert(*records)
335 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
336 # Docstring inherited from GenericBaseDatastore
338 # Look for the dataset_id -- there might be multiple matches
339 # if we have disassembled the dataset.
340 records = list(self._table.fetch(dataset_id=ref.id))
342 results = []
343 for record in records:
344 # Convert name of StorageClass to instance
345 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
346 component = record["component"] if (record["component"]
347 and record["component"] != NULLSTR) else None
349 info = StoredFileInfo(formatter=record["formatter"],
350 path=record["path"],
351 storageClass=storageClass,
352 component=component,
353 checksum=record["checksum"],
354 file_size=record["file_size"])
355 results.append(info)
357 return results
359 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]:
360 """Return all dataset refs associated with the supplied path.
362 Parameters
363 ----------
364 pathInStore : `str`
365 Path of interest in the data store.
367 Returns
368 -------
369 ids : `set` of `int`
370 All `DatasetRef` IDs associated with this path.
371 """
372 records = list(self._table.fetch(path=pathInStore))
373 ids = {r["dataset_id"] for r in records}
374 return ids
376 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
377 # Docstring inherited from GenericBaseDatastore
378 self._table.delete(dataset_id=ref.id)
380 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
381 r"""Find all the `Location`\ s of the requested dataset in the
382 `Datastore` and the associated stored file information.
384 Parameters
385 ----------
386 ref : `DatasetRef`
387 Reference to the required `Dataset`.
389 Returns
390 -------
391 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
392 Location of the dataset within the datastore and
393 stored information about each file and its formatter.
394 """
395 # Get the file information (this will fail if no file)
396 records = self.getStoredItemsInfo(ref)
398 # Use the path to determine the location
399 return [(self.locationFactory.fromPath(r.path), r) for r in records]
401 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
402 """Check that there is only one dataset associated with the
403 specified artifact.
405 Parameters
406 ----------
407 ref : `DatasetRef` or `FakeDatasetRef`
408 Dataset to be removed.
409 location : `Location`
410 The location of the artifact to be removed.
412 Returns
413 -------
414 can_remove : `bool`
415 True if the artifact can be safely removed.
416 """
418 # Get all entries associated with this path
419 allRefs = self._registered_refs_per_artifact(location.pathInStore)
420 if not allRefs:  420 ↛ 421 (line 420 didn't jump to line 421, because the condition on line 420 was never true)
421 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
423 # Remove these refs from all the refs and if there is nothing left
424 # then we can delete
425 remainingRefs = allRefs - {ref.id}
427 if remainingRefs:
428 return False
429 return True
431 def _prepare_for_get(self, ref: DatasetRef,
432 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
433 """Check parameters for ``get`` and obtain formatter and
434 location.
436 Parameters
437 ----------
438 ref : `DatasetRef`
439 Reference to the required Dataset.
440 parameters : `dict`
441 `StorageClass`-specific parameters that specify, for example,
442 a slice of the dataset to be loaded.
444 Returns
445 -------
446 getInfo : `list` [`DatastoreFileGetInformation`]
447 Parameters needed to retrieve each file.
448 """
449 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
451 # Get file metadata and internal metadata
452 fileLocations = self._get_dataset_locations_info(ref)
453 if not fileLocations:
454 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
456 # The storage class we want to use eventually
457 refStorageClass = ref.datasetType.storageClass
459 if len(fileLocations) > 1:
460 disassembled = True
461 else:
462 disassembled = False
464 # Is this a component request?
465 refComponent = ref.datasetType.component()
467 fileGetInfo = []
468 for location, storedFileInfo in fileLocations:
470 # The storage class used to write the file
471 writeStorageClass = storedFileInfo.storageClass
473 # If this has been disassembled we need read to match the write
474 if disassembled:
475 readStorageClass = writeStorageClass
476 else:
477 readStorageClass = refStorageClass
479 formatter = getInstanceOf(storedFileInfo.formatter,
480 FileDescriptor(location, readStorageClass=readStorageClass,
481 storageClass=writeStorageClass, parameters=parameters),
482 ref.dataId)
484 formatterParams, notFormatterParams = formatter.segregateParameters()
486 # Of the remaining parameters, extract the ones supported by
487 # this StorageClass (for components not all will be handled)
488 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
490 # The ref itself could be a component if the dataset was
491 # disassembled by butler, or we disassembled in datastore and
492 # components came from the datastore records
493 component = storedFileInfo.component if storedFileInfo.component else refComponent
495 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
496 assemblerParams, formatterParams,
497 component, readStorageClass))
499 return fileGetInfo
501 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
502 """Check the arguments for ``put`` and obtain formatter and
503 location.
505 Parameters
506 ----------
507 inMemoryDataset : `object`
508 The dataset to store.
509 ref : `DatasetRef`
510 Reference to the associated Dataset.
512 Returns
513 -------
514 location : `Location`
515 The location to write the dataset.
516 formatter : `Formatter`
517 The `Formatter` to use to write the dataset.
519 Raises
520 ------
521 TypeError
522 Supplied object and storage class are inconsistent.
523 DatasetTypeNotSupportedError
524 The associated `DatasetType` is not handled by this datastore.
525 """
526 self._validate_put_parameters(inMemoryDataset, ref)
528 # Work out output file name
529 try:
530 template = self.templates.getTemplate(ref)
531 except KeyError as e:
532 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
534 # Validate the template to protect against filenames from different
535 # dataIds returning the same and causing overwrite confusion.
536 template.validateTemplate(ref)
538 location = self.locationFactory.fromPath(template.format(ref))
540 # Get the formatter based on the storage class
541 storageClass = ref.datasetType.storageClass
542 try:
543 formatter = self.formatterFactory.getFormatter(ref,
544 FileDescriptor(location,
545 storageClass=storageClass),
546 ref.dataId)
547 except KeyError as e:
548 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
549 f"{self.name}") from e
551 # Now that we know the formatter, update the location
552 location = formatter.makeUpdatedLocation(location)
554 return location, formatter
556 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
557 # Docstring inherited from base class
558 if transfer != "auto":
559 return transfer
561 # See if the paths are within the datastore or not
562 inside = [self._pathInStore(d.path) is not None for d in datasets]
564 if all(inside):
565 transfer = None
566 elif not any(inside):  566 ↛ 570 (line 566 didn't jump to line 570, because the condition on line 566 was never false)
567 # Allow ButlerURI to use its own knowledge
568 transfer = "auto"
569 else:
570 raise ValueError("Some datasets are inside the datastore and some are outside."
571 " Please use an explicit transfer mode and not 'auto'.")
573 return transfer
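# Illustrative sketch of how "auto" resolves (paths are hypothetical): datasets
# whose paths are all inside the datastore collapse to an in-place ingest
# (transfer=None), datasets entirely outside keep "auto" so ButlerURI can pick a
# mechanism, and a mixture raises ValueError.
#
#     datastore._overrideTransferMode(
#         FileDataset(path="calexp/r/in_store.fits", refs=[ref]), transfer="auto")  # -> None
#     datastore._overrideTransferMode(
#         FileDataset(path="/staging/outside.fits", refs=[ref]), transfer="auto")   # -> "auto"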
575 def _pathInStore(self, path: str) -> Optional[str]:
576 """Return path relative to datastore root
578 Parameters
579 ----------
580 path : `str`
581 Path to dataset. Can be absolute. If relative, it is assumed to
582 be relative to the datastore root. The path within the datastore
583 is returned, or `None` if the path is outside the datastore.
585 Returns
586 -------
587 inStore : `str`
588 Path relative to datastore root. Returns `None` if the file is
589 outside the root.
590 """
591 # Relative path will always be relative to datastore
592 pathUri = ButlerURI(path, forceAbsolute=False)
593 return pathUri.relative_to(self.root)
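# Illustrative sketch (hypothetical root "/repo/main"): paths under the root map
# to their store-relative form, anything else maps to None.
#
#     datastore._pathInStore("/repo/main/calexp/a.fits")  # -> "calexp/a.fits"
#     datastore._pathInStore("/scratch/elsewhere.fits")   # -> None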
595 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
596 """Standardize the path of a to-be-ingested file.
598 Parameters
599 ----------
600 path : `str`
601 Path of a file to be ingested.
602 transfer : `str`, optional
603 How (and whether) the dataset should be added to the datastore.
604 See `ingest` for details of transfer modes.
605 This implementation is provided only so
606 `NotImplementedError` can be raised if the mode is not supported;
607 actual transfers are deferred to `_extractIngestInfo`.
609 Returns
610 -------
611 path : `str`
612 New path in what the datastore considers standard form.
614 Notes
615 -----
616 Subclasses of `FileLikeDatastore` can implement this method instead
617 of `_prepIngest`. It should not modify the data repository or given
618 file in any way.
620 Raises
621 ------
622 NotImplementedError
623 Raised if the datastore does not support the given transfer mode
624 (including the case where ingest is not supported at all).
625 FileNotFoundError
626 Raised if one of the given files does not exist.
627 """
628 if transfer not in (None,) + self.root.transferModes:  628 ↛ 629 (line 628 didn't jump to line 629, because the condition on line 628 was never true)
629 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
631 # A relative URI indicates relative to datastore root
632 srcUri = ButlerURI(path, forceAbsolute=False)
633 if not srcUri.isabs():
634 srcUri = self.root.join(path)
636 if not srcUri.exists():
637 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
638 f"are assumed to be relative to {self.root} unless they are absolute.")
640 if transfer is None:
641 relpath = srcUri.relative_to(self.root)
642 if not relpath:
643 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
644 f"within datastore ({self.root})")
646 # Return the relative path within the datastore for internal
647 # transfer
648 path = relpath
650 return path
652 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
653 formatter: Union[Formatter, Type[Formatter]],
654 transfer: Optional[str] = None) -> StoredFileInfo:
655 """Relocate (if necessary) and extract `StoredFileInfo` from a
656 to-be-ingested file.
658 Parameters
659 ----------
660 path : `str` or `ButlerURI`
661 URI or path of a file to be ingested.
662 ref : `DatasetRef`
663 Reference for the dataset being ingested. Guaranteed to have
664 ``dataset_id is not None``.
665 formatter : `type` or `Formatter`
666 `Formatter` subclass to use for this dataset or an instance.
667 transfer : `str`, optional
668 How (and whether) the dataset should be added to the datastore.
669 See `ingest` for details of transfer modes.
671 Returns
672 -------
673 info : `StoredFileInfo`
674 Internal datastore record for this file. This will be inserted by
675 the caller; `_extractIngestInfo` is only responsible for
676 creating and populating the struct.
678 Raises
679 ------
680 FileNotFoundError
681 Raised if one of the given files does not exist.
682 FileExistsError
683 Raised if transfer is not `None` but the (internal) location the
684 file would be moved to is already occupied.
685 """
686 if self._transaction is None:  686 ↛ 687 (line 686 didn't jump to line 687, because the condition on line 686 was never true)
687 raise RuntimeError("Ingest called without transaction enabled")
689 # Create URI of the source path, do not need to force a relative
690 # path to absolute.
691 srcUri = ButlerURI(path, forceAbsolute=False)
693 # Track whether we have read the size of the source yet
694 have_sized = False
696 if transfer is None:
697 # A relative path is assumed to be relative to the datastore
698 # in this context
699 if not srcUri.isabs():
700 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
701 else:
702 # Work out the path in the datastore from an absolute URI
703 # This is required to be within the datastore.
704 pathInStore = srcUri.relative_to(self.root)
705 if pathInStore is None:  705 ↛ 706 (line 705 didn't jump to line 706, because the condition on line 705 was never true)
706 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
707 f"not within datastore {self.root}")
708 tgtLocation = self.locationFactory.fromPath(pathInStore)
709 else:
710 # Work out the name we want this ingested file to have
711 # inside the datastore
712 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
714 # if we are transferring from a local file to a remote location
715 # it may be more efficient to get the size and checksum of the
716 # local file rather than the transferred one
717 if not srcUri.scheme or srcUri.scheme == "file":  717 ↛ 722 (line 717 didn't jump to line 722, because the condition on line 717 was never false)
718 size = srcUri.size()
719 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
721 # transfer the resource to the destination
722 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
724 # the file should exist in the datastore now
725 if not have_sized:  725 ↛ 729 (line 725 didn't jump to line 729, because the condition on line 725 was never false)
726 size = tgtLocation.uri.size()
727 checksum = self.computeChecksum(tgtLocation.uri) if self.useChecksum else None
729 return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
730 storageClass=ref.datasetType.storageClass,
731 component=ref.datasetType.component(),
732 file_size=size, checksum=checksum)
734 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
735 # Docstring inherited from Datastore._prepIngest.
736 filtered = []
737 for dataset in datasets:
738 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
739 if not acceptable:
740 continue
741 else:
742 dataset.refs = acceptable
743 if dataset.formatter is None:
744 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
745 else:
746 assert isinstance(dataset.formatter, (type, str))
747 dataset.formatter = getClassOf(dataset.formatter)
748 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
749 filtered.append(dataset)
750 return _IngestPrepData(filtered)
752 @transactional
753 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
754 # Docstring inherited from Datastore._finishIngest.
755 refsAndInfos = []
756 for dataset in prepData.datasets:
757 # Do ingest as if the first dataset ref is associated with the file
758 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
759 transfer=transfer)
760 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
761 self._register_datasets(refsAndInfos)
763 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
764 formatter: Union[Formatter, Type[Formatter]]) -> Location:
765 """Given a source URI and a DatasetRef, determine the name the
766 dataset will have inside datastore.
768 Parameters
769 ----------
770 srcUri : `ButlerURI`
771 URI to the source dataset file.
772 ref : `DatasetRef`
773 Ref associated with the newly-ingested dataset artifact. This
774 is used to determine the name within the datastore.
775 formatter : `Formatter` or Formatter class.
776 Formatter to use for validation. Can be a class or an instance.
778 Returns
779 -------
780 location : `Location`
781 Target location for the newly-ingested dataset.
782 """
783 # Ingesting a file from outside the datastore.
784 # This involves a new name.
785 template = self.templates.getTemplate(ref)
786 location = self.locationFactory.fromPath(template.format(ref))
788 # Get the extension
789 ext = srcUri.getExtension()
791 # Update the destination to include that extension
792 location.updateExtension(ext)
794 # Ask the formatter to validate this extension
795 formatter.validateExtension(location)
797 return location
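# Illustrative sketch (hypothetical template, file, and formatterClass): the
# directory and stem of the ingested name come from the file template for the
# ref, the extension is carried over from the source file, and the formatter
# gets to veto it.
#
#     loc = datastore._calculate_ingested_datastore_name(
#         ButlerURI("/staging/night1/raw_000123.fits"), ref, formatterClass)
#     # loc is a Location under the datastore root whose path ends in ".fits"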
799 @abstractmethod
800 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
801 """Write out in memory dataset to datastore.
803 Parameters
804 ----------
805 inMemoryDataset : `object`
806 Dataset to write to datastore.
807 ref : `DatasetRef`
808 Registry information associated with this dataset.
810 Returns
811 -------
812 info : `StoredFileInfo`
813 Information describing the artifact written to the datastore.
814 """
815 raise NotImplementedError()
817 @abstractmethod
818 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
819 ref: DatasetRef, isComponent: bool = False) -> Any:
820 """Read the artifact from datastore into in memory object.
822 Parameters
823 ----------
824 getInfo : `DatastoreFileGetInformation`
825 Information about the artifact within the datastore.
826 ref : `DatasetRef`
827 The registry information associated with this artifact.
828 isComponent : `bool`
829 Flag to indicate if a component is being read from this artifact.
831 Returns
832 -------
833 inMemoryDataset : `object`
834 The artifact as a python object.
835 """
836 raise NotImplementedError()
838 def exists(self, ref: DatasetRef) -> bool:
839 """Check if the dataset exists in the datastore.
841 Parameters
842 ----------
843 ref : `DatasetRef`
844 Reference to the required dataset.
846 Returns
847 -------
848 exists : `bool`
849 `True` if the entity exists in the `Datastore`.
850 """
851 fileLocations = self._get_dataset_locations_info(ref)
852 if not fileLocations:
853 return False
854 for location, _ in fileLocations:
855 if not self._artifact_exists(location):
856 return False
858 return True
860 def getURIs(self, ref: DatasetRef,
861 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
862 """Return URIs associated with dataset.
864 Parameters
865 ----------
866 ref : `DatasetRef`
867 Reference to the required dataset.
868 predict : `bool`, optional
869 If the datastore does not know about the dataset, should it
870 return a predicted URI or not?
872 Returns
873 -------
874 primary : `ButlerURI`
875 The URI to the primary artifact associated with this dataset.
876 If the dataset was disassembled within the datastore this
877 may be `None`.
878 components : `dict`
879 URIs to any components associated with the dataset artifact.
880 Can be empty if there are no components.
881 """
883 primary: Optional[ButlerURI] = None
884 components: Dict[str, ButlerURI] = {}
886 # if this has never been written then we have to guess
887 if not self.exists(ref):
888 if not predict:
889 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
891 def predictLocation(thisRef: DatasetRef) -> Location:
892 template = self.templates.getTemplate(thisRef)
893 location = self.locationFactory.fromPath(template.format(thisRef))
894 storageClass = ref.datasetType.storageClass
895 formatter = self.formatterFactory.getFormatter(thisRef,
896 FileDescriptor(location,
897 storageClass=storageClass))
898 # Try to use the extension attribute but ignore problems if the
899 # formatter does not define one.
900 try:
901 location = formatter.makeUpdatedLocation(location)
902 except Exception:
903 # Use the default extension
904 pass
905 return location
907 doDisassembly = self.composites.shouldBeDisassembled(ref)
909 if doDisassembly:
911 for component, componentStorage in ref.datasetType.storageClass.components.items():
912 compRef = ref.makeComponentRef(component)
913 compLocation = predictLocation(compRef)
915 # Add a URI fragment to indicate this is a guess
916 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
918 else:
920 location = predictLocation(ref)
922 # Add a URI fragment to indicate this is a guess
923 primary = ButlerURI(location.uri.geturl() + "#predicted")
925 return primary, components
927 # If this is a ref that we have written we can get the path.
928 # Get file metadata and internal metadata
929 fileLocations = self._get_dataset_locations_info(ref)
931 if not fileLocations:  931 ↛ 932 (line 931 didn't jump to line 932, because the condition on line 931 was never true)
932 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
934 if len(fileLocations) == 1:
935 # No disassembly so this is the primary URI
936 primary = ButlerURI(fileLocations[0][0].uri)
938 else:
939 for location, storedFileInfo in fileLocations:
940 if storedFileInfo.component is None:  940 ↛ 941 (line 940 didn't jump to line 941, because the condition on line 940 was never true)
941 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
942 components[storedFileInfo.component] = ButlerURI(location.uri)
944 return primary, components
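# Illustrative sketch of the two return shapes (URIs and component names are
# hypothetical): a dataset stored as a single file yields a primary URI and an
# empty dict, a disassembled dataset yields None plus one URI per component, and
# with predict=True unknown datasets get a "#predicted" fragment on each guess.
#
#     primary, components = datastore.getURIs(ref)
#     # single file:   (ButlerURI("file:///repo/main/calexp/a.fits"), {})
#     # disassembled:  (None, {"image": ButlerURI(".../a_image.fits"), "mask": ...})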
946 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
947 """URI to the Dataset.
949 Parameters
950 ----------
951 ref : `DatasetRef`
952 Reference to the required Dataset.
953 predict : `bool`
954 If `True`, allow URIs to be returned of datasets that have not
955 been written.
957 Returns
958 -------
959 uri : `str`
960 URI pointing to the dataset within the datastore. If the
961 dataset does not exist in the datastore, and if ``predict`` is
962 `True`, the URI will be a prediction and will include a URI
963 fragment "#predicted".
964 If the datastore does not have entities that relate well
965 to the concept of a URI the returned URI will be
966 descriptive. The returned URI is not guaranteed to be obtainable.
968 Raises
969 ------
970 FileNotFoundError
971 Raised if a URI has been requested for a dataset that does not
972 exist and guessing is not allowed.
973 RuntimeError
974 Raised if a request is made for a single URI but multiple URIs
975 are associated with this dataset.
977 Notes
978 -----
979 When a predicted URI is requested an attempt will be made to form
980 a reasonable URI based on file templates and the expected formatter.
981 """
982 primary, components = self.getURIs(ref, predict)
983 if primary is None or components:  983 ↛ 984 (line 983 didn't jump to line 984, because the condition on line 983 was never true)
984 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
985 "Use Dataastore.getURIs() instead.")
986 return primary
988 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
989 """Load an InMemoryDataset from the store.
991 Parameters
992 ----------
993 ref : `DatasetRef`
994 Reference to the required Dataset.
995 parameters : `dict`
996 `StorageClass`-specific parameters that specify, for example,
997 a slice of the dataset to be loaded.
999 Returns
1000 -------
1001 inMemoryDataset : `object`
1002 Requested dataset or slice thereof as an InMemoryDataset.
1004 Raises
1005 ------
1006 FileNotFoundError
1007 Requested dataset can not be retrieved.
1008 TypeError
1009 Return value from formatter has unexpected type.
1010 ValueError
1011 Formatter failed to process the dataset.
1012 """
1013 allGetInfo = self._prepare_for_get(ref, parameters)
1014 refComponent = ref.datasetType.component()
1016 # Supplied storage class for the component being read
1017 refStorageClass = ref.datasetType.storageClass
1019 # Create mapping from component name to related info
1020 allComponents = {i.component: i for i in allGetInfo}
1022 # By definition the dataset is disassembled if we have more
1023 # than one record for it.
1024 isDisassembled = len(allGetInfo) > 1
1026 # Look for the special case where we are disassembled but the
1027 # component is a read-only component that was not written during
1028 # disassembly. For this scenario we need to check that the
1029 # component requested is listed as a read-only component for the
1030 # composite storage class
1031 isDisassembledReadOnlyComponent = False
1032 if isDisassembled and refComponent:
1033 # The composite storage class should be accessible through
1034 # the component dataset type
1035 compositeStorageClass = ref.datasetType.parentStorageClass
1037 # In the unlikely scenario where the composite storage
1038 # class is not known, we can only assume that this is a
1039 # normal component. If that assumption is wrong then the
1040 # branch below that reads a persisted component will fail
1041 # so there is no need to complain here.
1042 if compositeStorageClass is not None:  1042 ↛ 1045 (line 1042 didn't jump to line 1045, because the condition on line 1042 was never false)
1043 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.readComponents
1045 if isDisassembled and not refComponent:
1046 # This was a disassembled dataset spread over multiple files
1047 # and we need to put them all back together again.
1048 # Read into memory and then assemble
1050 # Check that the supplied parameters are suitable for the type read
1051 refStorageClass.validateParameters(parameters)
1053 # We want to keep track of all the parameters that were not used
1054 # by formatters. We assume that if any of the component formatters
1055 # use a parameter then we do not need to apply it again in the
1056 # assembler.
1057 usedParams = set()
1059 components: Dict[str, Any] = {}
1060 for getInfo in allGetInfo:
1061 # assemblerParams are parameters not understood by the
1062 # associated formatter.
1063 usedParams.update(set(getInfo.formatterParams))
1065 component = getInfo.component
1067 if component is None:  1067 ↛ 1068 (line 1067 didn't jump to line 1068, because the condition on line 1067 was never true)
1068 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1070 # We do not want the formatter to think it's reading
1071 # a component though because it is really reading a
1072 # standalone dataset -- always tell reader it is not a
1073 # component.
1074 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1076 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
1078 # Any unused parameters will have to be passed to the assembler
1079 if parameters:
1080 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1081 else:
1082 unusedParams = {}
1084 # Process parameters
1085 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
1086 parameters=unusedParams)
1088 elif isDisassembledReadOnlyComponent:
1090 compositeStorageClass = ref.datasetType.parentStorageClass
1091 if compositeStorageClass is None:  1091 ↛ 1092 (line 1091 didn't jump to line 1092, because the condition on line 1091 was never true)
1092 raise RuntimeError(f"Unable to retrieve read-only component '{refComponent}' since"
1093 "no composite storage class is available.")
1095 if refComponent is None:  1095 ↛ 1097 (line 1095 didn't jump to line 1097, because the condition on line 1095 was never true)
1096 # Mainly for mypy
1097 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1099 # Assume that every read-only component can be calculated by
1100 # forwarding the request to a single read/write component.
1101 # Rather than guessing which rw component is the right one by
1102 # scanning each for a read-only component of the same name,
1103 # we ask the composite assembler directly which one is best to
1104 # use.
1105 compositeAssembler = compositeStorageClass.assembler()
1106 forwardedComponent = compositeAssembler.selectResponsibleComponent(refComponent,
1107 set(allComponents))
1109 # Select the relevant component
1110 rwInfo = allComponents[forwardedComponent]
1112 # For now assume that read parameters are validated against
1113 # the real component and not the requested component
1114 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1115 forwardedStorageClass.validateParameters(parameters)
1117 # Unfortunately the FileDescriptor inside the formatter will have
1118 # the wrong write storage class so we need to create a new one
1119 # given the immutability constraint.
1120 writeStorageClass = rwInfo.info.storageClass
1122 # We may need to put some thought into parameters for read
1123 # components but for now forward them on as is
1124 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1125 readStorageClass=refStorageClass,
1126 storageClass=writeStorageClass,
1127 parameters=parameters),
1128 ref.dataId)
1130 # The assembler can not receive any parameter requests for a
1131 # read-only component at this time since the assembler will
1132 # see the storage class of the read-only component and those
1133 # parameters will have to be handled by the formatter on the
1134 # forwarded storage class.
1135 assemblerParams: Dict[str, Any] = {}
1137 # Need to create a new info that specifies the read-only
1138 # component and associated storage class
1139 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1140 rwInfo.info, assemblerParams, {},
1141 refComponent, refStorageClass)
1143 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1145 else:
1146 # Single file request or component from that composite file
1147 for lookup in (refComponent, None):  1147 ↛ 1152 (line 1147 didn't jump to line 1152, because the loop on line 1147 didn't complete)
1148 if lookup in allComponents:  1148 ↛ 1147 (line 1148 didn't jump to line 1147, because the condition on line 1148 was never false)
1149 getInfo = allComponents[lookup]
1150 break
1151 else:
1152 raise FileNotFoundError(f"Component {refComponent} not found "
1153 f"for ref {ref} in datastore {self.name}")
1155 # Do not need the component itself if already disassembled
1156 if isDisassembled:
1157 isComponent = False
1158 else:
1159 isComponent = getInfo.component is not None
1161 # For a disassembled component we can validate parameters against
1162 # the component storage class directly
1163 if isDisassembled:
1164 refStorageClass.validateParameters(parameters)
1165 else:
1166 # For an assembled composite this could be a read-only
1167 # component derived from a real component. The validity
1168 # of the parameters is not clear. For now validate against
1169 # the composite storage class
1170 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1172 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
1174 @transactional
1175 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1176 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1178 Parameters
1179 ----------
1180 inMemoryDataset : `object`
1181 The dataset to store.
1182 ref : `DatasetRef`
1183 Reference to the associated Dataset.
1185 Raises
1186 ------
1187 TypeError
1188 Supplied object and storage class are inconsistent.
1189 DatasetTypeNotSupportedError
1190 The associated `DatasetType` is not handled by this datastore.
1192 Notes
1193 -----
1194 If the datastore is configured to reject certain dataset types it
1195 is possible that the put will fail and raise a
1196 `DatasetTypeNotSupportedError`. The main use case for this is to
1197 allow `ChainedDatastore` to put to multiple datastores without
1198 requiring that every datastore accepts the dataset.
1199 """
1201 doDisassembly = self.composites.shouldBeDisassembled(ref)
1202 # doDisassembly = True
1204 artifacts = []
1205 if doDisassembly:
1206 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
1207 for component, componentInfo in components.items():
1208 # Don't recurse because we want to take advantage of
1209 # bulk insert -- need a new DatasetRef that refers to the
1210 # same dataset_id but has the component DatasetType.
1211 # DatasetType does not refer to the types of components,
1212 # so we construct one ourselves.
1213 compRef = ref.makeComponentRef(component)
1214 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1215 artifacts.append((compRef, storedInfo))
1216 else:
1217 # Write the entire thing out
1218 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1219 artifacts.append((ref, storedInfo))
1221 self._register_datasets(artifacts)
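# Illustrative sketch: when the composites map says the storage class should be
# disassembled, a single put() fans out into one artifact per component, all
# sharing the parent dataset_id (component names here are hypothetical).
#
#     datastore.put(exposure, ref)
#     # -> _write_in_memory_to_artifact() runs once per component, e.g. for
#     #    ref.makeComponentRef("image") and ref.makeComponentRef("mask"), and the
#     #    resulting (ref, StoredFileInfo) pairs are registered in a single bulk call.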
1223 @transactional
1224 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1225 """Indicate to the datastore that a dataset can be removed.
1227 Parameters
1228 ----------
1229 ref : `DatasetRef`
1230 Reference to the required Dataset.
1231 ignore_errors : `bool`
1232 If `True` return without error even if something went wrong.
1233 Problems could occur if another process is simultaneously trying
1234 to delete.
1236 Raises
1237 ------
1238 FileNotFoundError
1239 Attempt to remove a dataset that does not exist.
1240 """
1241 # Get file metadata and internal metadata
1242 log.debug("Trashing %s in datastore %s", ref, self.name)
1244 fileLocations = self._get_dataset_locations_info(ref)
1246 if not fileLocations:
1247 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1248 if ignore_errors:
1249 log.warning(err_msg)
1250 return
1251 else:
1252 raise FileNotFoundError(err_msg)
1254 for location, storedFileInfo in fileLocations:
1255 if not self._artifact_exists(location):  1255 ↛ 1256 (line 1255 didn't jump to line 1256, because the condition on line 1255 was never true)
1256 err_msg = f"Dataset is known to datastore {self.name} but " \
1257 f"associated artifact ({location.uri}) is missing"
1258 if ignore_errors:
1259 log.warning(err_msg)
1260 return
1261 else:
1262 raise FileNotFoundError(err_msg)
1264 # Mark dataset as trashed
1265 try:
1266 self._move_to_trash_in_registry(ref)
1267 except Exception as e:
1268 if ignore_errors:
1269 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1270 f"but encountered an error: {e}")
1271 pass
1272 else:
1273 raise
1275 @transactional
1276 def emptyTrash(self, ignore_errors: bool = True) -> None:
1277 """Remove all datasets from the trash.
1279 Parameters
1280 ----------
1281 ignore_errors : `bool`
1282 If `True` return without error even if something went wrong.
1283 Problems could occur if another process is simultaneously trying
1284 to delete.
1285 """
1286 log.debug("Emptying trash in datastore %s", self.name)
1287 # Context manager will empty trash iff we finish it without raising.
1288 with self.bridge.emptyTrash() as trashed:
1289 for ref in trashed:
1290 fileLocations = self._get_dataset_locations_info(ref)
1292 if not fileLocations:  1292 ↛ 1293 (line 1292 didn't jump to line 1293, because the condition on line 1292 was never true)
1293 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1294 if ignore_errors:
1295 log.warning(err_msg)
1296 continue
1297 else:
1298 raise FileNotFoundError(err_msg)
1300 for location, _ in fileLocations:
1302 if not self._artifact_exists(location):  1302 ↛ 1303 (line 1302 didn't jump to line 1303, because the condition on line 1302 was never true)
1303 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1304 if ignore_errors:
1305 log.warning(err_msg)
1306 continue
1307 else:
1308 raise FileNotFoundError(err_msg)
1310 # Can only delete the artifact if there are no references
1311 # to the file from untrashed dataset refs.
1312 if self._can_remove_dataset_artifact(ref, location):
1313 # Point of no return for this artifact
1314 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1315 try:
1316 self._delete_artifact(location)
1317 except Exception as e:
1318 if ignore_errors:
1319 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1320 location.uri, self.name, e)
1321 else:
1322 raise
1324 # Now must remove the entry from the internal registry even if
1325 # the artifact removal failed and was ignored,
1326 # otherwise the removal check above will never be true
1327 try:
1328 # There may be multiple rows associated with this ref
1329 # depending on disassembly
1330 self.removeStoredItemInfo(ref)
1331 except Exception as e:
1332 if ignore_errors:
1333 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1334 ref.id, location.uri, self.name, e)
1335 continue
1336 else:
1337 raise
1339 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1340 logFailures: bool = False) -> None:
1341 """Validate some of the configuration for this datastore.
1343 Parameters
1344 ----------
1345 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1346 Entities to test against this configuration. Can be differing
1347 types.
1348 logFailures : `bool`, optional
1349 If `True`, output a log message for every validation error
1350 detected.
1352 Raises
1353 ------
1354 DatastoreValidationError
1355 Raised if there is a validation problem with a configuration.
1356 All the problems are reported in a single exception.
1358 Notes
1359 -----
1360 This method checks that all the supplied entities have valid file
1361 templates and also have formatters defined.
1362 """
1364 templateFailed = None
1365 try:
1366 self.templates.validateTemplates(entities, logFailures=logFailures)
1367 except FileTemplateValidationError as e:
1368 templateFailed = str(e)
1370 formatterFailed = []
1371 for entity in entities:
1372 try:
1373 self.formatterFactory.getFormatterClass(entity)
1374 except KeyError as e:
1375 formatterFailed.append(str(e))
1376 if logFailures:  1376 ↛ 1371 (line 1376 didn't jump to line 1371, because the condition on line 1376 was never false)
1377 log.fatal("Formatter failure: %s", e)
1379 if templateFailed or formatterFailed:
1380 messages = []
1381 if templateFailed:  1381 ↛ 1382 (line 1381 didn't jump to line 1382, because the condition on line 1381 was never true)
1382 messages.append(templateFailed)
1383 if formatterFailed:  1383 ↛ 1385 (line 1383 didn't jump to line 1385, because the condition on line 1383 was never false)
1384 messages.append(",".join(formatterFailed))
1385 msg = ";\n".join(messages)
1386 raise DatastoreValidationError(msg)
1388 def getLookupKeys(self) -> Set[LookupKey]:
1389 # Docstring is inherited from base class
1390 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1391 self.constraints.getLookupKeys()
1393 def validateKey(self, lookupKey: LookupKey,
1394 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1395 # Docstring is inherited from base class
1396 # The key can be valid in either formatters or templates so we can
1397 # only check the template if it exists
1398 if lookupKey in self.templates:
1399 try:
1400 self.templates[lookupKey].validateTemplate(entity)
1401 except FileTemplateValidationError as e:
1402 raise DatastoreValidationError(e) from e
1404 def export(self, refs: Iterable[DatasetRef], *,
1405 directory: Optional[Union[ButlerURI, str]] = None,
1406 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1407 # Docstring inherited from Datastore.export.
1408 if transfer is not None and directory is None:  1408 ↛ 1409 (line 1408 didn't jump to line 1409, because the condition on line 1408 was never true)
1409 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1410 "export directory given")
1412 # Force the directory to be a URI object
1413 directoryUri: Optional[ButlerURI] = None
1414 if directory is not None:  1414 ↛ 1417 (line 1414 didn't jump to line 1417, because the condition on line 1414 was never false)
1415 directoryUri = ButlerURI(directory, forceDirectory=True)
1417 if transfer is not None and directoryUri is not None:  1417 ↛ 1422 (line 1417 didn't jump to line 1422, because the condition on line 1417 was never false)
1418 # mypy needs the second test
1419 if not directoryUri.exists():  1419 ↛ 1420 (line 1419 didn't jump to line 1420, because the condition on line 1419 was never true)
1420 raise FileNotFoundError(f"Export location {directory} does not exist")
1422 for ref in refs:
1423 fileLocations = self._get_dataset_locations_info(ref)
1424 if not fileLocations:  1424 ↛ 1425 (line 1424 didn't jump to line 1425, because the condition on line 1424 was never true)
1425 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1426 # For now we can not export disassembled datasets
1427 if len(fileLocations) > 1:
1428 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
1429 location, storedFileInfo = fileLocations[0]
1430 if transfer is None:  1430 ↛ 1433 (line 1430 didn't jump to line 1433, because the condition on line 1430 was never true)
1431 # TODO: do we also need to return the readStorageClass somehow?
1432 # We will use the path in store directly
1433 pass
1434 else:
1435 # mypy needs help
1436 assert directoryUri is not None, "directoryUri must be defined to get here"
1437 storeUri = ButlerURI(location.uri)
1438 exportUri = directoryUri.join(location.pathInStore)
1439 exportUri.transfer_from(storeUri, transfer=transfer)
1441 yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
1443 @staticmethod
1444 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1445 """Compute the checksum of the supplied file.
1447 Parameters
1448 ----------
1449 uri : `ButlerURI`
1450 Name of resource to calculate checksum from.
1451 algorithm : `str`, optional
1452 Name of algorithm to use. Must be one of the algorithms supported
1453 by :py:mod:`hashlib`.
1454 block_size : `int`
1455 Number of bytes to read from file at one time.
1457 Returns
1458 -------
1459 hexdigest : `str`
1460 Hex digest of the file.
1462 Notes
1463 -----
1464 Currently returns None if the URI is for a remote resource.
1465 """
1466 if algorithm not in hashlib.algorithms_guaranteed:  1466 ↛ 1467 (line 1466 didn't jump to line 1467, because the condition on line 1466 was never true)
1467 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1469 if uri.scheme and uri.scheme != "file":
1470 return None
1472 hasher = hashlib.new(algorithm)
1474 filename, is_temp = uri.as_local()
1476 with open(filename, "rb") as f:
1477 for chunk in iter(lambda: f.read(block_size), b""):
1478 hasher.update(chunk)
1480 if is_temp:  1480 ↛ 1481 (line 1480 didn't jump to line 1481, because the condition on line 1480 was never true)
1481 os.remove(filename)
1483 return hasher.hexdigest()
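# Illustrative sketch: checksumming a local artifact with the defaults (the path
# is hypothetical; URIs with a non-file scheme currently return None).
#
#     uri = ButlerURI("/repo/main/calexp/a.fits")
#     digest = FileLikeDatastore.computeChecksum(uri, algorithm="blake2b", block_size=8192)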