Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 83%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileLikeDatastore", )
27import hashlib
28import logging
29import os
30from abc import abstractmethod
32from sqlalchemy import BigInteger, String
34from dataclasses import dataclass
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)

from lsst.daf.butler import (
    ButlerURI,
    CompositesMap,
    Config,
    FileDataset,
    DatasetRef,
    DatasetType,
    DatasetTypeNotSupportedError,
    Datastore,
    DatastoreConfig,
    DatastoreValidationError,
    FileDescriptor,
    FileTemplates,
    FileTemplateValidationError,
    Formatter,
    FormatterFactory,
    Location,
    LocationFactory,
    StorageClass,
    StoredFileInfo,
)

from lsst.daf.butler import ddl
from lsst.daf.butler.registry.interfaces import (
    ReadOnlyDatabaseError,
    DatastoreRegistryBridge,
)

from lsst.daf.butler.core.repoRelocation import replaceRoot
from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
from .genericDatastore import GenericBaseDatastore

if TYPE_CHECKING:
    from lsst.daf.butler import LookupKey
    from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)

# String to use when a Python None is encountered
NULLSTR = "__NULL_STRING__"


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for FileLikeDatastore ingest implementation.

    Parameters
    ----------
    datasets : `list` of `FileDataset`
        Files to be ingested by this datastore.
    """
    def __init__(self, datasets: List[FileDataset]):
        super().__init__(ref for dataset in datasets for ref in dataset.refs)
        self.datasets = datasets


@dataclass(frozen=True)
class DatastoreFileGetInformation:
    """Collection of useful parameters needed to retrieve a file from
    a Datastore.
    """

    location: Location
    """The location from which to read the dataset."""

    formatter: Formatter
    """The `Formatter` to use to deserialize the dataset."""

    info: StoredFileInfo
    """Stored information about this file and its formatter."""

    assemblerParams: Dict[str, Any]
    """Parameters to use for post-processing the retrieved dataset."""

    formatterParams: Dict[str, Any]
    """Parameters that were understood by the associated formatter."""

    component: Optional[str]
    """The component to be retrieved (can be `None`)."""

    readStorageClass: StorageClass
    """The `StorageClass` of the dataset being read."""


class FileLikeDatastore(GenericBaseDatastore):
    """Generic Datastore for file-based implementations.

    Should always be sub-classed since key abstract methods are missing.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration as either a `Config` object or URI to file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.
    """

    defaultConfigFile: ClassVar[Optional[str]] = None
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    root: ButlerURI
    """Root directory URI of this `Datastore`."""

    locationFactory: LocationFactory
    """Factory for creating locations relative to the datastore root."""

    formatterFactory: FormatterFactory
    """Factory for creating instances of formatters."""

    templates: FileTemplates
    """File templates that can be used by this `Datastore`."""

    composites: CompositesMap
    """Determines whether a dataset should be disassembled on put."""

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            URI to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
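
        Examples
        --------
        A sketch of how repository-creation code might call this method;
        the ``config`` and ``full`` objects here are assumed to be
        pre-loaded `Config` instances and the root path is illustrative:

        >>> FileLikeDatastore.setConfigRoot("/path/to/repo", config, full)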
204 """
205 Config.updateParameters(DatastoreConfig, config, full,
206 toUpdate={"root": root},
207 toCopy=("cls", ("records", "table")), overwrite=overwrite)

    @classmethod
    def makeTableSpec(cls) -> ddl.TableSpec:
        return ddl.TableSpec(
            fields=[
                ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
                ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
                ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
                ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
                # Use empty string to indicate no component
                ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
                # TODO: should checksum be Base64Bytes instead?
                ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
                ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
            ],
            unique=frozenset(),
        )

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager)
        if "root" not in self.config:
            raise ValueError("No root directory specified in configuration")

        # Name ourselves either using an explicit name or a name
        # derived from the (unexpanded) root
        if "name" in self.config:
            self.name = self.config["name"]
        else:
            # We use the unexpanded root in the name to indicate that this
            # datastore can be moved without having to update registry.
            self.name = "{}@{}".format(type(self).__name__,
                                       self.config["root"])

        # Support repository relocation in config
        # Existence of self.root is checked in subclass
        self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
                              forceDirectory=True, forceAbsolute=True)

        self.locationFactory = LocationFactory(self.root)
        self.formatterFactory = FormatterFactory()

        # Now associate formatters with storage classes
        self.formatterFactory.registerFormatters(self.config["formatters"],
                                                 universe=bridgeManager.universe)

        # Read the file naming templates
        self.templates = FileTemplates(self.config["templates"],
                                       universe=bridgeManager.universe)

        # See if composites should be disassembled
        self.composites = CompositesMap(self.config["composites"],
                                        universe=bridgeManager.universe)

        tableName = self.config["records", "table"]
        try:
            # Storage of paths and formatters, keyed by dataset_id
            self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
            # Interface to Registry.
            self._bridge = bridgeManager.register(self.name)
        except ReadOnlyDatabaseError:
            # If the database is read only and we just tried and failed to
            # create a table, it means someone is trying to create a read-only
            # butler client for an empty repo. That should be okay, as long
            # as they then try to get any datasets before some other client
            # creates the table. Chances are they're just validating
            # configuration.
            pass

        # Determine whether checksums should be used
        self.useChecksum = self.config.get("checksum", True)

    def __str__(self) -> str:
        return str(self.root)

    @property
    def bridge(self) -> DatastoreRegistryBridge:
        return self._bridge

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        log.debug("Checking if resource exists: %s", location.uri)
        return location.uri.exists()

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        log.debug("Deleting file: %s", location.uri)
        location.uri.remove()
        log.debug("Successfully deleted file: %s", location.uri)

    def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
        # Docstring inherited from GenericBaseDatastore
        records = []
        for ref, info in zip(refs, infos):
            # Component should come from ref and fall back on info
            component = ref.datasetType.component()
            if component is None and info.component is not None:
                component = info.component
            if component is None:
                # Use empty string since we want this to be part of the
                # primary key.
                component = NULLSTR
            records.append(
                dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
                     storage_class=info.storageClass.name, component=component,
                     checksum=info.checksum, file_size=info.file_size)
            )
        self._table.insert(*records)

    def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
        # Docstring inherited from GenericBaseDatastore

        # Look for the dataset_id -- there might be multiple matches
        # if we have disassembled the dataset.
        records = list(self._table.fetch(dataset_id=ref.id))

        results = []
        for record in records:
            # Convert name of StorageClass to instance
            storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
            component = record["component"] if (record["component"]
                                                and record["component"] != NULLSTR) else None

            info = StoredFileInfo(formatter=record["formatter"],
                                  path=record["path"],
                                  storageClass=storageClass,
                                  component=component,
                                  checksum=record["checksum"],
                                  file_size=record["file_size"])
            results.append(info)

        return results

    def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]:
        """Return all dataset refs associated with the supplied path.

        Parameters
        ----------
        pathInStore : `str`
            Path of interest in the data store.

        Returns
        -------
        ids : `set` of `int`
            All `DatasetRef` IDs associated with this path.
        """
        records = list(self._table.fetch(path=pathInStore))
        ids = {r["dataset_id"] for r in records}
        return ids

    def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
        # Docstring inherited from GenericBaseDatastore
        self._table.delete(dataset_id=ref.id)

    def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
        r"""Find all the `Location`\ s of the requested dataset in the
        `Datastore` and the associated stored file information.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required `Dataset`.

        Returns
        -------
        results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
            Location of the dataset within the datastore and
            stored information about each file and its formatter.
        """
        # Get the file information (this will fail if no file)
        records = self.getStoredItemsInfo(ref)

        # Use the path to determine the location
        return [(self.locationFactory.fromPath(r.path), r) for r in records]

    def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
        """Check that there is only one dataset associated with the
        specified artifact.

        Parameters
        ----------
        ref : `DatasetRef` or `FakeDatasetRef`
            Dataset to be removed.
        location : `Location`
            The location of the artifact to be removed.

        Returns
        -------
        can_remove : `bool`
            `True` if the artifact can be safely removed.
        """
        # Get all entries associated with this path
        allRefs = self._registered_refs_per_artifact(location.pathInStore)
        if not allRefs:
            raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")

        # Remove this ref from all the refs; if nothing is left
        # then we can delete the artifact.
        remainingRefs = allRefs - {ref.id}

        if remainingRefs:
            return False
        return True

    def _prepare_for_get(self, ref: DatasetRef,
                         parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
        """Check parameters for ``get`` and obtain formatter and
        location.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        getInfo : `list` [`DatastoreFileGetInformation`]
            Parameters needed to retrieve each file.
        """
        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        # Get file metadata and internal metadata
        fileLocations = self._get_dataset_locations_info(ref)
        if not fileLocations:
            raise FileNotFoundError(f"Could not retrieve dataset {ref}.")

        # The storage class we want to use eventually
        refStorageClass = ref.datasetType.storageClass

        # The dataset was disassembled if more than one file is involved
        disassembled = len(fileLocations) > 1

        # Is this a component request?
        refComponent = ref.datasetType.component()

        fileGetInfo = []
        for location, storedFileInfo in fileLocations:

            # The storage class used to write the file
            writeStorageClass = storedFileInfo.storageClass

            # If this has been disassembled we need read to match the write
            if disassembled:
                readStorageClass = writeStorageClass
            else:
                readStorageClass = refStorageClass

            formatter = getInstanceOf(storedFileInfo.formatter,
                                      FileDescriptor(location, readStorageClass=readStorageClass,
                                                     storageClass=writeStorageClass, parameters=parameters),
                                      ref.dataId)

            formatterParams, notFormatterParams = formatter.segregateParameters()

            # Of the remaining parameters, extract the ones supported by
            # this StorageClass (for components not all will be handled)
            assemblerParams = readStorageClass.filterParameters(notFormatterParams)

            # The ref itself could be a component if the dataset was
            # disassembled by butler, or we disassembled in datastore and
            # components came from the datastore records
            component = storedFileInfo.component if storedFileInfo.component else refComponent

            fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
                                                           assemblerParams, formatterParams,
                                                           component, readStorageClass))

        return fileGetInfo

    def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
        """Check the arguments for ``put`` and obtain formatter and
        location.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Returns
        -------
        location : `Location`
            The location to write the dataset.
        formatter : `Formatter`
            The `Formatter` to use to write the dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.
        """
        self._validate_put_parameters(inMemoryDataset, ref)

        # Work out output file name
        try:
            template = self.templates.getTemplate(ref)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e

        # Validate the template to protect against filenames from different
        # dataIds returning the same and causing overwrite confusion.
        template.validateTemplate(ref)

        location = self.locationFactory.fromPath(template.format(ref))

        # Get the formatter based on the storage class
        storageClass = ref.datasetType.storageClass
        try:
            formatter = self.formatterFactory.getFormatter(ref,
                                                           FileDescriptor(location,
                                                                          storageClass=storageClass),
                                                           ref.dataId)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
                                               f"{self.name}") from e

        # Now that we know the formatter, update the location
        location = formatter.makeUpdatedLocation(location)

        return location, formatter

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            # Allow ButlerURI to use its own knowledge
            transfer = "auto"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to dataset. Can be absolute. If relative, it is assumed
            to be relative to the datastore root.

        Returns
        -------
        inStore : `str` or `None`
            Path relative to datastore root. Returns `None` if the file is
            outside the root.
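
        Examples
        --------
        A sketch of the expected behaviour for a datastore rooted at
        ``/repo`` (the paths are illustrative only):

        >>> datastore._pathInStore("/repo/a/b.fits")
        'a/b.fits'
        >>> datastore._pathInStore("/elsewhere/b.fits") is None
        True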
590 """
591 # Relative path will always be relative to datastore
592 pathUri = ButlerURI(path, forceAbsolute=False)
593 return pathUri.relative_to(self.root)

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        """Standardize the path of a to-be-ingested file.

        Parameters
        ----------
        path : `str`
            Path of a file to be ingested.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.
            This implementation is provided only so
            `NotImplementedError` can be raised if the mode is not supported;
            actual transfers are deferred to `_extractIngestInfo`.

        Returns
        -------
        path : `str`
            New path in what the datastore considers standard form.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.

        Notes
        -----
        Subclasses of `FileLikeDatastore` can implement this method instead
        of `_prepIngest`. It should not modify the data repository or given
        file in any way.
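
        Examples
        --------
        A sketch of the intended behaviour, assuming a concrete subclass
        instance ``datastore`` rooted at ``/repo`` (paths illustrative):

        >>> datastore._standardizeIngestPath("/repo/a/b.fits", transfer=None)
        'a/b.fits'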
627 """
628 if transfer not in (None,) + self.root.transferModes: 628 ↛ 629line 628 didn't jump to line 629, because the condition on line 628 was never true
629 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
631 # A relative URI indicates relative to datastore root
632 srcUri = ButlerURI(path, forceAbsolute=False)
633 if not srcUri.isabs():
634 srcUri = self.root.join(path)
636 if not srcUri.exists():
637 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
638 f"are assumed to be relative to {self.root} unless they are absolute.")
640 if transfer is None:
641 relpath = srcUri.relative_to(self.root)
642 if not relpath:
643 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
644 f"within datastore ({self.root})")
646 # Return the relative path within the datastore for internal
647 # transfer
648 path = relpath
650 return path

    def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        """Relocate (if necessary) and extract `StoredFileInfo` from a
        to-be-ingested file.

        Parameters
        ----------
        path : `str` or `ButlerURI`
            URI or path of a file to be ingested.
        ref : `DatasetRef`
            Reference for the dataset being ingested. Guaranteed to have
            ``dataset_id is not None``.
        formatter : `type` or `Formatter`
            `Formatter` subclass to use for this dataset or an instance.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        info : `StoredFileInfo`
            Internal datastore record for this file. This will be inserted by
            the caller; `_extractIngestInfo` is only responsible for
            creating and populating the struct.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.
        """
        if self._transaction is None:
            raise RuntimeError("Ingest called without transaction enabled")

        # Create URI of the source path, do not need to force a relative
        # path to absolute.
        srcUri = ButlerURI(path, forceAbsolute=False)

        # Track whether we have read the size of the source yet
        have_sized = False

        if transfer is None:
            # A relative path is assumed to be relative to the datastore
            # in this context
            if not srcUri.isabs():
                tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
            else:
                # Work out the path in the datastore from an absolute URI
                # This is required to be within the datastore.
                pathInStore = srcUri.relative_to(self.root)
                if pathInStore is None:
                    raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
                                       f"not within datastore {self.root}")
                tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            # Work out the name we want this ingested file to have
            # inside the datastore
            tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
            if not tgtLocation.uri.dirname().exists():
                log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
                tgtLocation.uri.dirname().mkdir()

            # if we are transferring from a local file to a remote location
            # it may be more efficient to get the size and checksum of the
            # local file rather than the transferred one
            if not srcUri.scheme or srcUri.scheme == "file":
                size = srcUri.size()
                checksum = self.computeChecksum(srcUri) if self.useChecksum else None
                have_sized = True

            # transfer the resource to the destination
            tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)

        # the file should exist in the datastore now
        if not have_sized:
            size = tgtLocation.uri.size()
            checksum = self.computeChecksum(tgtLocation.uri) if self.useChecksum else None

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=checksum)

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        filtered = []
        for dataset in datasets:
            acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
            if not acceptable:
                continue
            dataset.refs = acceptable
            if dataset.formatter is None:
                dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
            else:
                assert isinstance(dataset.formatter, (type, str))
                dataset.formatter = getClassOf(dataset.formatter)
            dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
            filtered.append(dataset)
        return _IngestPrepData(filtered)

    @transactional
    def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
        # Docstring inherited from Datastore._finishIngest.
        refsAndInfos = []
        for dataset in prepData.datasets:
            # Do ingest as if the first dataset ref is associated with the file
            info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
                                           transfer=transfer)
            refsAndInfos.extend([(ref, info) for ref in dataset.refs])
        self._register_datasets(refsAndInfos)

    def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
                                           formatter: Union[Formatter, Type[Formatter]]) -> Location:
        """Given a source URI and a DatasetRef, determine the name the
        dataset will have inside datastore.

        Parameters
        ----------
        srcUri : `ButlerURI`
            URI to the source dataset file.
        ref : `DatasetRef`
            Ref associated with the newly-ingested dataset artifact. This
            is used to determine the name within the datastore.
        formatter : `Formatter` or `Formatter` class
            Formatter to use for validation. Can be a class or an instance.

        Returns
        -------
        location : `Location`
            Target location for the newly-ingested dataset.
        """
        # Ingesting a file from outside the datastore.
        # This involves a new name.
        template = self.templates.getTemplate(ref)
        location = self.locationFactory.fromPath(template.format(ref))

        # Get the extension
        ext = srcUri.getExtension()

        # Update the destination to include that extension
        location.updateExtension(ext)

        # Ask the formatter to validate this extension
        formatter.validateExtension(location)

        return location

    @abstractmethod
    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        """Write an in-memory dataset to the datastore.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to write to datastore.
        ref : `DatasetRef`
            Registry information associated with this dataset.

        Returns
        -------
        info : `StoredFileInfo`
            Information describing the artifact written to the datastore.
        """
        raise NotImplementedError()

    @abstractmethod
    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        """Read the artifact from the datastore into an in-memory object.

        Parameters
        ----------
        getInfo : `DatastoreFileGetInformation`
            Information about the artifact within the datastore.
        ref : `DatasetRef`
            The registry information associated with this artifact.
        isComponent : `bool`
            Flag to indicate if a component is being read from this artifact.

        Returns
        -------
        inMemoryDataset : `object`
            The artifact as a Python object.
        """
        raise NotImplementedError()

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
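
        Examples
        --------
        A sketch of typical use; ``butler`` is assumed to be an initialized
        `Butler` and ``ref`` a resolved `DatasetRef`:

        >>> if butler.datastore.exists(ref):
        ...     inMemoryDataset = butler.datastore.get(ref)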
853 """
854 fileLocations = self._get_dataset_locations_info(ref)
855 if not fileLocations:
856 return False
857 for location, _ in fileLocations:
858 if not self._artifact_exists(location):
859 return False
861 return True

    def getURIs(self, ref: DatasetRef,
                predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
        """Return URIs associated with dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        primary : `ButlerURI`
            The URI to the primary artifact associated with this dataset.
            If the dataset was disassembled within the datastore this
            may be `None`.
        components : `dict`
            URIs to any components associated with the dataset artifact.
            Can be empty if there are no components.
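
        Examples
        --------
        A sketch of typical use; ``datastore`` and ``ref`` are assumed to
        exist and the component name is illustrative only:

        >>> primary, components = datastore.getURIs(ref)
        >>> uri = primary if primary is not None else components["wcs"]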
884 """
886 primary: Optional[ButlerURI] = None
887 components: Dict[str, ButlerURI] = {}
889 # if this has never been written then we have to guess
890 if not self.exists(ref):
891 if not predict:
892 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
894 def predictLocation(thisRef: DatasetRef) -> Location:
895 template = self.templates.getTemplate(thisRef)
896 location = self.locationFactory.fromPath(template.format(thisRef))
897 storageClass = ref.datasetType.storageClass
898 formatter = self.formatterFactory.getFormatter(thisRef,
899 FileDescriptor(location,
900 storageClass=storageClass))
901 # Try to use the extension attribute but ignore problems if the
902 # formatter does not define one.
903 try:
904 location = formatter.makeUpdatedLocation(location)
905 except Exception:
906 # Use the default extension
907 pass
908 return location
910 doDisassembly = self.composites.shouldBeDisassembled(ref)
912 if doDisassembly:
914 for component, componentStorage in ref.datasetType.storageClass.components.items():
915 compRef = ref.makeComponentRef(component)
916 compLocation = predictLocation(compRef)
918 # Add a URI fragment to indicate this is a guess
919 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
921 else:
923 location = predictLocation(ref)
925 # Add a URI fragment to indicate this is a guess
926 primary = ButlerURI(location.uri.geturl() + "#predicted")
928 return primary, components
930 # If this is a ref that we have written we can get the path.
931 # Get file metadata and internal metadata
932 fileLocations = self._get_dataset_locations_info(ref)
934 if not fileLocations: 934 ↛ 935line 934 didn't jump to line 935, because the condition on line 934 was never true
935 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
937 if len(fileLocations) == 1:
938 # No disassembly so this is the primary URI
939 primary = ButlerURI(fileLocations[0][0].uri)
941 else:
942 for location, storedFileInfo in fileLocations:
943 if storedFileInfo.component is None: 943 ↛ 944line 943 didn't jump to line 944, because the condition on line 943 was never true
944 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
945 components[storedFileInfo.component] = ButlerURI(location.uri)
947 return primary, components

    def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
        """URI to the Dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `ButlerURI`
            URI pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            Raised if a URI has been requested for a dataset that does not
            exist and guessing is not allowed.
        RuntimeError
            Raised if a request is made for a single URI but multiple URIs
            are associated with this dataset.

        Notes
        -----
        When a predicted URI is requested an attempt will be made to form
        a reasonable URI based on file templates and the expected formatter.
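
        Examples
        --------
        A sketch of typical use; ``datastore`` is assumed to exist,
        ``ref`` to be stored as a single artifact, and ``unstored_ref``
        to refer to a dataset that has not yet been written:

        >>> uri = datastore.getURI(ref)
        >>> predicted = datastore.getURI(unstored_ref, predict=True)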
984 """
985 primary, components = self.getURIs(ref, predict)
986 if primary is None or components: 986 ↛ 987line 986 didn't jump to line 987, because the condition on line 986 was never true
987 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
988 "Use Dataastore.getURIs() instead.")
989 return primary

    def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
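
        Examples
        --------
        A sketch of typical use; ``datastore`` and ``ref`` are assumed to
        exist and the ``bbox`` parameter name is illustrative, depending
        entirely on what the relevant `StorageClass` defines:

        >>> inMemoryDataset = datastore.get(ref)
        >>> subset = datastore.get(ref, parameters={"bbox": bbox})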
1015 """
1016 allGetInfo = self._prepare_for_get(ref, parameters)
1017 refComponent = ref.datasetType.component()
1019 # Supplied storage class for the component being read
1020 refStorageClass = ref.datasetType.storageClass
1022 # Create mapping from component name to related info
1023 allComponents = {i.component: i for i in allGetInfo}
1025 # By definition the dataset is disassembled if we have more
1026 # than one record for it.
1027 isDisassembled = len(allGetInfo) > 1
1029 # Look for the special case where we are disassembled but the
1030 # component is a derived component that was not written during
1031 # disassembly. For this scenario we need to check that the
1032 # component requested is listed as a derived component for the
1033 # composite storage class
1034 isDisassembledReadOnlyComponent = False
1035 if isDisassembled and refComponent:
1036 # The composite storage class should be accessible through
1037 # the component dataset type
1038 compositeStorageClass = ref.datasetType.parentStorageClass
1040 # In the unlikely scenario where the composite storage
1041 # class is not known, we can only assume that this is a
1042 # normal component. If that assumption is wrong then the
1043 # branch below that reads a persisted component will fail
1044 # so there is no need to complain here.
1045 if compositeStorageClass is not None: 1045 ↛ 1048line 1045 didn't jump to line 1048, because the condition on line 1045 was never false
1046 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1048 if isDisassembled and not refComponent:
1049 # This was a disassembled dataset spread over multiple files
1050 # and we need to put them all back together again.
1051 # Read into memory and then assemble
1053 # Check that the supplied parameters are suitable for the type read
1054 refStorageClass.validateParameters(parameters)
1056 # We want to keep track of all the parameters that were not used
1057 # by formatters. We assume that if any of the component formatters
1058 # use a parameter that we do not need to apply it again in the
1059 # assembler.
1060 usedParams = set()
1062 components: Dict[str, Any] = {}
1063 for getInfo in allGetInfo:
1064 # assemblerParams are parameters not understood by the
1065 # associated formatter.
1066 usedParams.update(set(getInfo.formatterParams))
1068 component = getInfo.component
1070 if component is None: 1070 ↛ 1071line 1070 didn't jump to line 1071, because the condition on line 1070 was never true
1071 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1073 # We do not want the formatter to think it's reading
1074 # a component though because it is really reading a
1075 # standalone dataset -- always tell reader it is not a
1076 # component.
1077 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1079 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1081 # Any unused parameters will have to be passed to the assembler
1082 if parameters:
1083 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1084 else:
1085 unusedParams = {}
1087 # Process parameters
1088 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1089 parameters=unusedParams)
1091 elif isDisassembledReadOnlyComponent:
1093 compositeStorageClass = ref.datasetType.parentStorageClass
1094 if compositeStorageClass is None: 1094 ↛ 1095line 1094 didn't jump to line 1095, because the condition on line 1094 was never true
1095 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1096 "no composite storage class is available.")
1098 if refComponent is None: 1098 ↛ 1100line 1098 didn't jump to line 1100, because the condition on line 1098 was never true
1099 # Mainly for mypy
1100 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1102 # Assume that every derived component can be calculated by
1103 # forwarding the request to a single read/write component.
1104 # Rather than guessing which rw component is the right one by
1105 # scanning each for a derived component of the same name,
1106 # we ask the storage class delegate directly which one is best to
1107 # use.
1108 compositeDelegate = compositeStorageClass.delegate()
1109 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1110 set(allComponents))
1112 # Select the relevant component
1113 rwInfo = allComponents[forwardedComponent]
1115 # For now assume that read parameters are validated against
1116 # the real component and not the requested component
1117 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1118 forwardedStorageClass.validateParameters(parameters)
1120 # Unfortunately the FileDescriptor inside the formatter will have
1121 # the wrong write storage class so we need to create a new one
1122 # given the immutability constraint.
1123 writeStorageClass = rwInfo.info.storageClass
1125 # We may need to put some thought into parameters for read
1126 # components but for now forward them on as is
1127 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1128 readStorageClass=refStorageClass,
1129 storageClass=writeStorageClass,
1130 parameters=parameters),
1131 ref.dataId)
1133 # The assembler can not receive any parameter requests for a
1134 # derived component at this time since the assembler will
1135 # see the storage class of the derived component and those
1136 # parameters will have to be handled by the formatter on the
1137 # forwarded storage class.
1138 assemblerParams: Dict[str, Any] = {}
1140 # Need to created a new info that specifies the derived
1141 # component and associated storage class
1142 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1143 rwInfo.info, assemblerParams, {},
1144 refComponent, refStorageClass)
1146 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1148 else:
1149 # Single file request or component from that composite file
1150 for lookup in (refComponent, None): 1150 ↛ 1155line 1150 didn't jump to line 1155, because the loop on line 1150 didn't complete
1151 if lookup in allComponents: 1151 ↛ 1150line 1151 didn't jump to line 1150, because the condition on line 1151 was never false
1152 getInfo = allComponents[lookup]
1153 break
1154 else:
1155 raise FileNotFoundError(f"Component {refComponent} not found "
1156 f"for ref {ref} in datastore {self.name}")
1158 # Do not need the component itself if already disassembled
1159 if isDisassembled:
1160 isComponent = False
1161 else:
1162 isComponent = getInfo.component is not None
1164 # For a disassembled component we can validate parametersagainst
1165 # the component storage class directly
1166 if isDisassembled:
1167 refStorageClass.validateParameters(parameters)
1168 else:
1169 # For an assembled composite this could be a derived
1170 # component derived from a real component. The validity
1171 # of the parameters is not clear. For now validate against
1172 # the composite storage class
1173 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1175 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)

    @transactional
    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
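
        Examples
        --------
        A sketch of typical use; ``datastore``, ``inMemoryDataset`` and a
        consistent resolved ``ref`` are assumed to exist already:

        >>> datastore.put(inMemoryDataset, ref)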
1202 """
1204 doDisassembly = self.composites.shouldBeDisassembled(ref)
1205 # doDisassembly = True
1207 artifacts = []
1208 if doDisassembly:
1209 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1210 for component, componentInfo in components.items():
1211 # Don't recurse because we want to take advantage of
1212 # bulk insert -- need a new DatasetRef that refers to the
1213 # same dataset_id but has the component DatasetType
1214 # DatasetType does not refer to the types of components
1215 # So we construct one ourselves.
1216 compRef = ref.makeComponentRef(component)
1217 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1218 artifacts.append((compRef, storedInfo))
1219 else:
1220 # Write the entire thing out
1221 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1222 artifacts.append((ref, storedInfo))
1224 self._register_datasets(artifacts)

    @transactional
    def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
        """Indicate to the datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`
            If `True` return without error even if something went wrong.
            Problems could occur if another process is simultaneously trying
            to delete.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
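
        Examples
        --------
        A sketch of the usual two-step removal; ``datastore`` and ``ref``
        are assumed to exist, and artifacts are only deleted once the
        trash is emptied:

        >>> datastore.trash(ref)
        >>> datastore.emptyTrash()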
1243 """
1244 # Get file metadata and internal metadata
1245 log.debug("Trashing %s in datastore %s", ref, self.name)
1247 fileLocations = self._get_dataset_locations_info(ref)
1249 if not fileLocations:
1250 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1251 if ignore_errors:
1252 log.warning(err_msg)
1253 return
1254 else:
1255 raise FileNotFoundError(err_msg)
1257 for location, storedFileInfo in fileLocations:
1258 if not self._artifact_exists(location): 1258 ↛ 1259line 1258 didn't jump to line 1259, because the condition on line 1258 was never true
1259 err_msg = f"Dataset is known to datastore {self.name} but " \
1260 f"associated artifact ({location.uri}) is missing"
1261 if ignore_errors:
1262 log.warning(err_msg)
1263 return
1264 else:
1265 raise FileNotFoundError(err_msg)
1267 # Mark dataset as trashed
1268 try:
1269 self._move_to_trash_in_registry(ref)
1270 except Exception as e:
1271 if ignore_errors:
1272 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1273 f"but encountered an error: {e}")
1274 pass
1275 else:
1276 raise

    @transactional
    def emptyTrash(self, ignore_errors: bool = True) -> None:
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`
            If `True` return without error even if something went wrong.
            Problems could occur if another process is simultaneously trying
            to delete.
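
        Examples
        --------
        A sketch of typical use after one or more `trash` calls
        (``datastore`` is assumed to exist):

        >>> datastore.emptyTrash(ignore_errors=False)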
1288 """
1289 log.debug("Emptying trash in datastore %s", self.name)
1290 # Context manager will empty trash iff we finish it without raising.
1291 with self.bridge.emptyTrash() as trashed:
1292 for ref in trashed:
1293 fileLocations = self._get_dataset_locations_info(ref)
1295 if not fileLocations: 1295 ↛ 1296line 1295 didn't jump to line 1296, because the condition on line 1295 was never true
1296 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1297 if ignore_errors:
1298 log.warning(err_msg)
1299 continue
1300 else:
1301 raise FileNotFoundError(err_msg)
1303 for location, _ in fileLocations:
1305 if not self._artifact_exists(location): 1305 ↛ 1306line 1305 didn't jump to line 1306, because the condition on line 1305 was never true
1306 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1307 if ignore_errors:
1308 log.warning(err_msg)
1309 continue
1310 else:
1311 raise FileNotFoundError(err_msg)
1313 # Can only delete the artifact if there are no references
1314 # to the file from untrashed dataset refs.
1315 if self._can_remove_dataset_artifact(ref, location):
1316 # Point of no return for this artifact
1317 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1318 try:
1319 self._delete_artifact(location)
1320 except Exception as e:
1321 if ignore_errors:
1322 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1323 location.uri, self.name, e)
1324 else:
1325 raise
1327 # Now must remove the entry from the internal registry even if
1328 # the artifact removal failed and was ignored,
1329 # otherwise the removal check above will never be true
1330 try:
1331 # There may be multiple rows associated with this ref
1332 # depending on disassembly
1333 self.removeStoredItemInfo(ref)
1334 except Exception as e:
1335 if ignore_errors:
1336 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1337 ref.id, location.uri, self.name, e)
1338 continue
1339 else:
1340 raise FileNotFoundError(err_msg)

    def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
                              logFailures: bool = False) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks that all the supplied entities have valid file
        templates and also have formatters defined.
        """

        templateFailed = None
        try:
            self.templates.validateTemplates(entities, logFailures=logFailures)
        except FileTemplateValidationError as e:
            templateFailed = str(e)

        formatterFailed = []
        for entity in entities:
            try:
                self.formatterFactory.getFormatterClass(entity)
            except KeyError as e:
                formatterFailed.append(str(e))
                if logFailures:
                    log.fatal("Formatter failure: %s", e)

        if templateFailed or formatterFailed:
            messages = []
            if templateFailed:
                messages.append(templateFailed)
            if formatterFailed:
                messages.append(",".join(formatterFailed))
            msg = ";\n".join(messages)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self) -> Set[LookupKey]:
        # Docstring is inherited from base class
        return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
            self.constraints.getLookupKeys()

    def validateKey(self, lookupKey: LookupKey,
                    entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
        # Docstring is inherited from base class
        # The key can be valid in either formatters or templates so we can
        # only check the template if it exists
        if lookupKey in self.templates:
            try:
                self.templates[lookupKey].validateTemplate(entity)
            except FileTemplateValidationError as e:
                raise DatastoreValidationError(e) from e

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[Union[ButlerURI, str]] = None,
               transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        if transfer is not None and directory is None:
            raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
                               "export directory given")

        # Force the directory to be a URI object
        directoryUri: Optional[ButlerURI] = None
        if directory is not None:
            directoryUri = ButlerURI(directory, forceDirectory=True)

        if transfer is not None and directoryUri is not None:
            # mypy needs the second test
            if not directoryUri.exists():
                raise FileNotFoundError(f"Export location {directory} does not exist")

        for ref in refs:
            fileLocations = self._get_dataset_locations_info(ref)
            if not fileLocations:
                raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
            # For now we can not export disassembled datasets
            if len(fileLocations) > 1:
                raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
            location, storedFileInfo = fileLocations[0]
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                # We will use the path in store directly
                pass
            else:
                # mypy needs help
                assert directoryUri is not None, "directoryUri must be defined to get here"
                storeUri = ButlerURI(location.uri)
                exportUri = directoryUri.join(location.pathInStore)
                exportUri.transfer_from(storeUri, transfer=transfer)

            yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)

    @staticmethod
    def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        uri : `ButlerURI`
            Name of resource to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:class:`hashlib`.
        block_size : `int`
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.

        Notes
        -----
        Currently returns `None` if the URI is for a remote resource.
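
        Examples
        --------
        A sketch of direct use; the file path is illustrative only:

        >>> uri = ButlerURI("data/file.dat")
        >>> digest = FileLikeDatastore.computeChecksum(uri, algorithm="md5")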
1468 """
1469 if algorithm not in hashlib.algorithms_guaranteed: 1469 ↛ 1470line 1469 didn't jump to line 1470, because the condition on line 1469 was never true
1470 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1472 if uri.scheme and uri.scheme != "file":
1473 return None
1475 hasher = hashlib.new(algorithm)
1477 filename, is_temp = uri.as_local()
1479 with open(filename, "rb") as f:
1480 for chunk in iter(lambda: f.read(block_size), b""):
1481 hasher.update(chunk)
1483 if is_temp: 1483 ↛ 1484line 1483 didn't jump to line 1484, because the condition on line 1483 was never true
1484 os.remove(filename)
1486 return hasher.hexdigest()