Coverage for python/lsst/daf/butler/datastores/fileDatastore.py : 84%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from dataclasses import dataclass
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreConfig,
60 DatastoreValidationError,
61 FileDescriptor,
62 FileTemplates,
63 FileTemplateValidationError,
64 Formatter,
65 FormatterFactory,
66 Location,
67 LocationFactory,
68 StorageClass,
69 StoredFileInfo,
70)
72from lsst.daf.butler import ddl
73from lsst.daf.butler.registry.interfaces import (
74 ReadOnlyDatabaseError,
75 DatastoreRegistryBridge,
76)
78from lsst.daf.butler.core.repoRelocation import replaceRoot
79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
80from .genericDatastore import GenericBaseDatastore
82if TYPE_CHECKING: 82 ↛ 83 (condition on line 82 was never true)
83 from lsst.daf.butler import LookupKey
84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
86log = logging.getLogger(__name__)
88# String to use when a Python None is encountered
89NULLSTR = "__NULL_STRING__"
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
100 def __init__(self, datasets: List[FileDataset]):
101 super().__init__(ref for dataset in datasets for ref in dataset.refs)
102 self.datasets = datasets
105@dataclass(frozen=True)
106class DatastoreFileGetInformation:
107 """Collection of useful parameters needed to retrieve a file from
108 a Datastore.
109 """
111 location: Location
112 """The location from which to read the dataset."""
114 formatter: Formatter
115 """The `Formatter` to use to deserialize the dataset."""
117 info: StoredFileInfo
118 """Stored information about this file and its formatter."""
120 assemblerParams: Dict[str, Any]
121 """Parameters to use for post-processing the retrieved dataset."""
123 formatterParams: Dict[str, Any]
124 """Parameters that were understood by the associated formatter."""
126 component: Optional[str]
127 """The component to be retrieved (can be `None`)."""
129 readStorageClass: StorageClass
130 """The `StorageClass` of the dataset being read."""
133class FileDatastore(GenericBaseDatastore):
134 """Generic Datastore for file-based implementations.
136 Should always be sub-classed since key abstract methods are missing.
138 Parameters
139 ----------
140 config : `DatastoreConfig` or `str`
141 Configuration as either a `Config` object or URI to file.
142 bridgeManager : `DatastoreRegistryBridgeManager`
143 Object that manages the interface between `Registry` and datastores.
144 butlerRoot : `str`, optional
145 New datastore root to use to override the configuration value.
147 Raises
148 ------
149 ValueError
150 If root location does not exist and ``create`` is `False` in the
151 configuration.
152 """
154 defaultConfigFile: ClassVar[Optional[str]] = None
155 """Path to configuration defaults. Accessed within the ``config`` resource
156 or relative to a search path. Can be None if no defaults specified.
157 """
159 root: ButlerURI
160 """Root directory URI of this `Datastore`."""
162 locationFactory: LocationFactory
163 """Factory for creating locations relative to the datastore root."""
165 formatterFactory: FormatterFactory
166 """Factory for creating instances of formatters."""
168 templates: FileTemplates
169 """File templates that can be used by this `Datastore`."""
171 composites: CompositesMap
172 """Determines whether a dataset should be disassembled on put."""
174 defaultConfigFile = "datastores/fileDatastore.yaml"
175 """Path to configuration defaults. Accessed within the ``config`` resource
176 or relative to a search path. Can be None if no defaults specified.
177 """
179 @classmethod
180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
181 """Set any filesystem-dependent config options for this Datastore to
182 be appropriate for a new empty repository with the given root.
184 Parameters
185 ----------
186 root : `str`
187 URI to the root of the data repository.
188 config : `Config`
189 A `Config` to update. Only the subset understood by
190 this component will be updated. Will not expand
191 defaults.
192 full : `Config`
193 A complete config with all defaults expanded that can be
194 converted to a `DatastoreConfig`. Read-only and will not be
195 modified by this method.
196 Repository-specific options that should not be obtained
197 from defaults when Butler instances are constructed
198 should be copied from ``full`` to ``config``.
199 overwrite : `bool`, optional
200 If `False`, do not modify a value in ``config`` if the value
201 already exists. Default is always to overwrite with the provided
202 ``root``.
204 Notes
205 -----
206 If a keyword is explicitly defined in the supplied ``config`` it
207 will not be overridden by this method if ``overwrite`` is `False`.
208 This allows explicit values set in external configs to be retained.
209 """
210 Config.updateParameters(DatastoreConfig, config, full,
211 toUpdate={"root": root},
212 toCopy=("cls", ("records", "table")), overwrite=overwrite)
214 @classmethod
215 def makeTableSpec(cls) -> ddl.TableSpec:
216 return ddl.TableSpec(
217 fields=[
218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
222 # Use empty string to indicate no component
223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
224 # TODO: should checksum be Base64Bytes instead?
225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
227 ],
228 unique=frozenset(),
229 )
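# A rough illustration (not part of this module) of the column layout that
# makeTableSpec() above describes, written as a plain SQLAlchemy table so the
# compound primary key (dataset_id, component) is easy to see. The table name
# here is hypothetical; the real name comes from config["records", "table"]
# and the table is created through the registry's opaque-table machinery,
# not like this.
from sqlalchemy import Column, MetaData, Table

_example_metadata = MetaData()
_example_records = Table(
    "file_datastore_records",  # hypothetical name, for illustration only
    _example_metadata,
    Column("dataset_id", BigInteger, primary_key=True),
    Column("path", String(256), nullable=False),
    Column("formatter", String(128), nullable=False),
    Column("storage_class", String(64), nullable=False),
    Column("component", String(32), primary_key=True),  # "" means no component
    Column("checksum", String(128), nullable=True),
    Column("file_size", BigInteger, nullable=True),
)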
231 def __init__(self, config: Union[DatastoreConfig, str],
232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
233 super().__init__(config, bridgeManager)
234 if "root" not in self.config: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true
235 raise ValueError("No root directory specified in configuration")
237 # Name ourselves either using an explicit name or a name
238 # derived from the (unexpanded) root
239 if "name" in self.config:
240 self.name = self.config["name"]
241 else:
242 # We use the unexpanded root in the name to indicate that this
243 # datastore can be moved without having to update registry.
244 self.name = "{}@{}".format(type(self).__name__,
245 self.config["root"])
247 # Support repository relocation in config
248 # Existence of self.root is checked in subclass
249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
250 forceDirectory=True, forceAbsolute=True)
252 self.locationFactory = LocationFactory(self.root)
253 self.formatterFactory = FormatterFactory()
255 # Now associate formatters with storage classes
256 self.formatterFactory.registerFormatters(self.config["formatters"],
257 universe=bridgeManager.universe)
259 # Read the file naming templates
260 self.templates = FileTemplates(self.config["templates"],
261 universe=bridgeManager.universe)
263 # See if composites should be disassembled
264 self.composites = CompositesMap(self.config["composites"],
265 universe=bridgeManager.universe)
267 tableName = self.config["records", "table"]
268 try:
269 # Storage of paths and formatters, keyed by dataset_id
270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
271 # Interface to Registry.
272 self._bridge = bridgeManager.register(self.name)
273 except ReadOnlyDatabaseError:
274 # If the database is read only and we just tried and failed to
275 # create a table, it means someone is trying to create a read-only
276 # butler client for an empty repo. That should be okay, as long
277 # as they then try to get any datasets before some other client
278 # creates the table. Chances are they're just validating
279 # configuration.
280 pass
282 # Determine whether checksums should be used - default to False
283 self.useChecksum = self.config.get("checksum", False)
285 # Check existence and create directory structure if necessary
286 if not self.root.exists():
287 if "create" not in self.config or not self.config["create"]: 287 ↛ 288line 287 didn't jump to line 288, because the condition on line 287 was never true
288 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
289 try:
290 self.root.mkdir()
291 except Exception as e:
292 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
293 f" Got error: {e}") from e
295 def __str__(self) -> str:
296 return str(self.root)
298 @property
299 def bridge(self) -> DatastoreRegistryBridge:
300 return self._bridge
302 def _artifact_exists(self, location: Location) -> bool:
303 """Check that an artifact exists in this datastore at the specified
304 location.
306 Parameters
307 ----------
308 location : `Location`
309 Expected location of the artifact associated with this datastore.
311 Returns
312 -------
313 exists : `bool`
314 `True` if the location can be found, `False` otherwise.
315 """
316 log.debug("Checking if resource exists: %s", location.uri)
317 return location.uri.exists()
319 def _delete_artifact(self, location: Location) -> None:
320 """Delete the artifact from the datastore.
322 Parameters
323 ----------
324 location : `Location`
325 Location of the artifact associated with this datastore.
326 """
327 log.debug("Deleting file: %s", location.uri)
328 location.uri.remove()
329 log.debug("Successfully deleted file: %s", location.uri)
331 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
332 # Docstring inherited from GenericBaseDatastore
333 records = []
334 for ref, info in zip(refs, infos):
335 # Component should come from ref and fall back on info
336 component = ref.datasetType.component()
337 if component is None and info.component is not None: 337 ↛ 338 (condition on line 337 was never true)
338 component = info.component
339 if component is None:
340 # Use empty string since we want this to be part of the
341 # primary key.
342 component = NULLSTR
343 records.append(
344 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
345 storage_class=info.storageClass.name, component=component,
346 checksum=info.checksum, file_size=info.file_size)
347 )
348 self._table.insert(*records)
350 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
351 # Docstring inherited from GenericBaseDatastore
353 # Look for the dataset_id -- there might be multiple matches
354 # if we have disassembled the dataset.
355 records = list(self._table.fetch(dataset_id=ref.id))
357 results = []
358 for record in records:
359 # Convert name of StorageClass to instance
360 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
361 component = record["component"] if (record["component"]
362 and record["component"] != NULLSTR) else None
364 info = StoredFileInfo(formatter=record["formatter"],
365 path=record["path"],
366 storageClass=storageClass,
367 component=component,
368 checksum=record["checksum"],
369 file_size=record["file_size"])
370 results.append(info)
372 return results
374 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]:
375 """Return all dataset refs associated with the supplied path.
377 Parameters
378 ----------
379 pathInStore : `str`
380 Path of interest in the data store.
382 Returns
383 -------
384 ids : `set` of `int`
385 All `DatasetRef` IDs associated with this path.
386 """
387 records = list(self._table.fetch(path=pathInStore))
388 ids = {r["dataset_id"] for r in records}
389 return ids
391 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
392 # Docstring inherited from GenericBaseDatastore
393 self._table.delete(dataset_id=ref.id)
395 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
396 r"""Find all the `Location`\ s of the requested dataset in the
397 `Datastore` and the associated stored file information.
399 Parameters
400 ----------
401 ref : `DatasetRef`
402 Reference to the required `Dataset`.
404 Returns
405 -------
406 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
407 Location of the dataset within the datastore and
408 stored information about each file and its formatter.
409 """
410 # Get the file information (this will fail if no file)
411 records = self.getStoredItemsInfo(ref)
413 # Use the path to determine the location
414 return [(self.locationFactory.fromPath(r.path), r) for r in records]
416 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
417 """Check that there is only one dataset associated with the
418 specified artifact.
420 Parameters
421 ----------
422 ref : `DatasetRef` or `FakeDatasetRef`
423 Dataset to be removed.
424 location : `Location`
425 The location of the artifact to be removed.
427 Returns
428 -------
429 can_remove : `bool`
430 True if the artifact can be safely removed.
431 """
433 # Get all entries associated with this path
434 allRefs = self._registered_refs_per_artifact(location.pathInStore)
435 if not allRefs: 435 ↛ 436 (condition on line 435 was never true)
436 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
438 # Remove these refs from all the refs and if there is nothing left
439 # then we can delete
440 remainingRefs = allRefs - {ref.id}
442 if remainingRefs:
443 return False
444 return True
446 def _prepare_for_get(self, ref: DatasetRef,
447 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
448 """Check parameters for ``get`` and obtain formatter and
449 location.
451 Parameters
452 ----------
453 ref : `DatasetRef`
454 Reference to the required Dataset.
455 parameters : `dict`
456 `StorageClass`-specific parameters that specify, for example,
457 a slice of the dataset to be loaded.
459 Returns
460 -------
461 getInfo : `list` [`DatastoreFileGetInformation`]
462 Parameters needed to retrieve each file.
463 """
464 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
466 # Get file metadata and internal metadata
467 fileLocations = self._get_dataset_locations_info(ref)
468 if not fileLocations:
469 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
471 # The storage class we want to use eventually
472 refStorageClass = ref.datasetType.storageClass
474 if len(fileLocations) > 1:
475 disassembled = True
476 else:
477 disassembled = False
479 # Is this a component request?
480 refComponent = ref.datasetType.component()
482 fileGetInfo = []
483 for location, storedFileInfo in fileLocations:
485 # The storage class used to write the file
486 writeStorageClass = storedFileInfo.storageClass
488 # If this has been disassembled we need read to match the write
489 if disassembled:
490 readStorageClass = writeStorageClass
491 else:
492 readStorageClass = refStorageClass
494 formatter = getInstanceOf(storedFileInfo.formatter,
495 FileDescriptor(location, readStorageClass=readStorageClass,
496 storageClass=writeStorageClass, parameters=parameters),
497 ref.dataId)
499 formatterParams, notFormatterParams = formatter.segregateParameters()
501 # Of the remaining parameters, extract the ones supported by
502 # this StorageClass (for components not all will be handled)
503 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
505 # The ref itself could be a component if the dataset was
506 # disassembled by butler, or we disassembled in datastore and
507 # components came from the datastore records
508 component = storedFileInfo.component if storedFileInfo.component else refComponent
510 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
511 assemblerParams, formatterParams,
512 component, readStorageClass))
514 return fileGetInfo
516 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
517 """Check the arguments for ``put`` and obtain formatter and
518 location.
520 Parameters
521 ----------
522 inMemoryDataset : `object`
523 The dataset to store.
524 ref : `DatasetRef`
525 Reference to the associated Dataset.
527 Returns
528 -------
529 location : `Location`
530 The location to write the dataset.
531 formatter : `Formatter`
532 The `Formatter` to use to write the dataset.
534 Raises
535 ------
536 TypeError
537 Supplied object and storage class are inconsistent.
538 DatasetTypeNotSupportedError
539 The associated `DatasetType` is not handled by this datastore.
540 """
541 self._validate_put_parameters(inMemoryDataset, ref)
543 # Work out output file name
544 try:
545 template = self.templates.getTemplate(ref)
546 except KeyError as e:
547 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
549 # Validate the template to protect against filenames from different
550 # dataIds returning the same and causing overwrite confusion.
551 template.validateTemplate(ref)
553 location = self.locationFactory.fromPath(template.format(ref))
555 # Get the formatter based on the storage class
556 storageClass = ref.datasetType.storageClass
557 try:
558 formatter = self.formatterFactory.getFormatter(ref,
559 FileDescriptor(location,
560 storageClass=storageClass),
561 ref.dataId)
562 except KeyError as e:
563 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
564 f"{self.name}") from e
566 # Now that we know the formatter, update the location
567 location = formatter.makeUpdatedLocation(location)
569 return location, formatter
571 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
572 # Docstring inherited from base class
573 if transfer != "auto":
574 return transfer
576 # See if the paths are within the datastore or not
577 inside = [self._pathInStore(d.path) is not None for d in datasets]
579 if all(inside):
580 transfer = None
581 elif not any(inside): 581 ↛ 585 (condition on line 581 was never false)
582 # Allow ButlerURI to use its own knowledge
583 transfer = "auto"
584 else:
585 raise ValueError("Some datasets are inside the datastore and some are outside."
586 " Please use an explicit transfer mode and not 'auto'.")
588 return transfer
590 def _pathInStore(self, path: str) -> Optional[str]:
591 """Return path relative to datastore root
593 Parameters
594 ----------
595 path : `str`
596 Path to dataset. Can be absolute. If relative, it is assumed to
597 be relative to the datastore root.
600 Returns
601 -------
602 inStore : `str`
603 Path relative to datastore root. Returns `None` if the file is
604 outside the root.
605 """
606 # Relative path will always be relative to datastore
607 pathUri = ButlerURI(path, forceAbsolute=False)
608 return pathUri.relative_to(self.root)
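# Standalone sketch of the same "inside the root or not" check using only
# pathlib, for illustration; in the method above ButlerURI.relative_to()
# does this work (returning the relative path, or None when the supplied
# path falls outside the datastore root). The helper name is local to the
# example.
from pathlib import Path

def _path_in_root_example(path: str, root: str) -> Optional[str]:
    """Return ``path`` relative to ``root``, or `None` if it is outside."""
    resolved = (Path(root) / path).resolve()
    try:
        return str(resolved.relative_to(Path(root).resolve()))
    except ValueError:
        return None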
610 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
611 """Standardize the path of a to-be-ingested file.
613 Parameters
614 ----------
615 path : `str`
616 Path of a file to be ingested.
617 transfer : `str`, optional
618 How (and whether) the dataset should be added to the datastore.
619 See `ingest` for details of transfer modes.
620 This implementation is provided only so
621 `NotImplementedError` can be raised if the mode is not supported;
622 actual transfers are deferred to `_extractIngestInfo`.
624 Returns
625 -------
626 path : `str`
627 New path in what the datastore considers standard form.
629 Notes
630 -----
631 Subclasses of `FileDatastore` can implement this method instead
632 of `_prepIngest`. It should not modify the data repository or given
633 file in any way.
635 Raises
636 ------
637 NotImplementedError
638 Raised if the datastore does not support the given transfer mode
639 (including the case where ingest is not supported at all).
640 FileNotFoundError
641 Raised if one of the given files does not exist.
642 """
643 if transfer not in (None,) + self.root.transferModes: 643 ↛ 644 (condition on line 643 was never true)
644 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
646 # A relative URI indicates relative to datastore root
647 srcUri = ButlerURI(path, forceAbsolute=False)
648 if not srcUri.isabs():
649 srcUri = self.root.join(path)
651 if not srcUri.exists():
652 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
653 f"are assumed to be relative to {self.root} unless they are absolute.")
655 if transfer is None:
656 relpath = srcUri.relative_to(self.root)
657 if not relpath:
658 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
659 f"within datastore ({self.root})")
661 # Return the relative path within the datastore for internal
662 # transfer
663 path = relpath
665 return path
667 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
668 formatter: Union[Formatter, Type[Formatter]],
669 transfer: Optional[str] = None) -> StoredFileInfo:
670 """Relocate (if necessary) and extract `StoredFileInfo` from a
671 to-be-ingested file.
673 Parameters
674 ----------
675 path : `str` or `ButlerURI`
676 URI or path of a file to be ingested.
677 ref : `DatasetRef`
678 Reference for the dataset being ingested. Guaranteed to have
679 ``dataset_id`` not `None`.
680 formatter : `type` or `Formatter`
681 `Formatter` subclass to use for this dataset or an instance.
682 transfer : `str`, optional
683 How (and whether) the dataset should be added to the datastore.
684 See `ingest` for details of transfer modes.
686 Returns
687 -------
688 info : `StoredFileInfo`
689 Internal datastore record for this file. This will be inserted by
690 the caller; `_extractIngestInfo` is only responsible for
691 creating and populating the struct.
693 Raises
694 ------
695 FileNotFoundError
696 Raised if one of the given files does not exist.
697 FileExistsError
698 Raised if transfer is not `None` but the (internal) location the
699 file would be moved to is already occupied.
700 """
701 if self._transaction is None: 701 ↛ 702 (condition on line 701 was never true)
702 raise RuntimeError("Ingest called without transaction enabled")
704 # Create URI of the source path, do not need to force a relative
705 # path to absolute.
706 srcUri = ButlerURI(path, forceAbsolute=False)
708 # Track whether we have read the size of the source yet
709 have_sized = False
711 if transfer is None:
712 # A relative path is assumed to be relative to the datastore
713 # in this context
714 if not srcUri.isabs():
715 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
716 else:
717 # Work out the path in the datastore from an absolute URI
718 # This is required to be within the datastore.
719 pathInStore = srcUri.relative_to(self.root)
720 if pathInStore is None: 720 ↛ 721 (condition on line 720 was never true)
721 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
722 f"not within datastore {self.root}")
723 tgtLocation = self.locationFactory.fromPath(pathInStore)
724 else:
725 # Work out the name we want this ingested file to have
726 # inside the datastore
727 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
728 if not tgtLocation.uri.dirname().exists():
729 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
730 tgtLocation.uri.dirname().mkdir()
732 # if we are transferring from a local file to a remote location
733 # it may be more efficient to get the size and checksum of the
734 # local file rather than the transferred one
735 if not srcUri.scheme or srcUri.scheme == "file": 735 ↛ 741 (condition on line 735 was never false)
736 size = srcUri.size()
737 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
738 have_sized = True
740 # transfer the resource to the destination
741 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
743 # the file should exist in the datastore now
744 if not have_sized:
745 size = tgtLocation.uri.size()
746 checksum = self.computeChecksum(tgtLocation.uri) if self.useChecksum else None
748 return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
749 storageClass=ref.datasetType.storageClass,
750 component=ref.datasetType.component(),
751 file_size=size, checksum=checksum)
753 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
754 # Docstring inherited from Datastore._prepIngest.
755 filtered = []
756 for dataset in datasets:
757 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
758 if not acceptable:
759 continue
760 else:
761 dataset.refs = acceptable
762 if dataset.formatter is None:
763 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
764 else:
765 assert isinstance(dataset.formatter, (type, str))
766 dataset.formatter = getClassOf(dataset.formatter)
767 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
768 filtered.append(dataset)
769 return _IngestPrepData(filtered)
771 @transactional
772 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
773 # Docstring inherited from Datastore._finishIngest.
774 refsAndInfos = []
775 for dataset in prepData.datasets:
776 # Do ingest as if the first dataset ref is associated with the file
777 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
778 transfer=transfer)
779 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
780 self._register_datasets(refsAndInfos)
782 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
783 formatter: Union[Formatter, Type[Formatter]]) -> Location:
784 """Given a source URI and a DatasetRef, determine the name the
785 dataset will have inside datastore.
787 Parameters
788 ----------
789 srcUri : `ButlerURI`
790 URI to the source dataset file.
791 ref : `DatasetRef`
792 Ref associated with the newly-ingested dataset artifact. This
793 is used to determine the name within the datastore.
794 formatter : `Formatter` or Formatter class.
795 Formatter to use for validation. Can be a class or an instance.
797 Returns
798 -------
799 location : `Location`
800 Target location for the newly-ingested dataset.
801 """
802 # Ingesting a file from outside the datastore.
803 # This involves a new name.
804 template = self.templates.getTemplate(ref)
805 location = self.locationFactory.fromPath(template.format(ref))
807 # Get the extension
808 ext = srcUri.getExtension()
810 # Update the destination to include that extension
811 location.updateExtension(ext)
813 # Ask the formatter to validate this extension
814 formatter.validateExtension(location)
816 return location
818 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
819 """Write out in memory dataset to datastore.
821 Parameters
822 ----------
823 inMemoryDataset : `object`
824 Dataset to write to datastore.
825 ref : `DatasetRef`
826 Registry information associated with this dataset.
828 Returns
829 -------
830 info : `StoredFileInfo`
831 Information describing the artifact written to the datastore.
832 """
833 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
834 uri = location.uri
836 if not uri.dirname().exists():
837 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
838 uri.dirname().mkdir()
840 if self._transaction is None: 840 ↛ 841 (condition on line 840 was never true)
841 raise RuntimeError("Attempting to write artifact without transaction enabled")
843 def _removeFileExists(uri: ButlerURI) -> None:
844 """Remove a file and do not complain if it is not there.
846 This is important since a formatter might fail before the file
847 is written and we should not confuse people by writing spurious
848 error messages to the log.
849 """
850 try:
851 uri.remove()
852 except FileNotFoundError:
853 pass
855 # Register a callback to try to delete the uploaded data if
856 # something fails below
857 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
859 # For a local file, simply use the formatter directly
860 if uri.isLocal:
861 path = formatter.write(inMemoryDataset)
862 assert self.root.join(path) == uri
863 log.debug("Successfully wrote python object to local file at %s", uri)
864 else:
865 # This is a remote URI, so first try bytes and write directly else
866 # fallback to a temporary file
867 try:
868 serializedDataset = formatter.toBytes(inMemoryDataset)
869 log.debug("Writing bytes directly to %s", uri)
870 uri.write(serializedDataset, overwrite=True)
871 log.debug("Successfully wrote bytes directly to %s", uri)
872 except NotImplementedError:
873 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
874 # Need to configure the formatter to write to a different
875 # location and that needs us to overwrite internals
876 tmpLocation = Location(*os.path.split(tmpFile.name))
877 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
878 with formatter._updateLocation(tmpLocation):
879 formatter.write(inMemoryDataset)
880 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
881 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
883 # URI is needed to resolve which ingest case we are dealing with
884 return self._extractIngestInfo(uri, ref, formatter=formatter)
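# Generic sketch of the write strategy used above: try an in-memory
# serialization first, and fall back to a named temporary file when bytes
# cannot be produced directly. ``to_bytes`` and ``write_file`` are
# illustrative stand-ins for the Formatter.toBytes/write calls, not real
# daf_butler APIs, and this helper exists only for the example.
import shutil

def _write_with_fallback_example(obj: Any, dest_path: str,
                                 to_bytes: Any, write_file: Any) -> None:
    try:
        data = to_bytes(obj)  # may raise NotImplementedError
    except NotImplementedError:
        # Serialize to a temporary file first, then copy into place.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp_name = tmp.name
        try:
            write_file(obj, tmp_name)
            shutil.copyfile(tmp_name, dest_path)
        finally:
            os.remove(tmp_name)
    else:
        with open(dest_path, "wb") as f:
            f.write(data)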
886 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
887 ref: DatasetRef, isComponent: bool = False) -> Any:
888 """Read the artifact from datastore into in memory object.
890 Parameters
891 ----------
892 getInfo : `DatastoreFileGetInformation`
893 Information about the artifact within the datastore.
894 ref : `DatasetRef`
895 The registry information associated with this artifact.
896 isComponent : `bool`
897 Flag to indicate if a component is being read from this artifact.
899 Returns
900 -------
901 inMemoryDataset : `object`
902 The artifact as a python object.
903 """
904 location = getInfo.location
905 uri = location.uri
906 log.debug("Accessing data from %s", uri)
908 # Cannot recalculate checksum but can compare size as a quick check
909 recorded_size = getInfo.info.file_size
910 resource_size = uri.size()
911 if resource_size != recorded_size: 911 ↛ 912 (condition on line 911 was never true)
912 raise RuntimeError("Integrity failure in Datastore. "
913 f"Size of file {uri} ({resource_size}) "
914 f"does not match size recorded in registry of {recorded_size}")
916 # For the general case we have choices for how to proceed.
917 # 1. Always use a local file (downloading the remote resource to a
918 # temporary file if needed).
919 # 2. Use a threshold size and read into memory and use bytes.
920 # Use both for now with an arbitrary hand off size.
921 # This allows small datasets to be downloaded from remote object
922 # stores without requiring a temporary file.
924 formatter = getInfo.formatter
925 nbytes_max = 10_000_000 # Arbitrary number that we can tune
926 if resource_size <= nbytes_max and formatter.can_read_bytes():
927 serializedDataset = uri.read()
928 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
929 f"component {getInfo.component}" if isComponent else "",
930 len(serializedDataset), uri, formatter.name())
931 try:
932 result = formatter.fromBytes(serializedDataset,
933 component=getInfo.component if isComponent else None)
934 except Exception as e:
935 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
936 f" ({ref.datasetType.name} from {uri}): {e}") from e
937 else:
938 # Read from file
939 with uri.as_local() as local_uri:
940 # Have to update the Location associated with the formatter
941 # because formatter.read does not allow an override.
942 # This could be improved.
943 msg = ""
944 newLocation = None
945 if uri != local_uri:
946 newLocation = Location(*local_uri.split())
947 msg = "(via download to local file)"
949 log.debug("Reading %s from location %s %s with formatter %s",
950 f"component {getInfo.component}" if isComponent else "",
951 uri, msg, formatter.name())
952 try:
953 with formatter._updateLocation(newLocation):
954 result = formatter.read(component=getInfo.component if isComponent else None)
955 except Exception as e:
956 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
957 f" ({ref.datasetType.name} from {uri}): {e}") from e
959 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
960 isComponent=isComponent)
962 def exists(self, ref: DatasetRef) -> bool:
963 """Check if the dataset exists in the datastore.
965 Parameters
966 ----------
967 ref : `DatasetRef`
968 Reference to the required dataset.
970 Returns
971 -------
972 exists : `bool`
973 `True` if the entity exists in the `Datastore`.
974 """
975 fileLocations = self._get_dataset_locations_info(ref)
976 if not fileLocations:
977 return False
978 for location, _ in fileLocations:
979 if not self._artifact_exists(location):
980 return False
982 return True
984 def getURIs(self, ref: DatasetRef,
985 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
986 """Return URIs associated with dataset.
988 Parameters
989 ----------
990 ref : `DatasetRef`
991 Reference to the required dataset.
992 predict : `bool`, optional
993 If the datastore does not know about the dataset, should it
994 return a predicted URI or not?
996 Returns
997 -------
998 primary : `ButlerURI`
999 The URI to the primary artifact associated with this dataset.
1000 If the dataset was disassembled within the datastore this
1001 may be `None`.
1002 components : `dict`
1003 URIs to any components associated with the dataset artifact.
1004 Can be empty if there are no components.
1005 """
1007 primary: Optional[ButlerURI] = None
1008 components: Dict[str, ButlerURI] = {}
1010 # if this has never been written then we have to guess
1011 if not self.exists(ref):
1012 if not predict:
1013 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1015 def predictLocation(thisRef: DatasetRef) -> Location:
1016 template = self.templates.getTemplate(thisRef)
1017 location = self.locationFactory.fromPath(template.format(thisRef))
1018 storageClass = ref.datasetType.storageClass
1019 formatter = self.formatterFactory.getFormatter(thisRef,
1020 FileDescriptor(location,
1021 storageClass=storageClass))
1022 # Try to use the extension attribute but ignore problems if the
1023 # formatter does not define one.
1024 try:
1025 location = formatter.makeUpdatedLocation(location)
1026 except Exception:
1027 # Use the default extension
1028 pass
1029 return location
1031 doDisassembly = self.composites.shouldBeDisassembled(ref)
1033 if doDisassembly:
1035 for component, componentStorage in ref.datasetType.storageClass.components.items():
1036 compRef = ref.makeComponentRef(component)
1037 compLocation = predictLocation(compRef)
1039 # Add a URI fragment to indicate this is a guess
1040 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1042 else:
1044 location = predictLocation(ref)
1046 # Add a URI fragment to indicate this is a guess
1047 primary = ButlerURI(location.uri.geturl() + "#predicted")
1049 return primary, components
1051 # If this is a ref that we have written we can get the path.
1052 # Get file metadata and internal metadata
1053 fileLocations = self._get_dataset_locations_info(ref)
1055 if not fileLocations: 1055 ↛ 1056 (condition on line 1055 was never true)
1056 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1058 if len(fileLocations) == 1:
1059 # No disassembly so this is the primary URI
1060 primary = ButlerURI(fileLocations[0][0].uri)
1062 else:
1063 for location, storedFileInfo in fileLocations:
1064 if storedFileInfo.component is None: 1064 ↛ 1065 (condition on line 1064 was never true)
1065 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1066 components[storedFileInfo.component] = ButlerURI(location.uri)
1068 return primary, components
1070 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1071 """URI to the Dataset.
1073 Parameters
1074 ----------
1075 ref : `DatasetRef`
1076 Reference to the required Dataset.
1077 predict : `bool`
1078 If `True`, allow URIs to be returned of datasets that have not
1079 been written.
1081 Returns
1082 -------
1083 uri : `ButlerURI`
1084 URI pointing to the dataset within the datastore. If the
1085 dataset does not exist in the datastore, and if ``predict`` is
1086 `True`, the URI will be a prediction and will include a URI
1087 fragment "#predicted".
1088 If the datastore does not have entities that relate well
1089 to the concept of a URI the returned URI will be
1090 descriptive. The returned URI is not guaranteed to be obtainable.
1092 Raises
1093 ------
1094 FileNotFoundError
1095 Raised if a URI has been requested for a dataset that does not
1096 exist and guessing is not allowed.
1097 RuntimeError
1098 Raised if a request is made for a single URI but multiple URIs
1099 are associated with this dataset.
1101 Notes
1102 -----
1103 When a predicted URI is requested an attempt will be made to form
1104 a reasonable URI based on file templates and the expected formatter.
1105 """
1106 primary, components = self.getURIs(ref, predict)
1107 if primary is None or components: 1107 ↛ 1108 (condition on line 1107 was never true)
1108 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1109 "Use Dataastore.getURIs() instead.")
1110 return primary
1112 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1113 """Load an InMemoryDataset from the store.
1115 Parameters
1116 ----------
1117 ref : `DatasetRef`
1118 Reference to the required Dataset.
1119 parameters : `dict`
1120 `StorageClass`-specific parameters that specify, for example,
1121 a slice of the dataset to be loaded.
1123 Returns
1124 -------
1125 inMemoryDataset : `object`
1126 Requested dataset or slice thereof as an InMemoryDataset.
1128 Raises
1129 ------
1130 FileNotFoundError
1131 Requested dataset can not be retrieved.
1132 TypeError
1133 Return value from formatter has unexpected type.
1134 ValueError
1135 Formatter failed to process the dataset.
1136 """
1137 allGetInfo = self._prepare_for_get(ref, parameters)
1138 refComponent = ref.datasetType.component()
1140 # Supplied storage class for the component being read
1141 refStorageClass = ref.datasetType.storageClass
1143 # Create mapping from component name to related info
1144 allComponents = {i.component: i for i in allGetInfo}
1146 # By definition the dataset is disassembled if we have more
1147 # than one record for it.
1148 isDisassembled = len(allGetInfo) > 1
1150 # Look for the special case where we are disassembled but the
1151 # component is a derived component that was not written during
1152 # disassembly. For this scenario we need to check that the
1153 # component requested is listed as a derived component for the
1154 # composite storage class
1155 isDisassembledReadOnlyComponent = False
1156 if isDisassembled and refComponent:
1157 # The composite storage class should be accessible through
1158 # the component dataset type
1159 compositeStorageClass = ref.datasetType.parentStorageClass
1161 # In the unlikely scenario where the composite storage
1162 # class is not known, we can only assume that this is a
1163 # normal component. If that assumption is wrong then the
1164 # branch below that reads a persisted component will fail
1165 # so there is no need to complain here.
1166 if compositeStorageClass is not None: 1166 ↛ 1169 (condition on line 1166 was never false)
1167 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1169 if isDisassembled and not refComponent:
1170 # This was a disassembled dataset spread over multiple files
1171 # and we need to put them all back together again.
1172 # Read into memory and then assemble
1174 # Check that the supplied parameters are suitable for the type read
1175 refStorageClass.validateParameters(parameters)
1177 # We want to keep track of all the parameters that were not used
1178 # by formatters. We assume that if any of the component formatters
1179 # use a parameter that we do not need to apply it again in the
1180 # assembler.
1181 usedParams = set()
1183 components: Dict[str, Any] = {}
1184 for getInfo in allGetInfo:
1185 # assemblerParams are parameters not understood by the
1186 # associated formatter.
1187 usedParams.update(set(getInfo.formatterParams))
1189 component = getInfo.component
1191 if component is None: 1191 ↛ 1192 (condition on line 1191 was never true)
1192 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1194 # We do not want the formatter to think it's reading
1195 # a component though because it is really reading a
1196 # standalone dataset -- always tell reader it is not a
1197 # component.
1198 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1200 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1202 # Any unused parameters will have to be passed to the assembler
1203 if parameters:
1204 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1205 else:
1206 unusedParams = {}
1208 # Process parameters
1209 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1210 parameters=unusedParams)
1212 elif isDisassembledReadOnlyComponent:
1214 compositeStorageClass = ref.datasetType.parentStorageClass
1215 if compositeStorageClass is None: 1215 ↛ 1216 (condition on line 1215 was never true)
1216 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1217 "no composite storage class is available.")
1219 if refComponent is None: 1219 ↛ 1221 (condition on line 1219 was never true)
1220 # Mainly for mypy
1221 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1223 # Assume that every derived component can be calculated by
1224 # forwarding the request to a single read/write component.
1225 # Rather than guessing which rw component is the right one by
1226 # scanning each for a derived component of the same name,
1227 # we ask the storage class delegate directly which one is best to
1228 # use.
1229 compositeDelegate = compositeStorageClass.delegate()
1230 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1231 set(allComponents))
1233 # Select the relevant component
1234 rwInfo = allComponents[forwardedComponent]
1236 # For now assume that read parameters are validated against
1237 # the real component and not the requested component
1238 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1239 forwardedStorageClass.validateParameters(parameters)
1241 # Unfortunately the FileDescriptor inside the formatter will have
1242 # the wrong write storage class so we need to create a new one
1243 # given the immutability constraint.
1244 writeStorageClass = rwInfo.info.storageClass
1246 # We may need to put some thought into parameters for read
1247 # components but for now forward them on as is
1248 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1249 readStorageClass=refStorageClass,
1250 storageClass=writeStorageClass,
1251 parameters=parameters),
1252 ref.dataId)
1254 # The assembler can not receive any parameter requests for a
1255 # derived component at this time since the assembler will
1256 # see the storage class of the derived component and those
1257 # parameters will have to be handled by the formatter on the
1258 # forwarded storage class.
1259 assemblerParams: Dict[str, Any] = {}
1261 # Need to create a new info that specifies the derived
1262 # component and associated storage class
1263 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1264 rwInfo.info, assemblerParams, {},
1265 refComponent, refStorageClass)
1267 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1269 else:
1270 # Single file request or component from that composite file
1271 for lookup in (refComponent, None): 1271 ↛ 1276 (loop on line 1271 didn't complete)
1272 if lookup in allComponents: 1272 ↛ 1271 (condition on line 1272 was never false)
1273 getInfo = allComponents[lookup]
1274 break
1275 else:
1276 raise FileNotFoundError(f"Component {refComponent} not found "
1277 f"for ref {ref} in datastore {self.name}")
1279 # Do not need the component itself if already disassembled
1280 if isDisassembled:
1281 isComponent = False
1282 else:
1283 isComponent = getInfo.component is not None
1285 # For a disassembled component we can validate parameters against
1286 # the component storage class directly
1287 if isDisassembled:
1288 refStorageClass.validateParameters(parameters)
1289 else:
1290 # For an assembled composite this could be a derived
1291 # component derived from a real component. The validity
1292 # of the parameters is not clear. For now validate against
1293 # the composite storage class
1294 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1296 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
1298 @transactional
1299 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1300 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1302 Parameters
1303 ----------
1304 inMemoryDataset : `object`
1305 The dataset to store.
1306 ref : `DatasetRef`
1307 Reference to the associated Dataset.
1309 Raises
1310 ------
1311 TypeError
1312 Supplied object and storage class are inconsistent.
1313 DatasetTypeNotSupportedError
1314 The associated `DatasetType` is not handled by this datastore.
1316 Notes
1317 -----
1318 If the datastore is configured to reject certain dataset types it
1319 is possible that the put will fail and raise a
1320 `DatasetTypeNotSupportedError`. The main use case for this is to
1321 allow `ChainedDatastore` to put to multiple datastores without
1322 requiring that every datastore accepts the dataset.
1323 """
1325 doDisassembly = self.composites.shouldBeDisassembled(ref)
1326 # doDisassembly = True
1328 artifacts = []
1329 if doDisassembly:
1330 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1331 for component, componentInfo in components.items():
1332 # Don't recurse because we want to take advantage of
1333 # bulk insert -- need a new DatasetRef that refers to the
1334 # same dataset_id but has the component DatasetType
1335 # DatasetType does not refer to the types of components
1336 # So we construct one ourselves.
1337 compRef = ref.makeComponentRef(component)
1338 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1339 artifacts.append((compRef, storedInfo))
1340 else:
1341 # Write the entire thing out
1342 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1343 artifacts.append((ref, storedInfo))
1345 self._register_datasets(artifacts)
1347 @transactional
1348 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1349 """Indicate to the datastore that a dataset can be removed.
1351 Parameters
1352 ----------
1353 ref : `DatasetRef`
1354 Reference to the required Dataset.
1355 ignore_errors : `bool`
1356 If `True` return without error even if something went wrong.
1357 Problems could occur if another process is simultaneously trying
1358 to delete.
1360 Raises
1361 ------
1362 FileNotFoundError
1363 Attempt to remove a dataset that does not exist.
1364 """
1365 # Get file metadata and internal metadata
1366 log.debug("Trashing %s in datastore %s", ref, self.name)
1368 fileLocations = self._get_dataset_locations_info(ref)
1370 if not fileLocations:
1371 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1372 if ignore_errors:
1373 log.warning(err_msg)
1374 return
1375 else:
1376 raise FileNotFoundError(err_msg)
1378 for location, storedFileInfo in fileLocations:
1379 if not self._artifact_exists(location): 1379 ↛ 1380 (condition on line 1379 was never true)
1380 err_msg = f"Dataset is known to datastore {self.name} but " \
1381 f"associated artifact ({location.uri}) is missing"
1382 if ignore_errors:
1383 log.warning(err_msg)
1384 return
1385 else:
1386 raise FileNotFoundError(err_msg)
1388 # Mark dataset as trashed
1389 try:
1390 self._move_to_trash_in_registry(ref)
1391 except Exception as e:
1392 if ignore_errors:
1393 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1394 f"but encountered an error: {e}")
1395 pass
1396 else:
1397 raise
1399 @transactional
1400 def emptyTrash(self, ignore_errors: bool = True) -> None:
1401 """Remove all datasets from the trash.
1403 Parameters
1404 ----------
1405 ignore_errors : `bool`
1406 If `True` return without error even if something went wrong.
1407 Problems could occur if another process is simultaneously trying
1408 to delete.
1409 """
1410 log.debug("Emptying trash in datastore %s", self.name)
1411 # Context manager will empty trash iff we finish it without raising.
1412 with self.bridge.emptyTrash() as trashed:
1413 for ref in trashed:
1414 fileLocations = self._get_dataset_locations_info(ref)
1416 if not fileLocations: 1416 ↛ 1417 (condition on line 1416 was never true)
1417 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1418 if ignore_errors:
1419 log.warning(err_msg)
1420 continue
1421 else:
1422 raise FileNotFoundError(err_msg)
1424 for location, _ in fileLocations:
1426 if not self._artifact_exists(location): 1426 ↛ 1427 (condition on line 1426 was never true)
1427 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1428 if ignore_errors:
1429 log.warning(err_msg)
1430 continue
1431 else:
1432 raise FileNotFoundError(err_msg)
1434 # Can only delete the artifact if there are no references
1435 # to the file from untrashed dataset refs.
1436 if self._can_remove_dataset_artifact(ref, location):
1437 # Point of no return for this artifact
1438 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1439 try:
1440 self._delete_artifact(location)
1441 except Exception as e:
1442 if ignore_errors:
1443 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1444 location.uri, self.name, e)
1445 else:
1446 raise
1448 # Now must remove the entry from the internal registry even if
1449 # the artifact removal failed and was ignored,
1450 # otherwise the removal check above will never be true
1451 try:
1452 # There may be multiple rows associated with this ref
1453 # depending on disassembly
1454 self.removeStoredItemInfo(ref)
1455 except Exception as e:
1456 if ignore_errors:
1457 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1458 ref.id, location.uri, self.name, e)
1459 continue
1460 else:
1461 raise FileNotFoundError(str(e)) from e
1463 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1464 logFailures: bool = False) -> None:
1465 """Validate some of the configuration for this datastore.
1467 Parameters
1468 ----------
1469 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1470 Entities to test against this configuration. Can be differing
1471 types.
1472 logFailures : `bool`, optional
1473 If `True`, output a log message for every validation error
1474 detected.
1476 Raises
1477 ------
1478 DatastoreValidationError
1479 Raised if there is a validation problem with a configuration.
1480 All the problems are reported in a single exception.
1482 Notes
1483 -----
1484 This method checks that all the supplied entities have valid file
1485 templates and also have formatters defined.
1486 """
1488 templateFailed = None
1489 try:
1490 self.templates.validateTemplates(entities, logFailures=logFailures)
1491 except FileTemplateValidationError as e:
1492 templateFailed = str(e)
1494 formatterFailed = []
1495 for entity in entities:
1496 try:
1497 self.formatterFactory.getFormatterClass(entity)
1498 except KeyError as e:
1499 formatterFailed.append(str(e))
1500 if logFailures: 1500 ↛ 1495 (condition on line 1500 was never false)
1501 log.fatal("Formatter failure: %s", e)
1503 if templateFailed or formatterFailed:
1504 messages = []
1505 if templateFailed: 1505 ↛ 1506 (condition on line 1505 was never true)
1506 messages.append(templateFailed)
1507 if formatterFailed: 1507 ↛ 1509 (condition on line 1507 was never false)
1508 messages.append(",".join(formatterFailed))
1509 msg = ";\n".join(messages)
1510 raise DatastoreValidationError(msg)
1512 def getLookupKeys(self) -> Set[LookupKey]:
1513 # Docstring is inherited from base class
1514 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1515 self.constraints.getLookupKeys()
1517 def validateKey(self, lookupKey: LookupKey,
1518 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1519 # Docstring is inherited from base class
1520 # The key can be valid in either formatters or templates so we can
1521 # only check the template if it exists
1522 if lookupKey in self.templates:
1523 try:
1524 self.templates[lookupKey].validateTemplate(entity)
1525 except FileTemplateValidationError as e:
1526 raise DatastoreValidationError(e) from e
1528 def export(self, refs: Iterable[DatasetRef], *,
1529 directory: Optional[Union[ButlerURI, str]] = None,
1530 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1531 # Docstring inherited from Datastore.export.
1532 if transfer is not None and directory is None: 1532 ↛ 1533 (condition on line 1532 was never true)
1533 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1534 "export directory given")
1536 # Force the directory to be a URI object
1537 directoryUri: Optional[ButlerURI] = None
1538 if directory is not None: 1538 ↛ 1541 (condition on line 1538 was never false)
1539 directoryUri = ButlerURI(directory, forceDirectory=True)
1541 if transfer is not None and directoryUri is not None: 1541 ↛ 1546 (condition on line 1541 was never false)
1542 # mypy needs the second test
1543 if not directoryUri.exists(): 1543 ↛ 1544 (condition on line 1543 was never true)
1544 raise FileNotFoundError(f"Export location {directory} does not exist")
1546 for ref in refs:
1547 fileLocations = self._get_dataset_locations_info(ref)
1548 if not fileLocations: 1548 ↛ 1549 (condition on line 1548 was never true)
1549 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1550 # For now we can not export disassembled datasets
1551 if len(fileLocations) > 1:
1552 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
1553 location, storedFileInfo = fileLocations[0]
1554 if transfer is None: 1554 ↛ 1557 (condition on line 1554 was never true)
1555 # TODO: do we also need to return the readStorageClass somehow?
1556 # We will use the path in store directly
1557 pass
1558 else:
1559 # mypy needs help
1560 assert directoryUri is not None, "directoryUri must be defined to get here"
1561 storeUri = ButlerURI(location.uri)
1562 exportUri = directoryUri.join(location.pathInStore)
1563 exportUri.transfer_from(storeUri, transfer=transfer)
1565 yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
1567 @staticmethod
1568 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1569 """Compute the checksum of the supplied file.
1571 Parameters
1572 ----------
1573 uri : `ButlerURI`
1574 Name of resource to calculate checksum from.
1575 algorithm : `str`, optional
1576 Name of algorithm to use. Must be one of the algorithms supported
1577 by :py:mod:`hashlib`.
1578 block_size : `int`
1579 Number of bytes to read from file at one time.
1581 Returns
1582 -------
1583 hexdigest : `str`
1584 Hex digest of the file.
1586 Notes
1587 -----
1588 Currently returns None if the URI is for a remote resource.
1589 """
1590 if algorithm not in hashlib.algorithms_guaranteed: 1590 ↛ 1591 (condition on line 1590 was never true)
1591 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1593 if not uri.isLocal: 1593 ↛ 1594 (condition on line 1593 was never true)
1594 return None
1596 hasher = hashlib.new(algorithm)
1598 with uri.as_local() as local_uri:
1599 with open(local_uri.ospath, "rb") as f:
1600 for chunk in iter(lambda: f.read(block_size), b""):
1601 hasher.update(chunk)
1603 return hasher.hexdigest()
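# Minimal usage sketch of the chunked-hashing pattern that computeChecksum()
# uses above, operating directly on a local file path instead of a ButlerURI.
# The helper name is local to this example; the path passed in is whatever
# local file the caller wants digested.
def _checksum_local_file_example(ospath: str, algorithm: str = "blake2b",
                                 block_size: int = 8192) -> str:
    """Return the hex digest of a local file, read in fixed-size blocks."""
    hasher = hashlib.new(algorithm)
    with open(ospath, "rb") as f:
        for chunk in iter(lambda: f.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()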