Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 83%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from dataclasses import dataclass
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreConfig,
60 DatastoreValidationError,
61 FileDescriptor,
62 FileTemplates,
63 FileTemplateValidationError,
64 Formatter,
65 FormatterFactory,
66 Location,
67 LocationFactory,
68 StorageClass,
69 StoredFileInfo,
70)
72from lsst.daf.butler import ddl
73from lsst.daf.butler.registry.interfaces import (
74 ReadOnlyDatabaseError,
75 DatastoreRegistryBridge,
76)
78from lsst.daf.butler.core.repoRelocation import replaceRoot
79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
80from .genericDatastore import GenericBaseDatastore
82if TYPE_CHECKING:  82 ↛ 83 (condition on line 82 was never true)
83 from lsst.daf.butler import LookupKey
84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
86log = logging.getLogger(__name__)
88# String to use when a Python None is encountered
89NULLSTR = "__NULL_STRING__"
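# (Component names form part of the records table primary key, so a real
# `None` cannot be stored there; this sentinel stands in for it and is
# converted back to `None` when records are read.)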
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
100 def __init__(self, datasets: List[FileDataset]):
101 super().__init__(ref for dataset in datasets for ref in dataset.refs)
102 self.datasets = datasets
105@dataclass(frozen=True)
106class DatastoreFileGetInformation:
107 """Collection of useful parameters needed to retrieve a file from
108 a Datastore.
109 """
111 location: Location
112 """The location from which to read the dataset."""
114 formatter: Formatter
115 """The `Formatter` to use to deserialize the dataset."""
117 info: StoredFileInfo
118 """Stored information about this file and its formatter."""
120 assemblerParams: Dict[str, Any]
121 """Parameters to use for post-processing the retrieved dataset."""
123 formatterParams: Dict[str, Any]
124 """Parameters that were understood by the associated formatter."""
126 component: Optional[str]
127 """The component to be retrieved (can be `None`)."""
129 readStorageClass: StorageClass
130 """The `StorageClass` of the dataset being read."""
133class FileDatastore(GenericBaseDatastore):
134 """Generic Datastore for file-based implementations.
136 Should always be sub-classed since key abstract methods are missing.
138 Parameters
139 ----------
140 config : `DatastoreConfig` or `str`
141 Configuration as either a `Config` object or URI to file.
142 bridgeManager : `DatastoreRegistryBridgeManager`
143 Object that manages the interface between `Registry` and datastores.
144 butlerRoot : `str`, optional
145 New datastore root to use to override the configuration value.
147 Raises
148 ------
149 ValueError
150 If root location does not exist and ``create`` is `False` in the
151 configuration.
152 """
154 defaultConfigFile: ClassVar[Optional[str]] = None
155 """Path to configuration defaults. Accessed within the ``config`` resource
156 or relative to a search path. Can be None if no defaults specified.
157 """
159 root: ButlerURI
160 """Root directory URI of this `Datastore`."""
162 locationFactory: LocationFactory
163 """Factory for creating locations relative to the datastore root."""
165 formatterFactory: FormatterFactory
166 """Factory for creating instances of formatters."""
168 templates: FileTemplates
169 """File templates that can be used by this `Datastore`."""
171 composites: CompositesMap
172 """Determines whether a dataset should be disassembled on put."""
174 defaultConfigFile = "datastores/fileDatastore.yaml"
175 """Path to configuration defaults. Accessed within the ``config`` resource
176 or relative to a search path. Can be None if no defaults specified.
177 """
179 @classmethod
180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
181 """Set any filesystem-dependent config options for this Datastore to
182 be appropriate for a new empty repository with the given root.
184 Parameters
185 ----------
186 root : `str`
187 URI to the root of the data repository.
188 config : `Config`
189 A `Config` to update. Only the subset understood by
190 this component will be updated. Will not expand
191 defaults.
192 full : `Config`
193 A complete config with all defaults expanded that can be
194 converted to a `DatastoreConfig`. Read-only and will not be
195 modified by this method.
196 Repository-specific options that should not be obtained
197 from defaults when Butler instances are constructed
198 should be copied from ``full`` to ``config``.
199 overwrite : `bool`, optional
200 If `False`, do not modify a value in ``config`` if the value
201 already exists. Default is always to overwrite with the provided
202 ``root``.
204 Notes
205 -----
206 If a keyword is explicitly defined in the supplied ``config`` it
207 will not be overridden by this method if ``overwrite`` is `False`.
208 This allows explicit values set in external configs to be retained.
209 """
210 Config.updateParameters(DatastoreConfig, config, full,
211 toUpdate={"root": root},
212 toCopy=("cls", ("records", "table")), overwrite=overwrite)
214 @classmethod
215 def makeTableSpec(cls) -> ddl.TableSpec:
216 return ddl.TableSpec(
217 fields=[
218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
222 # Use empty string to indicate no component
223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
224 # TODO: should checksum be Base64Bytes instead?
225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
227 ],
228 unique=frozenset(),
229 )
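    # Illustrative sketch (not part of the original module): the spec above
    # describes a records table with columns
    #   dataset_id | path | formatter | storage_class | component | checksum | file_size
    # where (dataset_id, component) form the composite primary key used to
    # look up the artifacts belonging to a (possibly disassembled) dataset.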
231 def __init__(self, config: Union[DatastoreConfig, str],
232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
233 super().__init__(config, bridgeManager)
234 if "root" not in self.config: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true
235 raise ValueError("No root directory specified in configuration")
237 # Name ourselves either using an explicit name or a name
238 # derived from the (unexpanded) root
239 if "name" in self.config:
240 self.name = self.config["name"]
241 else:
242 # We use the unexpanded root in the name to indicate that this
243 # datastore can be moved without having to update registry.
244 self.name = "{}@{}".format(type(self).__name__,
245 self.config["root"])
247 # Support repository relocation in config
248 # Existence of self.root is checked in subclass
249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
250 forceDirectory=True, forceAbsolute=True)
252 self.locationFactory = LocationFactory(self.root)
253 self.formatterFactory = FormatterFactory()
255 # Now associate formatters with storage classes
256 self.formatterFactory.registerFormatters(self.config["formatters"],
257 universe=bridgeManager.universe)
259 # Read the file naming templates
260 self.templates = FileTemplates(self.config["templates"],
261 universe=bridgeManager.universe)
263 # See if composites should be disassembled
264 self.composites = CompositesMap(self.config["composites"],
265 universe=bridgeManager.universe)
267 tableName = self.config["records", "table"]
268 try:
269 # Storage of paths and formatters, keyed by dataset_id
270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
271 # Interface to Registry.
272 self._bridge = bridgeManager.register(self.name)
273 except ReadOnlyDatabaseError:
274 # If the database is read only and we just tried and failed to
275 # create a table, it means someone is trying to create a read-only
276 # butler client for an empty repo. That should be okay, as long
277 # as they then try to get any datasets before some other client
278 # creates the table. Chances are they're just validating
279 # configuration.
280 pass
282 # Determine whether checksums should be used - default to False
283 self.useChecksum = self.config.get("checksum", False)
285 # Check existence and create directory structure if necessary
286 if not self.root.exists():
287 if "create" not in self.config or not self.config["create"]: 287 ↛ 288line 287 didn't jump to line 288, because the condition on line 287 was never true
288 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
289 try:
290 self.root.mkdir()
291 except Exception as e:
292 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
293 f" Got error: {e}") from e
295 def __str__(self) -> str:
296 return str(self.root)
298 @property
299 def bridge(self) -> DatastoreRegistryBridge:
300 return self._bridge
302 def _artifact_exists(self, location: Location) -> bool:
303 """Check that an artifact exists in this datastore at the specified
304 location.
306 Parameters
307 ----------
308 location : `Location`
309 Expected location of the artifact associated with this datastore.
311 Returns
312 -------
313 exists : `bool`
314 `True` if the location can be found, `False` otherwise.
315 """
316 log.debug("Checking if resource exists: %s", location.uri)
317 return location.uri.exists()
319 def _delete_artifact(self, location: Location) -> None:
320 """Delete the artifact from the datastore.
322 Parameters
323 ----------
324 location : `Location`
325 Location of the artifact associated with this datastore.
326 """
327 log.debug("Deleting file: %s", location.uri)
328 location.uri.remove()
329 log.debug("Successfully deleted file: %s", location.uri)
331 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
332 # Docstring inherited from GenericBaseDatastore
333 records = []
334 for ref, info in zip(refs, infos):
335 # Component should come from ref and fall back on info
336 component = ref.datasetType.component()
337 if component is None and info.component is not None:  337 ↛ 338 (condition on line 337 was never true)
338 component = info.component
339 if component is None:
340 # Use empty string since we want this to be part of the
341 # primary key.
342 component = NULLSTR
343 records.append(
344 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
345 storage_class=info.storageClass.name, component=component,
346 checksum=info.checksum, file_size=info.file_size)
347 )
348 self._table.insert(*records)
350 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
351 # Docstring inherited from GenericBaseDatastore
353 # Look for the dataset_id -- there might be multiple matches
354 # if we have disassembled the dataset.
355 records = list(self._table.fetch(dataset_id=ref.id))
357 results = []
358 for record in records:
359 # Convert name of StorageClass to instance
360 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
361 component = record["component"] if (record["component"]
362 and record["component"] != NULLSTR) else None
364 info = StoredFileInfo(formatter=record["formatter"],
365 path=record["path"],
366 storageClass=storageClass,
367 component=component,
368 checksum=record["checksum"],
369 file_size=record["file_size"])
370 results.append(info)
372 return results
374 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]:
375 """Return all dataset refs associated with the supplied path.
377 Parameters
378 ----------
379 pathInStore : `ButlerURI`
380 Path of interest in the data store.
382 Returns
383 -------
384 ids : `set` of `int`
385 All `DatasetRef` IDs associated with this path.
386 """
387 records = list(self._table.fetch(path=str(pathInStore)))
388 ids = {r["dataset_id"] for r in records}
389 return ids
391 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
392 # Docstring inherited from GenericBaseDatastore
393 self._table.delete(dataset_id=ref.id)
395 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
396 r"""Find all the `Location`\ s of the requested dataset in the
397 `Datastore` and the associated stored file information.
399 Parameters
400 ----------
401 ref : `DatasetRef`
402 Reference to the required `Dataset`.
404 Returns
405 -------
406 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
407 Location of the dataset within the datastore and
408 stored information about each file and its formatter.
409 """
410 # Get the file information (this will fail if no file)
411 records = self.getStoredItemsInfo(ref)
413 # Use the path to determine the location -- we need to take
414 # into account absolute URIs in the datastore record
415 locations: List[Tuple[Location, StoredFileInfo]] = []
416 for r in records:
417 uriInStore = ButlerURI(r.path, forceAbsolute=False)
418 if uriInStore.isabs():  418 ↛ 419 (condition on line 418 was never true)
419 location = Location(None, uriInStore)
420 else:
421 location = self.locationFactory.fromPath(r.path)
422 locations.append((location, r))
423 return locations
425 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
426 """Check that there is only one dataset associated with the
427 specified artifact.
429 Parameters
430 ----------
431 ref : `DatasetRef` or `FakeDatasetRef`
432 Dataset to be removed.
433 location : `Location`
434 The location of the artifact to be removed.
436 Returns
437 -------
438 can_remove : `bool`
439 True if the artifact can be safely removed.
440 """
442 # Get all entries associated with this path
443 allRefs = self._registered_refs_per_artifact(location.pathInStore)
444 if not allRefs:  444 ↛ 445 (condition on line 444 was never true)
445 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
447 # Remove these refs from all the refs and if there is nothing left
448 # then we can delete
449 remainingRefs = allRefs - {ref.id}
451 if remainingRefs:
452 return False
453 return True
455 def _prepare_for_get(self, ref: DatasetRef,
456 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
457 """Check parameters for ``get`` and obtain formatter and
458 location.
460 Parameters
461 ----------
462 ref : `DatasetRef`
463 Reference to the required Dataset.
464 parameters : `dict`
465 `StorageClass`-specific parameters that specify, for example,
466 a slice of the dataset to be loaded.
468 Returns
469 -------
470 getInfo : `list` [`DatastoreFileGetInformation`]
471 Parameters needed to retrieve each file.
472 """
473 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
475 # Get file metadata and internal metadata
476 fileLocations = self._get_dataset_locations_info(ref)
477 if not fileLocations:
478 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
480 # The storage class we want to use eventually
481 refStorageClass = ref.datasetType.storageClass
483 if len(fileLocations) > 1:
484 disassembled = True
485 else:
486 disassembled = False
488 # Is this a component request?
489 refComponent = ref.datasetType.component()
491 fileGetInfo = []
492 for location, storedFileInfo in fileLocations:
494 # The storage class used to write the file
495 writeStorageClass = storedFileInfo.storageClass
497 # If this has been disassembled we need read to match the write
498 if disassembled:
499 readStorageClass = writeStorageClass
500 else:
501 readStorageClass = refStorageClass
503 formatter = getInstanceOf(storedFileInfo.formatter,
504 FileDescriptor(location, readStorageClass=readStorageClass,
505 storageClass=writeStorageClass, parameters=parameters),
506 ref.dataId)
508 formatterParams, notFormatterParams = formatter.segregateParameters()
510 # Of the remaining parameters, extract the ones supported by
511 # this StorageClass (for components not all will be handled)
512 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
514 # The ref itself could be a component if the dataset was
515 # disassembled by butler, or we disassembled in datastore and
516 # components came from the datastore records
517 component = storedFileInfo.component if storedFileInfo.component else refComponent
519 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
520 assemblerParams, formatterParams,
521 component, readStorageClass))
523 return fileGetInfo
525 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
526 """Check the arguments for ``put`` and obtain formatter and
527 location.
529 Parameters
530 ----------
531 inMemoryDataset : `object`
532 The dataset to store.
533 ref : `DatasetRef`
534 Reference to the associated Dataset.
536 Returns
537 -------
538 location : `Location`
539 The location to write the dataset.
540 formatter : `Formatter`
541 The `Formatter` to use to write the dataset.
543 Raises
544 ------
545 TypeError
546 Supplied object and storage class are inconsistent.
547 DatasetTypeNotSupportedError
548 The associated `DatasetType` is not handled by this datastore.
549 """
550 self._validate_put_parameters(inMemoryDataset, ref)
552 # Work out output file name
553 try:
554 template = self.templates.getTemplate(ref)
555 except KeyError as e:
556 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
558 # Validate the template to protect against filenames from different
559 # dataIds returning the same and causing overwrite confusion.
560 template.validateTemplate(ref)
562 location = self.locationFactory.fromPath(template.format(ref))
564 # Get the formatter based on the storage class
565 storageClass = ref.datasetType.storageClass
566 try:
567 formatter = self.formatterFactory.getFormatter(ref,
568 FileDescriptor(location,
569 storageClass=storageClass),
570 ref.dataId)
571 except KeyError as e:
572 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
573 f"{self.name}") from e
575 # Now that we know the formatter, update the location
576 location = formatter.makeUpdatedLocation(location)
578 return location, formatter
580 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
581 # Docstring inherited from base class
582 if transfer != "auto":
583 return transfer
585 # See if the paths are within the datastore or not
586 inside = [self._pathInStore(d.path) is not None for d in datasets]
588 if all(inside):
589 transfer = None
590 elif not any(inside):  590 ↛ 594 (condition on line 590 was never false)
591 # Allow ButlerURI to use its own knowledge
592 transfer = "auto"
593 else:
594 raise ValueError("Some datasets are inside the datastore and some are outside."
595 " Please use an explicit transfer mode and not 'auto'.")
597 return transfer
599 def _pathInStore(self, path: str) -> Optional[str]:
600 """Return path relative to datastore root
602 Parameters
603 ----------
604 path : `str`
605 Path to dataset. Can be absolute. If relative assumed to
606 be relative to the datastore. Returns path in datastore
607 or `None` if the path is outside the root.
609 Returns
610 -------
611 inStore : `str`
612 Path relative to datastore root. Returns `None` if the file is
613 outside the root.
614 """
615 # Relative path will always be relative to datastore
616 pathUri = ButlerURI(path, forceAbsolute=False)
617 return pathUri.relative_to(self.root)
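    # For example (hypothetical paths): with a datastore root of
    # "file:///repo/", "file:///repo/a/b.fits" maps to "a/b.fits", while
    # "file:///elsewhere/b.fits" falls outside the root and yields `None`.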
619 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
620 """Standardize the path of a to-be-ingested file.
622 Parameters
623 ----------
624 path : `str`
625 Path of a file to be ingested.
626 transfer : `str`, optional
627 How (and whether) the dataset should be added to the datastore.
628 See `ingest` for details of transfer modes.
629 This implementation is provided only so
630 `NotImplementedError` can be raised if the mode is not supported;
631 actual transfers are deferred to `_extractIngestInfo`.
633 Returns
634 -------
635 path : `str`
636 New path in what the datastore considers standard form.
638 Notes
639 -----
640 Subclasses of `FileDatastore` can implement this method instead
641 of `_prepIngest`. It should not modify the data repository or given
642 file in any way.
644 Raises
645 ------
646 NotImplementedError
647 Raised if the datastore does not support the given transfer mode
648 (including the case where ingest is not supported at all).
649 FileNotFoundError
650 Raised if one of the given files does not exist.
651 """
652 if transfer not in (None, "direct") + self.root.transferModes:  652 ↛ 653 (condition on line 652 was never true)
653 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
655 # A relative URI indicates relative to datastore root
656 srcUri = ButlerURI(path, forceAbsolute=False)
657 if not srcUri.isabs():
658 srcUri = self.root.join(path)
660 if not srcUri.exists():
661 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
662 f"are assumed to be relative to {self.root} unless they are absolute.")
664 if transfer is None:
665 relpath = srcUri.relative_to(self.root)
666 if not relpath:
667 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
668 f"within datastore ({self.root})")
670 # Return the relative path within the datastore for internal
671 # transfer
672 path = relpath
674 return path
676 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
677 formatter: Union[Formatter, Type[Formatter]],
678 transfer: Optional[str] = None) -> StoredFileInfo:
679 """Relocate (if necessary) and extract `StoredFileInfo` from a
680 to-be-ingested file.
682 Parameters
683 ----------
684 path : `str` or `ButlerURI`
685 URI or path of a file to be ingested.
686 ref : `DatasetRef`
687 Reference for the dataset being ingested. Guaranteed to have
688 ``dataset_id not None``.
689 formatter : `type` or `Formatter`
690 `Formatter` subclass to use for this dataset or an instance.
691 transfer : `str`, optional
692 How (and whether) the dataset should be added to the datastore.
693 See `ingest` for details of transfer modes.
695 Returns
696 -------
697 info : `StoredFileInfo`
698 Internal datastore record for this file. This will be inserted by
699 the caller; `_extractIngestInfo` is only responsible for
700 creating and populating the struct.
702 Raises
703 ------
704 FileNotFoundError
705 Raised if one of the given files does not exist.
706 FileExistsError
707 Raised if transfer is not `None` but the (internal) location the
708 file would be moved to is already occupied.
709 """
710 if self._transaction is None:  710 ↛ 711 (condition on line 710 was never true)
711 raise RuntimeError("Ingest called without transaction enabled")
713 # Create URI of the source path, do not need to force a relative
714 # path to absolute.
715 srcUri = ButlerURI(path, forceAbsolute=False)
717 # Track whether we have read the size of the source yet
718 have_sized = False
720 tgtLocation: Optional[Location]
721 if transfer is None:
722 # A relative path is assumed to be relative to the datastore
723 # in this context
724 if not srcUri.isabs():
725 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
726 else:
727 # Work out the path in the datastore from an absolute URI
728 # This is required to be within the datastore.
729 pathInStore = srcUri.relative_to(self.root)
730 if pathInStore is None:  730 ↛ 731 (condition on line 730 was never true)
731 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
732 f"not within datastore {self.root}")
733 tgtLocation = self.locationFactory.fromPath(pathInStore)
734 elif transfer == "direct": 734 ↛ 739line 734 didn't jump to line 739, because the condition on line 734 was never true
735 # Want to store the full URI to the resource directly in
736 # datastore. This is useful for referring to permanent archive
737 # storage for raw data.
738 # Trust that people know what they are doing.
739 tgtLocation = None
740 else:
741 # Work out the name we want this ingested file to have
742 # inside the datastore
743 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
744 if not tgtLocation.uri.dirname().exists():
745 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
746 tgtLocation.uri.dirname().mkdir()
748 # if we are transferring from a local file to a remote location
749 # it may be more efficient to get the size and checksum of the
750 # local file rather than the transferred one
751 if not srcUri.scheme or srcUri.scheme == "file":  751 ↛ 757 (condition on line 751 was never false)
752 size = srcUri.size()
753 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
754 have_sized = True
756 # transfer the resource to the destination
757 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
759 if tgtLocation is None:  759 ↛ 761 (condition on line 759 was never true)
760 # This means we are using direct mode
761 targetUri = srcUri
762 targetPath = str(srcUri)
763 else:
764 targetUri = tgtLocation.uri
765 targetPath = tgtLocation.pathInStore.path
767 # the file should exist in the datastore now
768 if not have_sized:
769 size = targetUri.size()
770 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
772 return StoredFileInfo(formatter=formatter, path=targetPath,
773 storageClass=ref.datasetType.storageClass,
774 component=ref.datasetType.component(),
775 file_size=size, checksum=checksum)
777 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
778 # Docstring inherited from Datastore._prepIngest.
779 filtered = []
780 for dataset in datasets:
781 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
782 if not acceptable:
783 continue
784 else:
785 dataset.refs = acceptable
786 if dataset.formatter is None:
787 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
788 else:
789 assert isinstance(dataset.formatter, (type, str))
790 dataset.formatter = getClassOf(dataset.formatter)
791 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
792 filtered.append(dataset)
793 return _IngestPrepData(filtered)
795 @transactional
796 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
797 # Docstring inherited from Datastore._finishIngest.
798 refsAndInfos = []
799 for dataset in prepData.datasets:
800 # Do ingest as if the first dataset ref is associated with the file
801 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
802 transfer=transfer)
803 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
804 self._register_datasets(refsAndInfos)
806 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
807 formatter: Union[Formatter, Type[Formatter]]) -> Location:
808 """Given a source URI and a DatasetRef, determine the name the
809 dataset will have inside datastore.
811 Parameters
812 ----------
813 srcUri : `ButlerURI`
814 URI to the source dataset file.
815 ref : `DatasetRef`
816 Ref associated with the newly-ingested dataset artifact. This
817 is used to determine the name within the datastore.
818 formatter : `Formatter` or Formatter class.
819 Formatter to use for validation. Can be a class or an instance.
821 Returns
822 -------
823 location : `Location`
824 Target location for the newly-ingested dataset.
825 """
826 # Ingesting a file from outside the datastore.
827 # This involves a new name.
828 template = self.templates.getTemplate(ref)
829 location = self.locationFactory.fromPath(template.format(ref))
831 # Get the extension
832 ext = srcUri.getExtension()
834 # Update the destination to include that extension
835 location.updateExtension(ext)
837 # Ask the formatter to validate this extension
838 formatter.validateExtension(location)
840 return location
842 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
843 """Write out in memory dataset to datastore.
845 Parameters
846 ----------
847 inMemoryDataset : `object`
848 Dataset to write to datastore.
849 ref : `DatasetRef`
850 Registry information associated with this dataset.
852 Returns
853 -------
854 info : `StoredFileInfo`
855 Information describing the artifact written to the datastore.
856 """
857 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
858 uri = location.uri
860 if not uri.dirname().exists():
861 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
862 uri.dirname().mkdir()
864 if self._transaction is None:  864 ↛ 865 (condition on line 864 was never true)
865 raise RuntimeError("Attempting to write artifact without transaction enabled")
867 def _removeFileExists(uri: ButlerURI) -> None:
868 """Remove a file and do not complain if it is not there.
870 This is important since a formatter might fail before the file
871 is written and we should not confuse people by writing spurious
872 error messages to the log.
873 """
874 try:
875 uri.remove()
876 except FileNotFoundError:
877 pass
879 # Register a callback to try to delete the uploaded data if
880 # something fails below
881 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
883 # For a local file, simply use the formatter directly
884 if uri.isLocal:
885 formatter.write(inMemoryDataset)
886 log.debug("Successfully wrote python object to local file at %s", uri)
887 else:
888 # This is a remote URI, so first try bytes and write directly else
889 # fallback to a temporary file
890 try:
891 serializedDataset = formatter.toBytes(inMemoryDataset)
892 log.debug("Writing bytes directly to %s", uri)
893 uri.write(serializedDataset, overwrite=True)
894 log.debug("Successfully wrote bytes directly to %s", uri)
895 except NotImplementedError:
896 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
897 # Need to configure the formatter to write to a different
898 # location and that needs us to overwrite internals
899 tmpLocation = Location(*os.path.split(tmpFile.name))
900 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
901 with formatter._updateLocation(tmpLocation):
902 formatter.write(inMemoryDataset)
903 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
904 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
906 # URI is needed to resolve which ingest case we are dealing with
907 return self._extractIngestInfo(uri, ref, formatter=formatter)
909 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
910 ref: DatasetRef, isComponent: bool = False) -> Any:
911 """Read the artifact from datastore into in memory object.
913 Parameters
914 ----------
915 getInfo : `DatastoreFileGetInformation`
916 Information about the artifact within the datastore.
917 ref : `DatasetRef`
918 The registry information associated with this artifact.
919 isComponent : `bool`
920 Flag to indicate if a component is being read from this artifact.
922 Returns
923 -------
924 inMemoryDataset : `object`
925 The artifact as a python object.
926 """
927 location = getInfo.location
928 uri = location.uri
929 log.debug("Accessing data from %s", uri)
931 # Cannot recalculate checksum but can compare size as a quick check
932 recorded_size = getInfo.info.file_size
933 resource_size = uri.size()
934 if resource_size != recorded_size:  934 ↛ 935 (condition on line 934 was never true)
935 raise RuntimeError("Integrity failure in Datastore. "
936 f"Size of file {uri} ({resource_size}) "
937 f"does not match size recorded in registry of {recorded_size}")
939 # For the general case we have choices for how to proceed.
940 # 1. Always use a local file (downloading the remote resource to a
941 # temporary file if needed).
942 # 2. Use a threshold size and read into memory and use bytes.
943 # Use both for now with an arbitrary hand off size.
944 # This allows small datasets to be downloaded from remote object
945 # stores without requiring a temporary file.
947 formatter = getInfo.formatter
948 nbytes_max = 10_000_000 # Arbitrary number that we can tune
949 if resource_size <= nbytes_max and formatter.can_read_bytes():
950 serializedDataset = uri.read()
951 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
952 f"component {getInfo.component}" if isComponent else "",
953 len(serializedDataset), uri, formatter.name())
954 try:
955 result = formatter.fromBytes(serializedDataset,
956 component=getInfo.component if isComponent else None)
957 except Exception as e:
958 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
959 f" ({ref.datasetType.name} from {uri}): {e}") from e
960 else:
961 # Read from file
962 with uri.as_local() as local_uri:
963 # Have to update the Location associated with the formatter
964 # because formatter.read does not allow an override.
965 # This could be improved.
966 msg = ""
967 newLocation = None
968 if uri != local_uri:
969 newLocation = Location(*local_uri.split())
970 msg = "(via download to local file)"
972 log.debug("Reading %s from location %s %s with formatter %s",
973 f"component {getInfo.component}" if isComponent else "",
974 uri, msg, formatter.name())
975 try:
976 with formatter._updateLocation(newLocation):
977 result = formatter.read(component=getInfo.component if isComponent else None)
978 except Exception as e:
979 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
980 f" ({ref.datasetType.name} from {uri}): {e}") from e
982 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
983 isComponent=isComponent)
985 def exists(self, ref: DatasetRef) -> bool:
986 """Check if the dataset exists in the datastore.
988 Parameters
989 ----------
990 ref : `DatasetRef`
991 Reference to the required dataset.
993 Returns
994 -------
995 exists : `bool`
996 `True` if the entity exists in the `Datastore`.
997 """
998 fileLocations = self._get_dataset_locations_info(ref)
999 if not fileLocations:
1000 return False
1001 for location, _ in fileLocations:
1002 if not self._artifact_exists(location):
1003 return False
1005 return True
1007 def getURIs(self, ref: DatasetRef,
1008 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1009 """Return URIs associated with dataset.
1011 Parameters
1012 ----------
1013 ref : `DatasetRef`
1014 Reference to the required dataset.
1015 predict : `bool`, optional
1016 If the datastore does not know about the dataset, should it
1017 return a predicted URI or not?
1019 Returns
1020 -------
1021 primary : `ButlerURI`
1022 The URI to the primary artifact associated with this dataset.
1023 If the dataset was disassembled within the datastore this
1024 may be `None`.
1025 components : `dict`
1026 URIs to any components associated with the dataset artifact.
1027 Can be empty if there are no components.
1028 """
1030 primary: Optional[ButlerURI] = None
1031 components: Dict[str, ButlerURI] = {}
1033 # if this has never been written then we have to guess
1034 if not self.exists(ref):
1035 if not predict:
1036 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1038 def predictLocation(thisRef: DatasetRef) -> Location:
1039 template = self.templates.getTemplate(thisRef)
1040 location = self.locationFactory.fromPath(template.format(thisRef))
1041 storageClass = ref.datasetType.storageClass
1042 formatter = self.formatterFactory.getFormatter(thisRef,
1043 FileDescriptor(location,
1044 storageClass=storageClass))
1045 # Try to use the extension attribute but ignore problems if the
1046 # formatter does not define one.
1047 try:
1048 location = formatter.makeUpdatedLocation(location)
1049 except Exception:
1050 # Use the default extension
1051 pass
1052 return location
1054 doDisassembly = self.composites.shouldBeDisassembled(ref)
1056 if doDisassembly:
1058 for component, componentStorage in ref.datasetType.storageClass.components.items():
1059 compRef = ref.makeComponentRef(component)
1060 compLocation = predictLocation(compRef)
1062 # Add a URI fragment to indicate this is a guess
1063 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1065 else:
1067 location = predictLocation(ref)
1069 # Add a URI fragment to indicate this is a guess
1070 primary = ButlerURI(location.uri.geturl() + "#predicted")
1072 return primary, components
1074 # If this is a ref that we have written we can get the path.
1075 # Get file metadata and internal metadata
1076 fileLocations = self._get_dataset_locations_info(ref)
1078 if not fileLocations:  1078 ↛ 1079 (condition on line 1078 was never true)
1079 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1081 if len(fileLocations) == 1:
1082 # No disassembly so this is the primary URI
1083 primary = ButlerURI(fileLocations[0][0].uri)
1085 else:
1086 for location, storedFileInfo in fileLocations:
1087 if storedFileInfo.component is None:  1087 ↛ 1088 (condition on line 1087 was never true)
1088 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1089 components[storedFileInfo.component] = ButlerURI(location.uri)
1091 return primary, components
1093 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1094 """URI to the Dataset.
1096 Parameters
1097 ----------
1098 ref : `DatasetRef`
1099 Reference to the required Dataset.
1100 predict : `bool`
1101 If `True`, allow URIs to be returned for datasets that have not
1102 been written.
1104 Returns
1105 -------
1106 uri : `str`
1107 URI pointing to the dataset within the datastore. If the
1108 dataset does not exist in the datastore, and if ``predict`` is
1109 `True`, the URI will be a prediction and will include a URI
1110 fragment "#predicted".
1111 If the datastore does not have entities that relate well
1112 to the concept of a URI the returned URI will be
1113 descriptive. The returned URI is not guaranteed to be obtainable.
1115 Raises
1116 ------
1117 FileNotFoundError
1118 Raised if a URI has been requested for a dataset that does not
1119 exist and guessing is not allowed.
1120 RuntimeError
1121 Raised if a request is made for a single URI but multiple URIs
1122 are associated with this dataset.
1124 Notes
1125 -----
1126 When a predicted URI is requested an attempt will be made to form
1127 a reasonable URI based on file templates and the expected formatter.
1128 """
1129 primary, components = self.getURIs(ref, predict)
1130 if primary is None or components:  1130 ↛ 1131 (condition on line 1130 was never true)
1131 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1132 "Use Dataastore.getURIs() instead.")
1133 return primary
1135 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1136 """Load an InMemoryDataset from the store.
1138 Parameters
1139 ----------
1140 ref : `DatasetRef`
1141 Reference to the required Dataset.
1142 parameters : `dict`
1143 `StorageClass`-specific parameters that specify, for example,
1144 a slice of the dataset to be loaded.
1146 Returns
1147 -------
1148 inMemoryDataset : `object`
1149 Requested dataset or slice thereof as an InMemoryDataset.
1151 Raises
1152 ------
1153 FileNotFoundError
1154 Requested dataset can not be retrieved.
1155 TypeError
1156 Return value from formatter has unexpected type.
1157 ValueError
1158 Formatter failed to process the dataset.
1159 """
1160 allGetInfo = self._prepare_for_get(ref, parameters)
1161 refComponent = ref.datasetType.component()
1163 # Supplied storage class for the component being read
1164 refStorageClass = ref.datasetType.storageClass
1166 # Create mapping from component name to related info
1167 allComponents = {i.component: i for i in allGetInfo}
1169 # By definition the dataset is disassembled if we have more
1170 # than one record for it.
1171 isDisassembled = len(allGetInfo) > 1
1173 # Look for the special case where we are disassembled but the
1174 # component is a derived component that was not written during
1175 # disassembly. For this scenario we need to check that the
1176 # component requested is listed as a derived component for the
1177 # composite storage class
1178 isDisassembledReadOnlyComponent = False
1179 if isDisassembled and refComponent:
1180 # The composite storage class should be accessible through
1181 # the component dataset type
1182 compositeStorageClass = ref.datasetType.parentStorageClass
1184 # In the unlikely scenario where the composite storage
1185 # class is not known, we can only assume that this is a
1186 # normal component. If that assumption is wrong then the
1187 # branch below that reads a persisted component will fail
1188 # so there is no need to complain here.
1189 if compositeStorageClass is not None:  1189 ↛ 1192 (condition on line 1189 was never false)
1190 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1192 if isDisassembled and not refComponent:
1193 # This was a disassembled dataset spread over multiple files
1194 # and we need to put them all back together again.
1195 # Read into memory and then assemble
1197 # Check that the supplied parameters are suitable for the type read
1198 refStorageClass.validateParameters(parameters)
1200 # We want to keep track of all the parameters that were not used
1201 # by formatters. We assume that if any of the component formatters
1202 # use a parameter that we do not need to apply it again in the
1203 # assembler.
1204 usedParams = set()
1206 components: Dict[str, Any] = {}
1207 for getInfo in allGetInfo:
1208 # assemblerParams are parameters not understood by the
1209 # associated formatter.
1210 usedParams.update(set(getInfo.formatterParams))
1212 component = getInfo.component
1214 if component is None:  1214 ↛ 1215 (condition on line 1214 was never true)
1215 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1217 # We do not want the formatter to think it's reading
1218 # a component though because it is really reading a
1219 # standalone dataset -- always tell reader it is not a
1220 # component.
1221 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1223 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1225 # Any unused parameters will have to be passed to the assembler
1226 if parameters:
1227 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1228 else:
1229 unusedParams = {}
1231 # Process parameters
1232 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1233 parameters=unusedParams)
1235 elif isDisassembledReadOnlyComponent:
1237 compositeStorageClass = ref.datasetType.parentStorageClass
1238 if compositeStorageClass is None:  1238 ↛ 1239 (condition on line 1238 was never true)
1239 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1240 "no composite storage class is available.")
1242 if refComponent is None:  1242 ↛ 1244 (condition on line 1242 was never true)
1243 # Mainly for mypy
1244 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1246 # Assume that every derived component can be calculated by
1247 # forwarding the request to a single read/write component.
1248 # Rather than guessing which rw component is the right one by
1249 # scanning each for a derived component of the same name,
1250 # we ask the storage class delegate directly which one is best to
1251 # use.
1252 compositeDelegate = compositeStorageClass.delegate()
1253 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1254 set(allComponents))
1256 # Select the relevant component
1257 rwInfo = allComponents[forwardedComponent]
1259 # For now assume that read parameters are validated against
1260 # the real component and not the requested component
1261 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1262 forwardedStorageClass.validateParameters(parameters)
1264 # Unfortunately the FileDescriptor inside the formatter will have
1265 # the wrong write storage class so we need to create a new one
1266 # given the immutability constraint.
1267 writeStorageClass = rwInfo.info.storageClass
1269 # We may need to put some thought into parameters for read
1270 # components but for now forward them on as is
1271 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1272 readStorageClass=refStorageClass,
1273 storageClass=writeStorageClass,
1274 parameters=parameters),
1275 ref.dataId)
1277 # The assembler can not receive any parameter requests for a
1278 # derived component at this time since the assembler will
1279 # see the storage class of the derived component and those
1280 # parameters will have to be handled by the formatter on the
1281 # forwarded storage class.
1282 assemblerParams: Dict[str, Any] = {}
1284 # Need to create a new info that specifies the derived
1285 # component and associated storage class
1286 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1287 rwInfo.info, assemblerParams, {},
1288 refComponent, refStorageClass)
1290 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1292 else:
1293 # Single file request or component from that composite file
1294 for lookup in (refComponent, None):  1294 ↛ 1299 (the loop on line 1294 didn't complete)
1295 if lookup in allComponents:  1295 ↛ 1294 (condition on line 1295 was never false)
1296 getInfo = allComponents[lookup]
1297 break
1298 else:
1299 raise FileNotFoundError(f"Component {refComponent} not found "
1300 f"for ref {ref} in datastore {self.name}")
1302 # Do not need the component itself if already disassembled
1303 if isDisassembled:
1304 isComponent = False
1305 else:
1306 isComponent = getInfo.component is not None
1308 # For a disassembled component we can validate parameters against
1309 # the component storage class directly
1310 if isDisassembled:
1311 refStorageClass.validateParameters(parameters)
1312 else:
1313 # For an assembled composite this could be a derived
1314 # component derived from a real component. The validity
1315 # of the parameters is not clear. For now validate against
1316 # the composite storage class
1317 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1319 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
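    # (Summary, for orientation: the three branches above handle (1) reassembling
    # a fully disassembled composite, (2) computing a derived component by
    # forwarding the request to the responsible read/write component, and
    # (3) reading a single file or a component stored within it.)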
1321 @transactional
1322 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1323 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1325 Parameters
1326 ----------
1327 inMemoryDataset : `object`
1328 The dataset to store.
1329 ref : `DatasetRef`
1330 Reference to the associated Dataset.
1332 Raises
1333 ------
1334 TypeError
1335 Supplied object and storage class are inconsistent.
1336 DatasetTypeNotSupportedError
1337 The associated `DatasetType` is not handled by this datastore.
1339 Notes
1340 -----
1341 If the datastore is configured to reject certain dataset types it
1342 is possible that the put will fail and raise a
1343 `DatasetTypeNotSupportedError`. The main use case for this is to
1344 allow `ChainedDatastore` to put to multiple datastores without
1345 requiring that every datastore accepts the dataset.
1346 """
1348 doDisassembly = self.composites.shouldBeDisassembled(ref)
1349 # doDisassembly = True
1351 artifacts = []
1352 if doDisassembly:
1353 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1354 for component, componentInfo in components.items():
1355 # Don't recurse because we want to take advantage of
1356 # bulk insert -- need a new DatasetRef that refers to the
1357 # same dataset_id but has the component DatasetType
1358 # DatasetType does not refer to the types of components
1359 # So we construct one ourselves.
1360 compRef = ref.makeComponentRef(component)
1361 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1362 artifacts.append((compRef, storedInfo))
1363 else:
1364 # Write the entire thing out
1365 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1366 artifacts.append((ref, storedInfo))
1368 self._register_datasets(artifacts)
1370 @transactional
1371 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1372 """Indicate to the datastore that a dataset can be removed.
1374 Parameters
1375 ----------
1376 ref : `DatasetRef`
1377 Reference to the required Dataset.
1378 ignore_errors : `bool`
1379 If `True` return without error even if something went wrong.
1380 Problems could occur if another process is simultaneously trying
1381 to delete.
1383 Raises
1384 ------
1385 FileNotFoundError
1386 Attempt to remove a dataset that does not exist.
1387 """
1388 # Get file metadata and internal metadata
1389 log.debug("Trashing %s in datastore %s", ref, self.name)
1391 fileLocations = self._get_dataset_locations_info(ref)
1393 if not fileLocations:
1394 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1395 if ignore_errors:
1396 log.warning(err_msg)
1397 return
1398 else:
1399 raise FileNotFoundError(err_msg)
1401 for location, storedFileInfo in fileLocations:
1402 if not self._artifact_exists(location):  1402 ↛ 1403 (condition on line 1402 was never true)
1403 err_msg = f"Dataset is known to datastore {self.name} but " \
1404 f"associated artifact ({location.uri}) is missing"
1405 if ignore_errors:
1406 log.warning(err_msg)
1407 return
1408 else:
1409 raise FileNotFoundError(err_msg)
1411 # Mark dataset as trashed
1412 try:
1413 self._move_to_trash_in_registry(ref)
1414 except Exception as e:
1415 if ignore_errors:
1416 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1417 f"but encountered an error: {e}")
1418 pass
1419 else:
1420 raise
1422 @transactional
1423 def emptyTrash(self, ignore_errors: bool = True) -> None:
1424 """Remove all datasets from the trash.
1426 Parameters
1427 ----------
1428 ignore_errors : `bool`
1429 If `True` return without error even if something went wrong.
1430 Problems could occur if another process is simultaneously trying
1431 to delete.
1432 """
1433 log.debug("Emptying trash in datastore %s", self.name)
1434 # Context manager will empty trash iff we finish it without raising.
1435 with self.bridge.emptyTrash() as trashed:
1436 for ref in trashed:
1437 fileLocations = self._get_dataset_locations_info(ref)
1439 if not fileLocations:  1439 ↛ 1440 (condition on line 1439 was never true)
1440 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1441 if ignore_errors:
1442 log.warning(err_msg)
1443 continue
1444 else:
1445 raise FileNotFoundError(err_msg)
1447 for location, _ in fileLocations:
1449 if not self._artifact_exists(location):  1449 ↛ 1450 (condition on line 1449 was never true)
1450 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1451 if ignore_errors:
1452 log.warning(err_msg)
1453 continue
1454 else:
1455 raise FileNotFoundError(err_msg)
1457 # Can only delete the artifact if there are no references
1458 # to the file from untrashed dataset refs.
1459 if self._can_remove_dataset_artifact(ref, location):
1460 # Point of no return for this artifact
1461 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1462 try:
1463 self._delete_artifact(location)
1464 except Exception as e:
1465 if ignore_errors:
1466 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1467 location.uri, self.name, e)
1468 else:
1469 raise
1471 # Now must remove the entry from the internal registry even if
1472 # the artifact removal failed and was ignored,
1473 # otherwise the removal check above will never be true
1474 try:
1475 # There may be multiple rows associated with this ref
1476 # depending on disassembly
1477 self.removeStoredItemInfo(ref)
1478 except Exception as e:
1479 if ignore_errors:
1480 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1481 ref.id, location.uri, self.name, e)
1482 continue
1483 else:
1484 raise FileNotFoundError(err_msg)
1486 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1487 logFailures: bool = False) -> None:
1488 """Validate some of the configuration for this datastore.
1490 Parameters
1491 ----------
1492 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1493 Entities to test against this configuration. Can be differing
1494 types.
1495 logFailures : `bool`, optional
1496 If `True`, output a log message for every validation error
1497 detected.
1499 Raises
1500 ------
1501 DatastoreValidationError
1502 Raised if there is a validation problem with a configuration.
1503 All the problems are reported in a single exception.
1505 Notes
1506 -----
1507 This method checks that all the supplied entities have valid file
1508 templates and also have formatters defined.
1509 """
1511 templateFailed = None
1512 try:
1513 self.templates.validateTemplates(entities, logFailures=logFailures)
1514 except FileTemplateValidationError as e:
1515 templateFailed = str(e)
1517 formatterFailed = []
1518 for entity in entities:
1519 try:
1520 self.formatterFactory.getFormatterClass(entity)
1521 except KeyError as e:
1522 formatterFailed.append(str(e))
1523 if logFailures:  1523 ↛ 1518 (condition on line 1523 was never false)
1524 log.fatal("Formatter failure: %s", e)
1526 if templateFailed or formatterFailed:
1527 messages = []
1528 if templateFailed:  1528 ↛ 1529 (condition on line 1528 was never true)
1529 messages.append(templateFailed)
1530 if formatterFailed:  1530 ↛ 1532 (condition on line 1530 was never false)
1531 messages.append(",".join(formatterFailed))
1532 msg = ";\n".join(messages)
1533 raise DatastoreValidationError(msg)
1535 def getLookupKeys(self) -> Set[LookupKey]:
1536 # Docstring is inherited from base class
1537 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1538 self.constraints.getLookupKeys()
1540 def validateKey(self, lookupKey: LookupKey,
1541 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1542 # Docstring is inherited from base class
1543 # The key can be valid in either formatters or templates so we can
1544 # only check the template if it exists
1545 if lookupKey in self.templates:
1546 try:
1547 self.templates[lookupKey].validateTemplate(entity)
1548 except FileTemplateValidationError as e:
1549 raise DatastoreValidationError(e) from e
1551 def export(self, refs: Iterable[DatasetRef], *,
1552 directory: Optional[Union[ButlerURI, str]] = None,
1553 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1554 # Docstring inherited from Datastore.export.
1555 if transfer is not None and directory is None:  1555 ↛ 1556 (condition on line 1555 was never true)
1556 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1557 "export directory given")
1559 # Force the directory to be a URI object
1560 directoryUri: Optional[ButlerURI] = None
1561 if directory is not None:  1561 ↛ 1564 (condition on line 1561 was never false)
1562 directoryUri = ButlerURI(directory, forceDirectory=True)
1564 if transfer is not None and directoryUri is not None:  1564 ↛ 1569 (condition on line 1564 was never false)
1565 # mypy needs the second test
1566 if not directoryUri.exists():  1566 ↛ 1567 (condition on line 1566 was never true)
1567 raise FileNotFoundError(f"Export location {directory} does not exist")
1569 for ref in refs:
1570 fileLocations = self._get_dataset_locations_info(ref)
1571 if not fileLocations:  1571 ↛ 1572 (condition on line 1571 was never true)
1572 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1573 # For now we can not export disassembled datasets
1574 if len(fileLocations) > 1:
1575 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
1576 location, storedFileInfo = fileLocations[0]
1578 pathInStore = location.pathInStore.path
1579 if transfer is None:  1579 ↛ 1582 (condition on line 1579 was never true)
1580 # TODO: do we also need to return the readStorageClass somehow?
1581 # We will use the path in store directly
1582 pass
1583 elif transfer == "direct":  1583 ↛ 1585 (condition on line 1583 was never true)
1584 # Use full URIs to the remote store in the export
1585 pathInStore = str(location.uri)
1586 else:
1587 # mypy needs help
1588 assert directoryUri is not None, "directoryUri must be defined to get here"
1589 storeUri = ButlerURI(location.uri)
1591 # if the datastore has an absolute URI to a resource, we
1592 # have two options:
1593 # 1. Keep the absolute URI in the exported YAML
1594 # 2. Allocate a new name in the local datastore and transfer
1595 # it.
1596 # For now go with option 2
1597 if location.pathInStore.isabs():  1597 ↛ 1598 (condition on line 1597 was never true)
1598 template = self.templates.getTemplate(ref)
1599 pathInStore = template.format(ref)
1601 exportUri = directoryUri.join(pathInStore)
1602 exportUri.transfer_from(storeUri, transfer=transfer)
1604 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
1606 @staticmethod
1607 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1608 """Compute the checksum of the supplied file.
1610 Parameters
1611 ----------
1612 uri : `ButlerURI`
1613 Name of resource to calculate checksum from.
1614 algorithm : `str`, optional
1615 Name of algorithm to use. Must be one of the algorithms supported
1616 by :py:class:`hashlib`.
1617 block_size : `int`
1618 Number of bytes to read from file at one time.
1620 Returns
1621 -------
1622 hexdigest : `str`
1623 Hex digest of the file.
1625 Notes
1626 -----
1627 Currently returns None if the URI is for a remote resource.
1628 """
1629 if algorithm not in hashlib.algorithms_guaranteed:  1629 ↛ 1630 (condition on line 1629 was never true)
1630 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1632 if not uri.isLocal:  1632 ↛ 1633 (condition on line 1632 was never true)
1633 return None
1635 hasher = hashlib.new(algorithm)
1637 with uri.as_local() as local_uri:
1638 with open(local_uri.ospath, "rb") as f:
1639 for chunk in iter(lambda: f.read(block_size), b""):
1640 hasher.update(chunk)
1642 return hasher.hexdigest()
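# A minimal usage sketch (hypothetical names; assumes a configured registry and
# a `DatastoreRegistryBridgeManager` are already available):
#
#     config = DatastoreConfig("datastores/fileDatastore.yaml")
#     datastore = FileDatastore(config, bridgeManager, butlerRoot="/path/to/repo")
#     datastore.put(inMemoryDataset, ref)       # write a dataset artifact
#     uri = datastore.getURI(ref)               # URI of the stored artifact
#     inMemoryDataset = datastore.get(ref)      # read it back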