Coverage for python/lsst/daf/butler/datastores/fileDatastore.py : 84%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from dataclasses import dataclass
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreConfig,
60 DatastoreValidationError,
61 FileDescriptor,
62 FileTemplates,
63 FileTemplateValidationError,
64 Formatter,
65 FormatterFactory,
66 Location,
67 LocationFactory,
68 StorageClass,
69 StoredFileInfo,
70)
72from lsst.daf.butler import ddl
73from lsst.daf.butler.registry.interfaces import (
74 ReadOnlyDatabaseError,
75 DatastoreRegistryBridge,
76)
78from lsst.daf.butler.core.repoRelocation import replaceRoot
79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
80from .genericDatastore import GenericBaseDatastore
82if TYPE_CHECKING: 82 ↛ 83: line 82 didn't jump to line 83, because the condition on line 82 was never true
83 from lsst.daf.butler import LookupKey
84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
86log = logging.getLogger(__name__)
88# String to use when a Python None is encountered
89NULLSTR = "__NULL_STRING__"
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
100 def __init__(self, datasets: List[FileDataset]):
101 super().__init__(ref for dataset in datasets for ref in dataset.refs)
102 self.datasets = datasets
105@dataclass(frozen=True)
106class DatastoreFileGetInformation:
107 """Collection of useful parameters needed to retrieve a file from
108 a Datastore.
109 """
111 location: Location
112 """The location from which to read the dataset."""
114 formatter: Formatter
115 """The `Formatter` to use to deserialize the dataset."""
117 info: StoredFileInfo
118 """Stored information about this file and its formatter."""
120 assemblerParams: Dict[str, Any]
121 """Parameters to use for post-processing the retrieved dataset."""
123 formatterParams: Dict[str, Any]
124 """Parameters that were understood by the associated formatter."""
126 component: Optional[str]
127 """The component to be retrieved (can be `None`)."""
129 readStorageClass: StorageClass
130 """The `StorageClass` of the dataset being read."""
133class FileDatastore(GenericBaseDatastore):
134 """Generic Datastore for file-based implementations.
136 Should always be sub-classed since key abstract methods are missing.
138 Parameters
139 ----------
140 config : `DatastoreConfig` or `str`
141 Configuration as either a `Config` object or URI to file.
142 bridgeManager : `DatastoreRegistryBridgeManager`
143 Object that manages the interface between `Registry` and datastores.
144 butlerRoot : `str`, optional
145 New datastore root to use to override the configuration value.
147 Raises
148 ------
149 ValueError
150 If root location does not exist and ``create`` is `False` in the
151 configuration.
152 """
154 defaultConfigFile: ClassVar[Optional[str]] = None
155 """Path to configuration defaults. Accessed within the ``config`` resource
156 or relative to a search path. Can be None if no defaults specified.
157 """
159 root: ButlerURI
160 """Root directory URI of this `Datastore`."""
162 locationFactory: LocationFactory
163 """Factory for creating locations relative to the datastore root."""
165 formatterFactory: FormatterFactory
166 """Factory for creating instances of formatters."""
168 templates: FileTemplates
169 """File templates that can be used by this `Datastore`."""
171 composites: CompositesMap
172 """Determines whether a dataset should be disassembled on put."""
174 defaultConfigFile = "datastores/fileDatastore.yaml"
175 """Path to configuration defaults. Accessed within the ``config`` resource
176 or relative to a search path. Can be None if no defaults specified.
177 """
179 @classmethod
180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
181 """Set any filesystem-dependent config options for this Datastore to
182 be appropriate for a new empty repository with the given root.
184 Parameters
185 ----------
186 root : `str`
187 URI to the root of the data repository.
188 config : `Config`
189 A `Config` to update. Only the subset understood by
190 this component will be updated. Will not expand
191 defaults.
192 full : `Config`
193 A complete config with all defaults expanded that can be
194 converted to a `DatastoreConfig`. Read-only and will not be
195 modified by this method.
196 Repository-specific options that should not be obtained
197 from defaults when Butler instances are constructed
198 should be copied from ``full`` to ``config``.
199 overwrite : `bool`, optional
200 If `False`, do not modify a value in ``config`` if the value
201 already exists. Default is always to overwrite with the provided
202 ``root``.
204 Notes
205 -----
206 If a keyword is explicitly defined in the supplied ``config`` it
207 will not be overridden by this method if ``overwrite`` is `False`.
208 This allows explicit values set in external configs to be retained.
209 """
210 Config.updateParameters(DatastoreConfig, config, full,
211 toUpdate={"root": root},
212 toCopy=("cls", ("records", "table")), overwrite=overwrite)
214 @classmethod
215 def makeTableSpec(cls) -> ddl.TableSpec:
216 return ddl.TableSpec(
217 fields=[
218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
222 # Use empty string to indicate no component
223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
224 # TODO: should checksum be Base64Bytes instead?
225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
227 ],
228 unique=frozenset(),
229 )
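# Hedged illustration (not part of the original module): one row in the opaque
# records table defined above, as it would be built by addStoredItemInfo below.
# All values are placeholders, the formatter string is hypothetical, and the
# NULLSTR sentinel marks "no component" because the component column is part
# of the primary key.
_example_record = dict(
    dataset_id=42,
    path="sub/dir/somefile.fits",
    formatter="some.module.SomeFormatter",
    storage_class="StructuredDataDict",
    component=NULLSTR,
    checksum=None,
    file_size=1234,
)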
231 def __init__(self, config: Union[DatastoreConfig, str],
232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
233 super().__init__(config, bridgeManager)
234 if "root" not in self.config: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true
235 raise ValueError("No root directory specified in configuration")
237 # Name ourselves either using an explicit name or a name
238 # derived from the (unexpanded) root
239 if "name" in self.config:
240 self.name = self.config["name"]
241 else:
242 # We use the unexpanded root in the name to indicate that this
243 # datastore can be moved without having to update registry.
244 self.name = "{}@{}".format(type(self).__name__,
245 self.config["root"])
247 # Support repository relocation in config
248 # Existence of self.root is checked in subclass
249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
250 forceDirectory=True, forceAbsolute=True)
252 self.locationFactory = LocationFactory(self.root)
253 self.formatterFactory = FormatterFactory()
255 # Now associate formatters with storage classes
256 self.formatterFactory.registerFormatters(self.config["formatters"],
257 universe=bridgeManager.universe)
259 # Read the file naming templates
260 self.templates = FileTemplates(self.config["templates"],
261 universe=bridgeManager.universe)
263 # See if composites should be disassembled
264 self.composites = CompositesMap(self.config["composites"],
265 universe=bridgeManager.universe)
267 tableName = self.config["records", "table"]
268 try:
269 # Storage of paths and formatters, keyed by dataset_id
270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
271 # Interface to Registry.
272 self._bridge = bridgeManager.register(self.name)
273 except ReadOnlyDatabaseError:
274 # If the database is read only and we just tried and failed to
275 # create a table, it means someone is trying to create a read-only
276 # butler client for an empty repo. That should be okay, as long
277 # as they then try to get any datasets before some other client
278 creates the table. Chances are they're just validating
279 # configuration.
280 pass
282 # Determine whether checksums should be used - default to False
283 self.useChecksum = self.config.get("checksum", False)
285 # Determine whether we can fall back to configuration if a
286 # requested dataset is not known to registry
287 self.trustGetRequest = self.config.get("trust_get_request", False)
289 # Check existence and create directory structure if necessary
290 if not self.root.exists():
291 if "create" not in self.config or not self.config["create"]: 291 ↛ 292line 291 didn't jump to line 292, because the condition on line 291 was never true
292 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
293 try:
294 self.root.mkdir()
295 except Exception as e:
296 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
297 f" Got error: {e}") from e
299 def __str__(self) -> str:
300 return str(self.root)
302 @property
303 def bridge(self) -> DatastoreRegistryBridge:
304 return self._bridge
306 def _artifact_exists(self, location: Location) -> bool:
307 """Check that an artifact exists in this datastore at the specified
308 location.
310 Parameters
311 ----------
312 location : `Location`
313 Expected location of the artifact associated with this datastore.
315 Returns
316 -------
317 exists : `bool`
318 `True` if the location can be found, `False` otherwise.
319 """
320 log.debug("Checking if resource exists: %s", location.uri)
321 return location.uri.exists()
323 def _delete_artifact(self, location: Location) -> None:
324 """Delete the artifact from the datastore.
326 Parameters
327 ----------
328 location : `Location`
329 Location of the artifact associated with this datastore.
330 """
331 log.debug("Deleting file: %s", location.uri)
332 location.uri.remove()
333 log.debug("Successfully deleted file: %s", location.uri)
335 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
336 # Docstring inherited from GenericBaseDatastore
337 records = []
338 for ref, info in zip(refs, infos):
339 # Component should come from ref and fall back on info
340 component = ref.datasetType.component()
341 if component is None and info.component is not None: 341 ↛ 342: line 341 didn't jump to line 342, because the condition on line 341 was never true
342 component = info.component
343 if component is None:
344 # Use empty string since we want this to be part of the
345 # primary key.
346 component = NULLSTR
347 records.append(
348 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
349 storage_class=info.storageClass.name, component=component,
350 checksum=info.checksum, file_size=info.file_size)
351 )
352 self._table.insert(*records)
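# Standalone sketch of the component-key rule used above (helper name is
# hypothetical): the component name comes from the ref, falls back to the
# stored info, and `None` is mapped to the NULLSTR sentinel because the
# column is part of the primary key.
def _component_key(refComponent: Optional[str], infoComponent: Optional[str]) -> str:
    component = refComponent if refComponent is not None else infoComponent
    return component if component is not None else NULLSTR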
354 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
355 # Docstring inherited from GenericBaseDatastore
357 # Look for the dataset_id -- there might be multiple matches
358 # if we have disassembled the dataset.
359 records = list(self._table.fetch(dataset_id=ref.id))
361 results = []
362 for record in records:
363 # Convert name of StorageClass to instance
364 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
365 component = record["component"] if (record["component"]
366 and record["component"] != NULLSTR) else None
368 info = StoredFileInfo(formatter=record["formatter"],
369 path=record["path"],
370 storageClass=storageClass,
371 component=component,
372 checksum=record["checksum"],
373 file_size=record["file_size"])
374 results.append(info)
376 return results
378 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]:
379 """Return all dataset refs associated with the supplied path.
381 Parameters
382 ----------
383 pathInStore : `ButlerURI`
384 Path of interest in the data store.
386 Returns
387 -------
388 ids : `set` of `int`
389 All `DatasetRef` IDs associated with this path.
390 """
391 records = list(self._table.fetch(path=str(pathInStore)))
392 ids = {r["dataset_id"] for r in records}
393 return ids
395 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
396 # Docstring inherited from GenericBaseDatastore
397 self._table.delete(dataset_id=ref.id)
399 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
400 r"""Find all the `Location`\ s of the requested dataset in the
401 `Datastore` and the associated stored file information.
403 Parameters
404 ----------
405 ref : `DatasetRef`
406 Reference to the required `Dataset`.
408 Returns
409 -------
410 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
411 Location of the dataset within the datastore and
412 stored information about each file and its formatter.
413 """
414 # Get the file information (this will fail if no file)
415 records = self.getStoredItemsInfo(ref)
417 # Use the path to determine the location -- we need to take
418 # into account absolute URIs in the datastore record
419 locations: List[Tuple[Location, StoredFileInfo]] = []
420 for r in records:
421 uriInStore = ButlerURI(r.path, forceAbsolute=False)
422 if uriInStore.isabs(): 422 ↛ 423: line 422 didn't jump to line 423, because the condition on line 422 was never true
423 location = Location(None, uriInStore)
424 else:
425 location = self.locationFactory.fromPath(r.path)
426 locations.append((location, r))
427 return locations
429 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
430 """Check that there is only one dataset associated with the
431 specified artifact.
433 Parameters
434 ----------
435 ref : `DatasetRef` or `FakeDatasetRef`
436 Dataset to be removed.
437 location : `Location`
438 The location of the artifact to be removed.
440 Returns
441 -------
442 can_remove : `bool`
443 True if the artifact can be safely removed.
444 """
446 # Get all entries associated with this path
447 allRefs = self._registered_refs_per_artifact(location.pathInStore)
448 if not allRefs: 448 ↛ 449: line 448 didn't jump to line 449, because the condition on line 448 was never true
449 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
451 # Remove these refs from all the refs and if there is nothing left
452 # then we can delete
453 remainingRefs = allRefs - {ref.id}
455 if remainingRefs:
456 return False
457 return True
459 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
460 StoredFileInfo]]:
461 """Predict the location and related file information of the requested
462 dataset in this datastore.
464 Parameters
465 ----------
466 ref : `DatasetRef`
467 Reference to the required `Dataset`.
469 Returns
470 -------
471 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
472 Expected Location of the dataset within the datastore and
473 placeholder information about each file and its formatter.
475 Notes
476 -----
477 Uses the current configuration to determine how we would expect the
478 datastore files to have been written if we couldn't ask registry.
479 This is safe so long as there has been no change to datastore
480 configuration between writing the dataset and wanting to read it.
481 Will not work for files that have been ingested without using the
482 standard file template or default formatter.
483 """
485 # If we have a component ref we always need to ask the questions
486 # of the composite. If the composite is disassembled this routine
487 # should return all components. If the composite was not
488 # disassembled the composite is what is stored regardless of
489 # component request. Note that if the caller has disassembled
490 # a composite there is no way for this guess to know that
491 # without trying both the composite and component ref and seeing
492 # if there is something at the component Location even without
493 # disassembly being enabled.
494 if ref.datasetType.isComponent():
495 ref = ref.makeCompositeRef()
497 # See if the ref is a composite that should be disassembled
498 doDisassembly = self.composites.shouldBeDisassembled(ref)
500 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
502 if doDisassembly:
503 for component, componentStorage in ref.datasetType.storageClass.components.items():
504 compRef = ref.makeComponentRef(component)
505 location, formatter = self._determine_put_formatter_location(compRef)
506 all_info.append((location, formatter, componentStorage, component))
508 else:
509 # Always use the composite ref if no disassembly
510 location, formatter = self._determine_put_formatter_location(ref)
511 all_info.append((location, formatter, ref.datasetType.storageClass, None))
513 # Convert the list of tuples to have StoredFileInfo as second element
514 return [(location, StoredFileInfo(formatter=formatter,
515 path=location.pathInStore.path,
516 storageClass=storageClass,
517 component=component,
518 checksum=None,
519 file_size=-1))
520 for location, formatter, storageClass, component in all_info]
522 def _prepare_for_get(self, ref: DatasetRef,
523 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
524 """Check parameters for ``get`` and obtain formatter and
525 location.
527 Parameters
528 ----------
529 ref : `DatasetRef`
530 Reference to the required Dataset.
531 parameters : `dict`
532 `StorageClass`-specific parameters that specify, for example,
533 a slice of the dataset to be loaded.
535 Returns
536 -------
537 getInfo : `list` [`DatastoreFileGetInformation`]
538 Parameters needed to retrieve each file.
539 """
540 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
542 # Get file metadata and internal metadata
543 fileLocations = self._get_dataset_locations_info(ref)
544 if not fileLocations:
545 if not self.trustGetRequest:
546 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
547 # Assume the dataset is where we think it should be
548 fileLocations = self._get_expected_dataset_locations_info(ref)
550 # The storage class we want to use eventually
551 refStorageClass = ref.datasetType.storageClass
553 if len(fileLocations) > 1:
554 disassembled = True
555 else:
556 disassembled = False
558 # Is this a component request?
559 refComponent = ref.datasetType.component()
561 fileGetInfo = []
562 for location, storedFileInfo in fileLocations:
564 # The storage class used to write the file
565 writeStorageClass = storedFileInfo.storageClass
567 # If this has been disassembled we need read to match the write
568 if disassembled:
569 readStorageClass = writeStorageClass
570 else:
571 readStorageClass = refStorageClass
573 formatter = getInstanceOf(storedFileInfo.formatter,
574 FileDescriptor(location, readStorageClass=readStorageClass,
575 storageClass=writeStorageClass, parameters=parameters),
576 ref.dataId)
578 formatterParams, notFormatterParams = formatter.segregateParameters()
580 # Of the remaining parameters, extract the ones supported by
581 # this StorageClass (for components not all will be handled)
582 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
584 # The ref itself could be a component if the dataset was
585 # disassembled by butler, or we disassembled in datastore and
586 # components came from the datastore records
587 component = storedFileInfo.component if storedFileInfo.component else refComponent
589 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
590 assemblerParams, formatterParams,
591 component, readStorageClass))
593 return fileGetInfo
595 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
596 """Check the arguments for ``put`` and obtain formatter and
597 location.
599 Parameters
600 ----------
601 inMemoryDataset : `object`
602 The dataset to store.
603 ref : `DatasetRef`
604 Reference to the associated Dataset.
606 Returns
607 -------
608 location : `Location`
609 The location to write the dataset.
610 formatter : `Formatter`
611 The `Formatter` to use to write the dataset.
613 Raises
614 ------
615 TypeError
616 Supplied object and storage class are inconsistent.
617 DatasetTypeNotSupportedError
618 The associated `DatasetType` is not handled by this datastore.
619 """
620 self._validate_put_parameters(inMemoryDataset, ref)
621 return self._determine_put_formatter_location(ref)
623 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
624 """Calculate the formatter and output location to use for put.
626 Parameters
627 ----------
628 ref : `DatasetRef`
629 Reference to the associated Dataset.
631 Returns
632 -------
633 location : `Location`
634 The location to write the dataset.
635 formatter : `Formatter`
636 The `Formatter` to use to write the dataset.
637 """
638 # Work out output file name
639 try:
640 template = self.templates.getTemplate(ref)
641 except KeyError as e:
642 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
644 # Validate the template to protect against filenames from different
645 # dataIds returning the same and causing overwrite confusion.
646 template.validateTemplate(ref)
648 location = self.locationFactory.fromPath(template.format(ref))
650 # Get the formatter based on the storage class
651 storageClass = ref.datasetType.storageClass
652 try:
653 formatter = self.formatterFactory.getFormatter(ref,
654 FileDescriptor(location,
655 storageClass=storageClass),
656 ref.dataId)
657 except KeyError as e:
658 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
659 f"{self.name}") from e
661 # Now that we know the formatter, update the location
662 location = formatter.makeUpdatedLocation(location)
664 return location, formatter
666 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
667 # Docstring inherited from base class
668 if transfer != "auto":
669 return transfer
671 # See if the paths are within the datastore or not
672 inside = [self._pathInStore(d.path) is not None for d in datasets]
674 if all(inside):
675 transfer = None
676 elif not any(inside): 676 ↛ 680: line 676 didn't jump to line 680, because the condition on line 676 was never false
677 # Allow ButlerURI to use its own knowledge
678 transfer = "auto"
679 else:
680 raise ValueError("Some datasets are inside the datastore and some are outside."
681 " Please use an explicit transfer mode and not 'auto'.")
683 return transfer
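# Standalone sketch of the "auto" transfer resolution above (helper name is
# hypothetical): paths already inside the datastore need no transfer, paths
# entirely outside let ButlerURI choose a mode, and a mixture is rejected.
def _resolve_auto_transfer(inside: List[bool]) -> Optional[str]:
    if all(inside):
        return None
    if not any(inside):
        return "auto"
    raise ValueError("Some datasets are inside the datastore and some are outside.")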
685 def _pathInStore(self, path: str) -> Optional[str]:
686 """Return path relative to datastore root
688 Parameters
689 ----------
690 path : `str`
691 Path to dataset. Can be absolute. If relative assumed to
692 be relative to the datastore root.
695 Returns
696 -------
697 inStore : `str`
698 Path relative to datastore root. Returns `None` if the file is
699 outside the root.
700 """
701 # Relative path will always be relative to datastore
702 pathUri = ButlerURI(path, forceAbsolute=False)
703 return pathUri.relative_to(self.root)
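# Hedged sketch of the containment test relied on above (helper name is
# hypothetical): relative_to yields a falsy value for URIs outside the root.
def _is_in_store(root: ButlerURI, path: str) -> bool:
    return bool(ButlerURI(path, forceAbsolute=False).relative_to(root))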
705 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
706 """Standardize the path of a to-be-ingested file.
708 Parameters
709 ----------
710 path : `str`
711 Path of a file to be ingested.
712 transfer : `str`, optional
713 How (and whether) the dataset should be added to the datastore.
714 See `ingest` for details of transfer modes.
715 This implementation is provided only so
716 `NotImplementedError` can be raised if the mode is not supported;
717 actual transfers are deferred to `_extractIngestInfo`.
719 Returns
720 -------
721 path : `str`
722 New path in what the datastore considers standard form.
724 Notes
725 -----
726 Subclasses of `FileDatastore` can implement this method instead
727 of `_prepIngest`. It should not modify the data repository or given
728 file in any way.
730 Raises
731 ------
732 NotImplementedError
733 Raised if the datastore does not support the given transfer mode
734 (including the case where ingest is not supported at all).
735 FileNotFoundError
736 Raised if one of the given files does not exist.
737 """
738 if transfer not in (None, "direct") + self.root.transferModes: 738 ↛ 739: line 738 didn't jump to line 739, because the condition on line 738 was never true
739 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
741 # A relative URI indicates relative to datastore root
742 srcUri = ButlerURI(path, forceAbsolute=False)
743 if not srcUri.isabs():
744 srcUri = self.root.join(path)
746 if not srcUri.exists():
747 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
748 f"are assumed to be relative to {self.root} unless they are absolute.")
750 if transfer is None:
751 relpath = srcUri.relative_to(self.root)
752 if not relpath:
753 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
754 f"within datastore ({self.root})")
756 # Return the relative path within the datastore for internal
757 # transfer
758 path = relpath
760 return path
762 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
763 formatter: Union[Formatter, Type[Formatter]],
764 transfer: Optional[str] = None) -> StoredFileInfo:
765 """Relocate (if necessary) and extract `StoredFileInfo` from a
766 to-be-ingested file.
768 Parameters
769 ----------
770 path : `str` or `ButlerURI`
771 URI or path of a file to be ingested.
772 ref : `DatasetRef`
773 Reference for the dataset being ingested. Guaranteed to have
774 ``dataset_id is not None``.
775 formatter : `type` or `Formatter`
776 `Formatter` subclass to use for this dataset or an instance.
777 transfer : `str`, optional
778 How (and whether) the dataset should be added to the datastore.
779 See `ingest` for details of transfer modes.
781 Returns
782 -------
783 info : `StoredFileInfo`
784 Internal datastore record for this file. This will be inserted by
785 the caller; `_extractIngestInfo` is only responsible for
786 creating and populating the struct.
788 Raises
789 ------
790 FileNotFoundError
791 Raised if one of the given files does not exist.
792 FileExistsError
793 Raised if transfer is not `None` but the (internal) location the
794 file would be moved to is already occupied.
795 """
796 if self._transaction is None: 796 ↛ 797: line 796 didn't jump to line 797, because the condition on line 796 was never true
797 raise RuntimeError("Ingest called without transaction enabled")
799 # Create URI of the source path, do not need to force a relative
800 # path to absolute.
801 srcUri = ButlerURI(path, forceAbsolute=False)
803 # Track whether we have read the size of the source yet
804 have_sized = False
806 tgtLocation: Optional[Location]
807 if transfer is None:
808 # A relative path is assumed to be relative to the datastore
809 # in this context
810 if not srcUri.isabs():
811 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
812 else:
813 # Work out the path in the datastore from an absolute URI
814 # This is required to be within the datastore.
815 pathInStore = srcUri.relative_to(self.root)
816 if pathInStore is None: 816 ↛ 817: line 816 didn't jump to line 817, because the condition on line 816 was never true
817 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
818 f"not within datastore {self.root}")
819 tgtLocation = self.locationFactory.fromPath(pathInStore)
820 elif transfer == "direct": 820 ↛ 825line 820 didn't jump to line 825, because the condition on line 820 was never true
821 # Want to store the full URI to the resource directly in
822 # datastore. This is useful for referring to permanent archive
823 # storage for raw data.
824 # Trust that people know what they are doing.
825 tgtLocation = None
826 else:
827 # Work out the name we want this ingested file to have
828 # inside the datastore
829 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
830 if not tgtLocation.uri.dirname().exists():
831 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
832 tgtLocation.uri.dirname().mkdir()
834 # if we are transferring from a local file to a remote location
835 # it may be more efficient to get the size and checksum of the
836 # local file rather than the transferred one
837 if not srcUri.scheme or srcUri.scheme == "file": 837 ↛ 843: line 837 didn't jump to line 843, because the condition on line 837 was never false
838 size = srcUri.size()
839 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
840 have_sized = True
842 # transfer the resource to the destination
843 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
845 if tgtLocation is None: 845 ↛ 847: line 845 didn't jump to line 847, because the condition on line 845 was never true
846 # This means we are using direct mode
847 targetUri = srcUri
848 targetPath = str(srcUri)
849 else:
850 targetUri = tgtLocation.uri
851 targetPath = tgtLocation.pathInStore.path
853 # the file should exist in the datastore now
854 if not have_sized:
855 size = targetUri.size()
856 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
858 return StoredFileInfo(formatter=formatter, path=targetPath,
859 storageClass=ref.datasetType.storageClass,
860 component=ref.datasetType.component(),
861 file_size=size, checksum=checksum)
863 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
864 # Docstring inherited from Datastore._prepIngest.
865 filtered = []
866 for dataset in datasets:
867 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
868 if not acceptable:
869 continue
870 else:
871 dataset.refs = acceptable
872 if dataset.formatter is None:
873 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
874 else:
875 assert isinstance(dataset.formatter, (type, str))
876 dataset.formatter = getClassOf(dataset.formatter)
877 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
878 filtered.append(dataset)
879 return _IngestPrepData(filtered)
881 @transactional
882 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
883 # Docstring inherited from Datastore._finishIngest.
884 refsAndInfos = []
885 for dataset in prepData.datasets:
886 # Do ingest as if the first dataset ref is associated with the file
887 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
888 transfer=transfer)
889 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
890 self._register_datasets(refsAndInfos)
892 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
893 formatter: Union[Formatter, Type[Formatter]]) -> Location:
894 """Given a source URI and a DatasetRef, determine the name the
895 dataset will have inside datastore.
897 Parameters
898 ----------
899 srcUri : `ButlerURI`
900 URI to the source dataset file.
901 ref : `DatasetRef`
902 Ref associated with the newly-ingested dataset artifact. This
903 is used to determine the name within the datastore.
904 formatter : `Formatter` or Formatter class.
905 Formatter to use for validation. Can be a class or an instance.
907 Returns
908 -------
909 location : `Location`
910 Target location for the newly-ingested dataset.
911 """
912 # Ingesting a file from outside the datastore.
913 # This involves a new name.
914 template = self.templates.getTemplate(ref)
915 location = self.locationFactory.fromPath(template.format(ref))
917 # Get the extension
918 ext = srcUri.getExtension()
920 # Update the destination to include that extension
921 location.updateExtension(ext)
923 # Ask the formatter to validate this extension
924 formatter.validateExtension(location)
926 return location
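# Minimal sketch of the naming logic above (helper name is hypothetical):
# format the file template for the ref, then carry over the source file's
# extension so the formatter can validate it.
def _predicted_ingest_location(datastore: FileDatastore, srcUri: ButlerURI, ref: DatasetRef) -> Location:
    template = datastore.templates.getTemplate(ref)
    location = datastore.locationFactory.fromPath(template.format(ref))
    location.updateExtension(srcUri.getExtension())
    return location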
928 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
929 """Write out in memory dataset to datastore.
931 Parameters
932 ----------
933 inMemoryDataset : `object`
934 Dataset to write to datastore.
935 ref : `DatasetRef`
936 Registry information associated with this dataset.
938 Returns
939 -------
940 info : `StoredFileInfo`
941 Information describing the artifact written to the datastore.
942 """
943 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
944 uri = location.uri
946 if not uri.dirname().exists():
947 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
948 uri.dirname().mkdir()
950 if self._transaction is None: 950 ↛ 951: line 950 didn't jump to line 951, because the condition on line 950 was never true
951 raise RuntimeError("Attempting to write artifact without transaction enabled")
953 def _removeFileExists(uri: ButlerURI) -> None:
954 """Remove a file and do not complain if it is not there.
956 This is important since a formatter might fail before the file
957 is written and we should not confuse people by writing spurious
958 error messages to the log.
959 """
960 try:
961 uri.remove()
962 except FileNotFoundError:
963 pass
965 # Register a callback to try to delete the uploaded data if
966 # something fails below
967 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
969 # For a local file, simply use the formatter directly
970 if uri.isLocal:
971 formatter.write(inMemoryDataset)
972 log.debug("Successfully wrote python object to local file at %s", uri)
973 else:
974 # This is a remote URI, so first try bytes and write directly else
975 # fallback to a temporary file
976 try:
977 serializedDataset = formatter.toBytes(inMemoryDataset)
978 log.debug("Writing bytes directly to %s", uri)
979 uri.write(serializedDataset, overwrite=True)
980 log.debug("Successfully wrote bytes directly to %s", uri)
981 except NotImplementedError:
982 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
983 # Need to configure the formatter to write to a different
984 # location and that needs us to overwrite internals
985 tmpLocation = Location(*os.path.split(tmpFile.name))
986 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
987 with formatter._updateLocation(tmpLocation):
988 formatter.write(inMemoryDataset)
989 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
990 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
992 # URI is needed to resolve which ingest case we are dealing with
993 return self._extractIngestInfo(uri, ref, formatter=formatter)
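# Condensed sketch of the remote-write fallback above (helper name is
# hypothetical): try a direct byte write first; if the formatter cannot
# serialize to bytes, write to a temporary local file and transfer it.
def _write_to_remote(formatter: Formatter, inMemoryDataset: Any, uri: ButlerURI) -> None:
    try:
        uri.write(formatter.toBytes(inMemoryDataset), overwrite=True)
    except NotImplementedError:
        with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
            tmpLocation = Location(*os.path.split(tmpFile.name))
            with formatter._updateLocation(tmpLocation):
                formatter.write(inMemoryDataset)
            uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)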
995 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
996 ref: DatasetRef, isComponent: bool = False) -> Any:
997 """Read the artifact from datastore into in memory object.
999 Parameters
1000 ----------
1001 getInfo : `DatastoreFileGetInformation`
1002 Information about the artifact within the datastore.
1003 ref : `DatasetRef`
1004 The registry information associated with this artifact.
1005 isComponent : `bool`
1006 Flag to indicate if a component is being read from this artifact.
1008 Returns
1009 -------
1010 inMemoryDataset : `object`
1011 The artifact as a python object.
1012 """
1013 location = getInfo.location
1014 uri = location.uri
1015 log.debug("Accessing data from %s", uri)
1017 # Cannot recalculate checksum but can compare size as a quick check
1018 # Do not do this if the size is negative since that indicates
1019 # we do not know.
1020 recorded_size = getInfo.info.file_size
1021 resource_size = uri.size()
1022 if recorded_size >= 0 and resource_size != recorded_size: 1022 ↛ 1023: line 1022 didn't jump to line 1023, because the condition on line 1022 was never true
1023 raise RuntimeError("Integrity failure in Datastore. "
1024 f"Size of file {uri} ({resource_size}) "
1025 f"does not match size recorded in registry of {recorded_size}")
1027 # For the general case we have choices for how to proceed.
1028 # 1. Always use a local file (downloading the remote resource to a
1029 # temporary file if needed).
1030 # 2. Use a threshold size and read into memory and use bytes.
1031 # Use both for now with an arbitrary hand off size.
1032 # This allows small datasets to be downloaded from remote object
1033 # stores without requiring a temporary file.
1035 formatter = getInfo.formatter
1036 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1037 if resource_size <= nbytes_max and formatter.can_read_bytes():
1038 serializedDataset = uri.read()
1039 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1040 f"component {getInfo.component}" if isComponent else "",
1041 len(serializedDataset), uri, formatter.name())
1042 try:
1043 result = formatter.fromBytes(serializedDataset,
1044 component=getInfo.component if isComponent else None)
1045 except Exception as e:
1046 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1047 f" ({ref.datasetType.name} from {uri}): {e}") from e
1048 else:
1049 # Read from file
1050 with uri.as_local() as local_uri:
1051 # Have to update the Location associated with the formatter
1052 # because formatter.read does not allow an override.
1053 # This could be improved.
1054 msg = ""
1055 newLocation = None
1056 if uri != local_uri:
1057 newLocation = Location(*local_uri.split())
1058 msg = "(via download to local file)"
1060 log.debug("Reading %s from location %s %s with formatter %s",
1061 f"component {getInfo.component}" if isComponent else "",
1062 uri, msg, formatter.name())
1063 try:
1064 with formatter._updateLocation(newLocation):
1065 result = formatter.read(component=getInfo.component if isComponent else None)
1066 except Exception as e:
1067 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1068 f" ({ref.datasetType.name} from {uri}): {e}") from e
1070 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1071 isComponent=isComponent)
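# Standalone sketch of the read-path decision above (helper name is
# hypothetical): small artifacts whose formatter supports reading from bytes
# are deserialized directly, larger or unsupported ones go through a
# (possibly downloaded) local file.
def _choose_read_mode(resource_size: int, formatter: Formatter) -> str:
    nbytes_max = 10_000_000  # same hand-off threshold as above
    if resource_size <= nbytes_max and formatter.can_read_bytes():
        return "bytes"
    return "local file"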
1073 def exists(self, ref: DatasetRef) -> bool:
1074 """Check if the dataset exists in the datastore.
1076 Parameters
1077 ----------
1078 ref : `DatasetRef`
1079 Reference to the required dataset.
1081 Returns
1082 -------
1083 exists : `bool`
1084 `True` if the entity exists in the `Datastore`.
1085 """
1086 fileLocations = self._get_dataset_locations_info(ref)
1088 # if we are being asked to trust that registry might not be correct
1089 # we ask for the expected locations and check them explicitly
1090 if not fileLocations:
1091 if not self.trustGetRequest:
1092 return False
1093 fileLocations = self._get_expected_dataset_locations_info(ref)
1094 for location, _ in fileLocations:
1095 if not self._artifact_exists(location):
1096 return False
1098 return True
1100 def getURIs(self, ref: DatasetRef,
1101 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1102 """Return URIs associated with dataset.
1104 Parameters
1105 ----------
1106 ref : `DatasetRef`
1107 Reference to the required dataset.
1108 predict : `bool`, optional
1109 If the datastore does not know about the dataset, should it
1110 return a predicted URI or not?
1112 Returns
1113 -------
1114 primary : `ButlerURI`
1115 The URI to the primary artifact associated with this dataset.
1116 If the dataset was disassembled within the datastore this
1117 may be `None`.
1118 components : `dict`
1119 URIs to any components associated with the dataset artifact.
1120 Can be empty if there are no components.
1121 """
1123 primary: Optional[ButlerURI] = None
1124 components: Dict[str, ButlerURI] = {}
1126 # if this has never been written then we have to guess
1127 if not self.exists(ref):
1128 if not predict:
1129 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1131 doDisassembly = self.composites.shouldBeDisassembled(ref)
1133 if doDisassembly:
1135 for component, componentStorage in ref.datasetType.storageClass.components.items():
1136 compRef = ref.makeComponentRef(component)
1137 compLocation, _ = self._determine_put_formatter_location(compRef)
1139 # Add a URI fragment to indicate this is a guess
1140 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1142 else:
1144 location, _ = self._determine_put_formatter_location(ref)
1146 # Add a URI fragment to indicate this is a guess
1147 primary = ButlerURI(location.uri.geturl() + "#predicted")
1149 return primary, components
1151 # If this is a ref that we have written we can get the path.
1152 # Get file metadata and internal metadata
1153 fileLocations = self._get_dataset_locations_info(ref)
1155 guessing = False
1156 if not fileLocations:
1157 if not self.trustGetRequest: 1157 ↛ 1158: line 1157 didn't jump to line 1158, because the condition on line 1157 was never true
1158 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1159 fileLocations = self._get_expected_dataset_locations_info(ref)
1160 guessing = True
1162 if len(fileLocations) == 1:
1163 # No disassembly so this is the primary URI
1164 uri = fileLocations[0][0].uri
1165 if guessing and not uri.exists(): 1165 ↛ 1166: line 1165 didn't jump to line 1166, because the condition on line 1165 was never true
1166 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1167 primary = uri
1169 else:
1170 for location, storedFileInfo in fileLocations:
1171 if storedFileInfo.component is None: 1171 ↛ 1172: line 1171 didn't jump to line 1172, because the condition on line 1171 was never true
1172 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1173 uri = location.uri
1174 if guessing and not uri.exists(): 1174 ↛ 1175: line 1174 didn't jump to line 1175, because the condition on line 1174 was never true
1175 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1176 components[storedFileInfo.component] = uri
1178 return primary, components
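# Hedged usage sketch (helper name is hypothetical): with predict=True a
# dataset unknown to the datastore comes back with "#predicted" URI fragments
# instead of raising FileNotFoundError.
def _describe_artifacts(datastore: FileDatastore, ref: DatasetRef) -> None:
    primary, components = datastore.getURIs(ref, predict=True)
    if primary is not None:
        log.info("Single artifact at %s", primary)
    else:
        for name, uri in components.items():
            log.info("Component %s stored at %s", name, uri)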
1180 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1181 """URI to the Dataset.
1183 Parameters
1184 ----------
1185 ref : `DatasetRef`
1186 Reference to the required Dataset.
1187 predict : `bool`
1188 If `True`, allow URIs to be returned of datasets that have not
1189 been written.
1191 Returns
1192 -------
1193 uri : `ButlerURI`
1194 URI pointing to the dataset within the datastore. If the
1195 dataset does not exist in the datastore, and if ``predict`` is
1196 `True`, the URI will be a prediction and will include a URI
1197 fragment "#predicted".
1198 If the datastore does not have entities that relate well
1199 to the concept of a URI the returned URI will be
1200 descriptive. The returned URI is not guaranteed to be obtainable.
1202 Raises
1203 ------
1204 FileNotFoundError
1205 Raised if a URI has been requested for a dataset that does not
1206 exist and guessing is not allowed.
1207 RuntimeError
1208 Raised if a request is made for a single URI but multiple URIs
1209 are associated with this dataset.
1211 Notes
1212 -----
1213 When a predicted URI is requested an attempt will be made to form
1214 a reasonable URI based on file templates and the expected formatter.
1215 """
1216 primary, components = self.getURIs(ref, predict)
1217 if primary is None or components: 1217 ↛ 1218: line 1217 didn't jump to line 1218, because the condition on line 1217 was never true
1218 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1219 "Use Dataastore.getURIs() instead.")
1220 return primary
1222 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1223 """Load an InMemoryDataset from the store.
1225 Parameters
1226 ----------
1227 ref : `DatasetRef`
1228 Reference to the required Dataset.
1229 parameters : `dict`
1230 `StorageClass`-specific parameters that specify, for example,
1231 a slice of the dataset to be loaded.
1233 Returns
1234 -------
1235 inMemoryDataset : `object`
1236 Requested dataset or slice thereof as an InMemoryDataset.
1238 Raises
1239 ------
1240 FileNotFoundError
1241 Requested dataset can not be retrieved.
1242 TypeError
1243 Return value from formatter has unexpected type.
1244 ValueError
1245 Formatter failed to process the dataset.
1246 """
1247 allGetInfo = self._prepare_for_get(ref, parameters)
1248 refComponent = ref.datasetType.component()
1250 # Supplied storage class for the component being read
1251 refStorageClass = ref.datasetType.storageClass
1253 # Create mapping from component name to related info
1254 allComponents = {i.component: i for i in allGetInfo}
1256 # By definition the dataset is disassembled if we have more
1257 # than one record for it.
1258 isDisassembled = len(allGetInfo) > 1
1260 # Look for the special case where we are disassembled but the
1261 # component is a derived component that was not written during
1262 # disassembly. For this scenario we need to check that the
1263 # component requested is listed as a derived component for the
1264 # composite storage class
1265 isDisassembledReadOnlyComponent = False
1266 if isDisassembled and refComponent:
1267 # The composite storage class should be accessible through
1268 # the component dataset type
1269 compositeStorageClass = ref.datasetType.parentStorageClass
1271 # In the unlikely scenario where the composite storage
1272 # class is not known, we can only assume that this is a
1273 # normal component. If that assumption is wrong then the
1274 # branch below that reads a persisted component will fail
1275 # so there is no need to complain here.
1276 if compositeStorageClass is not None: 1276 ↛ 1279: line 1276 didn't jump to line 1279, because the condition on line 1276 was never false
1277 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1279 if isDisassembled and not refComponent:
1280 # This was a disassembled dataset spread over multiple files
1281 # and we need to put them all back together again.
1282 # Read into memory and then assemble
1284 # Check that the supplied parameters are suitable for the type read
1285 refStorageClass.validateParameters(parameters)
1287 # We want to keep track of all the parameters that were not used
1288 # by formatters. We assume that if any of the component formatters
1289 # use a parameter that we do not need to apply it again in the
1290 # assembler.
1291 usedParams = set()
1293 components: Dict[str, Any] = {}
1294 for getInfo in allGetInfo:
1295 # assemblerParams are parameters not understood by the
1296 # associated formatter.
1297 usedParams.update(set(getInfo.formatterParams))
1299 component = getInfo.component
1301 if component is None: 1301 ↛ 1302: line 1301 didn't jump to line 1302, because the condition on line 1301 was never true
1302 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1304 # We do not want the formatter to think it's reading
1305 # a component though because it is really reading a
1306 # standalone dataset -- always tell reader it is not a
1307 # component.
1308 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1310 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1312 # Any unused parameters will have to be passed to the assembler
1313 if parameters:
1314 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1315 else:
1316 unusedParams = {}
1318 # Process parameters
1319 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1320 parameters=unusedParams)
1322 elif isDisassembledReadOnlyComponent:
1324 compositeStorageClass = ref.datasetType.parentStorageClass
1325 if compositeStorageClass is None: 1325 ↛ 1326: line 1325 didn't jump to line 1326, because the condition on line 1325 was never true
1326 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1327 "no composite storage class is available.")
1329 if refComponent is None: 1329 ↛ 1331: line 1329 didn't jump to line 1331, because the condition on line 1329 was never true
1330 # Mainly for mypy
1331 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1333 # Assume that every derived component can be calculated by
1334 # forwarding the request to a single read/write component.
1335 # Rather than guessing which rw component is the right one by
1336 # scanning each for a derived component of the same name,
1337 # we ask the storage class delegate directly which one is best to
1338 # use.
1339 compositeDelegate = compositeStorageClass.delegate()
1340 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1341 set(allComponents))
1343 # Select the relevant component
1344 rwInfo = allComponents[forwardedComponent]
1346 # For now assume that read parameters are validated against
1347 # the real component and not the requested component
1348 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1349 forwardedStorageClass.validateParameters(parameters)
1351 # Unfortunately the FileDescriptor inside the formatter will have
1352 # the wrong write storage class so we need to create a new one
1353 # given the immutability constraint.
1354 writeStorageClass = rwInfo.info.storageClass
1356 # We may need to put some thought into parameters for read
1357 # components but for now forward them on as is
1358 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1359 readStorageClass=refStorageClass,
1360 storageClass=writeStorageClass,
1361 parameters=parameters),
1362 ref.dataId)
1364 # The assembler can not receive any parameter requests for a
1365 # derived component at this time since the assembler will
1366 # see the storage class of the derived component and those
1367 # parameters will have to be handled by the formatter on the
1368 # forwarded storage class.
1369 assemblerParams: Dict[str, Any] = {}
1371 # Need to create a new info that specifies the derived
1372 # component and associated storage class
1373 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1374 rwInfo.info, assemblerParams, {},
1375 refComponent, refStorageClass)
1377 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1379 else:
1380 # Single file request or component from that composite file
1381 for lookup in (refComponent, None): 1381 ↛ 1386: line 1381 didn't jump to line 1386, because the loop on line 1381 didn't complete
1382 if lookup in allComponents: 1382 ↛ 1381: line 1382 didn't jump to line 1381, because the condition on line 1382 was never false
1383 getInfo = allComponents[lookup]
1384 break
1385 else:
1386 raise FileNotFoundError(f"Component {refComponent} not found "
1387 f"for ref {ref} in datastore {self.name}")
1389 # Do not need the component itself if already disassembled
1390 if isDisassembled:
1391 isComponent = False
1392 else:
1393 isComponent = getInfo.component is not None
1395 # For a disassembled component we can validate parameters against
1396 # the component storage class directly
1397 if isDisassembled:
1398 refStorageClass.validateParameters(parameters)
1399 else:
1400 # For an assembled composite this could be a derived
1401 # component derived from a real component. The validity
1402 # of the parameters is not clear. For now validate against
1403 # the composite storage class
1404 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1406 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
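# Hedged usage sketch (helper name is hypothetical): a component read goes
# through the same get() path and is satisfied either from its own
# disassembled file or by forwarding to the responsible read/write component.
def _read_component(datastore: FileDatastore, ref: DatasetRef, component: str) -> Any:
    return datastore.get(ref.makeComponentRef(component), parameters=None)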
1408 @transactional
1409 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1410 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1412 Parameters
1413 ----------
1414 inMemoryDataset : `object`
1415 The dataset to store.
1416 ref : `DatasetRef`
1417 Reference to the associated Dataset.
1419 Raises
1420 ------
1421 TypeError
1422 Supplied object and storage class are inconsistent.
1423 DatasetTypeNotSupportedError
1424 The associated `DatasetType` is not handled by this datastore.
1426 Notes
1427 -----
1428 If the datastore is configured to reject certain dataset types it
1429 is possible that the put will fail and raise a
1430 `DatasetTypeNotSupportedError`. The main use case for this is to
1431 allow `ChainedDatastore` to put to multiple datastores without
1432 requiring that every datastore accepts the dataset.
1433 """
1435 doDisassembly = self.composites.shouldBeDisassembled(ref)
1436 # doDisassembly = True
1438 artifacts = []
1439 if doDisassembly:
1440 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1441 for component, componentInfo in components.items():
1442 # Don't recurse because we want to take advantage of
1443 # bulk insert -- need a new DatasetRef that refers to the
1444 # same dataset_id but has the component DatasetType
1445 # DatasetType does not refer to the types of components
1446 # So we construct one ourselves.
1447 compRef = ref.makeComponentRef(component)
1448 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1449 artifacts.append((compRef, storedInfo))
1450 else:
1451 # Write the entire thing out
1452 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1453 artifacts.append((ref, storedInfo))
1455 self._register_datasets(artifacts)
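# Hedged usage sketch (helper name is hypothetical): the composites map decides
# at put time whether the object is stored as one artifact or one artifact per
# component; the same DatasetRef is used for later get() and exists() calls.
def _store(datastore: FileDatastore, inMemoryDataset: Any, ref: DatasetRef) -> None:
    datastore.put(inMemoryDataset, ref)   # may raise DatasetTypeNotSupportedError
    assert datastore.exists(ref)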
1457 @transactional
1458 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1459 """Indicate to the datastore that a dataset can be removed.
1461 Parameters
1462 ----------
1463 ref : `DatasetRef`
1464 Reference to the required Dataset.
1465 ignore_errors : `bool`
1466 If `True` return without error even if something went wrong.
1467 Problems could occur if another process is simultaneously trying
1468 to delete.
1470 Raises
1471 ------
1472 FileNotFoundError
1473 Attempt to remove a dataset that does not exist.
1474 """
1475 # Get file metadata and internal metadata
1476 log.debug("Trashing %s in datastore %s", ref, self.name)
1478 fileLocations = self._get_dataset_locations_info(ref)
1480 if not fileLocations:
1481 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1482 if ignore_errors:
1483 log.warning(err_msg)
1484 return
1485 else:
1486 raise FileNotFoundError(err_msg)
1488 for location, storedFileInfo in fileLocations:
1489 if not self._artifact_exists(location): 1489 ↛ 1490: line 1489 didn't jump to line 1490, because the condition on line 1489 was never true
1490 err_msg = f"Dataset is known to datastore {self.name} but " \
1491 f"associated artifact ({location.uri}) is missing"
1492 if ignore_errors:
1493 log.warning(err_msg)
1494 return
1495 else:
1496 raise FileNotFoundError(err_msg)
1498 # Mark dataset as trashed
1499 try:
1500 self._move_to_trash_in_registry(ref)
1501 except Exception as e:
1502 if ignore_errors:
1503 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1504 f"but encountered an error: {e}")
1505 pass
1506 else:
1507 raise
1509 @transactional
1510 def emptyTrash(self, ignore_errors: bool = True) -> None:
1511 """Remove all datasets from the trash.
1513 Parameters
1514 ----------
1515 ignore_errors : `bool`
1516 If `True` return without error even if something went wrong.
1517 Problems could occur if another process is simultaneously trying
1518 to delete.
1519 """
1520 log.debug("Emptying trash in datastore %s", self.name)
1521 # Context manager will empty trash iff we finish it without raising.
1522 with self.bridge.emptyTrash() as trashed:
1523 for ref in trashed:
1524 fileLocations = self._get_dataset_locations_info(ref)
1526 if not fileLocations: 1526 ↛ 1527line 1526 didn't jump to line 1527, because the condition on line 1526 was never true
1527 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1528 if ignore_errors:
1529 log.warning(err_msg)
1530 continue
1531 else:
1532 raise FileNotFoundError(err_msg)
1534 for location, _ in fileLocations:
1536 if not self._artifact_exists(location): 1536 ↛ 1537line 1536 didn't jump to line 1537, because the condition on line 1536 was never true
1537 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1538 if ignore_errors:
1539 log.warning(err_msg)
1540 continue
1541 else:
1542 raise FileNotFoundError(err_msg)
1544 # Can only delete the artifact if there are no references
1545 # to the file from untrashed dataset refs.
1546 if self._can_remove_dataset_artifact(ref, location):
1547 # Point of no return for this artifact
1548 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1549 try:
1550 self._delete_artifact(location)
1551 except Exception as e:
1552 if ignore_errors:
1553 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1554 location.uri, self.name, e)
1555 else:
1556 raise
1558 # Now we must remove the entry from the internal registry even
1559 # if the artifact removal failed and was ignored; otherwise the
1560 # removal check above will never succeed.
1561 try:
1562 # There may be multiple rows associated with this ref
1563 # depending on disassembly
1564 self.removeStoredItemInfo(ref)
1565 except Exception as e:
1566 if ignore_errors:
1567 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1568 ref.id, location.uri, self.name, e)
1569 continue
1570 else:
1571 raise
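# Illustrative sketch, not part of fileDatastore.py: the two-phase removal
# flow formed by trash() and emptyTrash(). The helper name is hypothetical.
from typing import Iterable
from lsst.daf.butler import DatasetRef

def remove_datasets(datastore: "FileDatastore", refs: Iterable[DatasetRef]) -> None:
    for ref in refs:
        # Phase 1: mark each dataset as trashed (transactional; nothing is
        # deleted yet).
        datastore.trash(ref)
    # Phase 2: delete artifacts and internal records; a file is only removed
    # once no untrashed refs still point at it.
    datastore.emptyTrash()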
1573 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1574 logFailures: bool = False) -> None:
1575 """Validate some of the configuration for this datastore.
1577 Parameters
1578 ----------
1579 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1580 Entities to test against this configuration. Can be differing
1581 types.
1582 logFailures : `bool`, optional
1583 If `True`, output a log message for every validation error
1584 detected.
1586 Raises
1587 ------
1588 DatastoreValidationError
1589 Raised if there is a validation problem with a configuration.
1590 All the problems are reported in a single exception.
1592 Notes
1593 -----
1594 This method checks that all the supplied entities have valid file
1595 templates and also have formatters defined.
1596 """
1598 templateFailed = None
1599 try:
1600 self.templates.validateTemplates(entities, logFailures=logFailures)
1601 except FileTemplateValidationError as e:
1602 templateFailed = str(e)
1604 formatterFailed = []
1605 for entity in entities:
1606 try:
1607 self.formatterFactory.getFormatterClass(entity)
1608 except KeyError as e:
1609 formatterFailed.append(str(e))
1610 if logFailures: 1610 ↛ 1605line 1610 didn't jump to line 1605, because the condition on line 1610 was never false
1611 log.critical("Formatter failure: %s", e)
1613 if templateFailed or formatterFailed:
1614 messages = []
1615 if templateFailed: 1615 ↛ 1616line 1615 didn't jump to line 1616, because the condition on line 1615 was never true
1616 messages.append(templateFailed)
1617 if formatterFailed: 1617 ↛ 1619line 1617 didn't jump to line 1619, because the condition on line 1617 was never false
1618 messages.append(",".join(formatterFailed))
1619 msg = ";\n".join(messages)
1620 raise DatastoreValidationError(msg)
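# Illustrative sketch, not part of fileDatastore.py: validating that a set of
# dataset types has both file templates and formatters before any puts are
# attempted. The helper name and arguments are hypothetical.
from typing import Iterable
from lsst.daf.butler import DatasetType, DatastoreValidationError

def configuration_is_valid(datastore: "FileDatastore",
                           dataset_types: Iterable[DatasetType]) -> bool:
    try:
        # Template and formatter problems are collected and reported in a
        # single DatastoreValidationError.
        datastore.validateConfiguration(dataset_types, logFailures=True)
    except DatastoreValidationError as e:
        print(f"Datastore {datastore.name} failed validation:\n{e}")
        return False
    return True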
1622 def getLookupKeys(self) -> Set[LookupKey]:
1623 # Docstring is inherited from base class
1624 return (self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys()
1625 | self.constraints.getLookupKeys())
1627 def validateKey(self, lookupKey: LookupKey,
1628 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1629 # Docstring is inherited from base class
1630 # The key can be valid in either formatters or templates so we can
1631 # only check the template if it exists
1632 if lookupKey in self.templates:
1633 try:
1634 self.templates[lookupKey].validateTemplate(entity)
1635 except FileTemplateValidationError as e:
1636 raise DatastoreValidationError(e) from e
1638 def export(self, refs: Iterable[DatasetRef], *,
1639 directory: Optional[Union[ButlerURI, str]] = None,
1640 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1641 # Docstring inherited from Datastore.export.
1642 if transfer is not None and directory is None: 1642 ↛ 1643line 1642 didn't jump to line 1643, because the condition on line 1642 was never true
1643 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1644 "export directory given")
1646 # Force the directory to be a URI object
1647 directoryUri: Optional[ButlerURI] = None
1648 if directory is not None: 1648 ↛ 1651line 1648 didn't jump to line 1651, because the condition on line 1648 was never false
1649 directoryUri = ButlerURI(directory, forceDirectory=True)
1651 if transfer is not None and directoryUri is not None: 1651 ↛ 1656line 1651 didn't jump to line 1656, because the condition on line 1651 was never false
1652 # mypy needs the second test
1653 if not directoryUri.exists(): 1653 ↛ 1654line 1653 didn't jump to line 1654, because the condition on line 1653 was never true
1654 raise FileNotFoundError(f"Export location {directory} does not exist")
1656 for ref in refs:
1657 fileLocations = self._get_dataset_locations_info(ref)
1658 if not fileLocations: 1658 ↛ 1659line 1658 didn't jump to line 1659, because the condition on line 1658 was never true
1659 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1660 # For now we cannot export disassembled datasets.
1661 if len(fileLocations) > 1:
1662 raise NotImplementedError(f"Cannot export disassembled datasets such as {ref}")
1663 location, storedFileInfo = fileLocations[0]
1665 pathInStore = location.pathInStore.path
1666 if transfer is None: 1666 ↛ 1669line 1666 didn't jump to line 1669, because the condition on line 1666 was never true
1667 # TODO: do we also need to return the readStorageClass somehow?
1668 # We will use the path in store directly
1669 pass
1670 elif transfer == "direct": 1670 ↛ 1672line 1670 didn't jump to line 1672, because the condition on line 1670 was never true
1671 # Use full URIs to the remote store in the export
1672 pathInStore = str(location.uri)
1673 else:
1674 # mypy needs help
1675 assert directoryUri is not None, "directoryUri must be defined to get here"
1676 storeUri = ButlerURI(location.uri)
1678 # If the datastore has an absolute URI to a resource, we
1679 # have two options:
1680 # 1. Keep the absolute URI in the exported YAML
1681 # 2. Allocate a new name in the local datastore and transfer
1682 # it.
1683 # For now, go with option 2.
1684 if location.pathInStore.isabs(): 1684 ↛ 1685line 1684 didn't jump to line 1685, because the condition on line 1684 was never true
1685 template = self.templates.getTemplate(ref)
1686 pathInStore = template.format(ref)
1688 exportUri = directoryUri.join(pathInStore)
1689 exportUri.transfer_from(storeUri, transfer=transfer)
1691 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
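# Illustrative sketch, not part of fileDatastore.py: exporting datasets to a
# local directory. export() is a generator, so it must be consumed for the
# transfers to run; the helper name and default directory are hypothetical.
from typing import Iterable, List
from lsst.daf.butler import DatasetRef, FileDataset

def export_to_directory(datastore: "FileDatastore", refs: Iterable[DatasetRef],
                        directory: str = "/tmp/butler_export") -> List[FileDataset]:
    # Materialize the generator so every artifact is transferred before
    # returning; each FileDataset records the exported path and the
    # formatter needed to read the file back on import.
    return list(datastore.export(refs, directory=directory, transfer="copy"))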
1693 @staticmethod
1694 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1695 """Compute the checksum of the supplied file.
1697 Parameters
1698 ----------
1699 uri : `ButlerURI`
1700 URI of the resource from which to calculate the checksum.
1701 algorithm : `str`, optional
1702 Name of algorithm to use. Must be one of the algorithms supported
1703 by :py:mod:`hashlib`.
1704 block_size : `int`
1705 Number of bytes to read from file at one time.
1707 Returns
1708 -------
1709 hexdigest : `str` or `None`
1710 Hex digest of the file.
1712 Notes
1713 -----
1714 Currently returns `None` if the URI is for a remote resource.
1715 """
1716 if algorithm not in hashlib.algorithms_guaranteed: 1716 ↛ 1717line 1716 didn't jump to line 1717, because the condition on line 1716 was never true
1717 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
1719 if not uri.isLocal: 1719 ↛ 1720line 1719 didn't jump to line 1720, because the condition on line 1719 was never true
1720 return None
1722 hasher = hashlib.new(algorithm)
1724 with uri.as_local() as local_uri:
1725 with open(local_uri.ospath, "rb") as f:
1726 for chunk in iter(lambda: f.read(block_size), b""):
1727 hasher.update(chunk)
1729 return hasher.hexdigest()
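# Illustrative sketch, not part of fileDatastore.py: the same chunked-hashing
# pattern using plain hashlib on a local path, runnable on its own. The
# example path is hypothetical.
import hashlib

def checksum_file(path: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    if algorithm not in hashlib.algorithms_guaranteed:
        raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
    hasher = hashlib.new(algorithm)
    # Read in fixed-size chunks so large files never have to fit in memory.
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

# checksum_file("/tmp/example.fits")  # hypothetical local file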