Coverage for python/lsst/daf/butler/datastores/fileDatastore.py : 84%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from dataclasses import dataclass
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreConfig,
60 DatastoreValidationError,
61 FileDescriptor,
62 FileTemplates,
63 FileTemplateValidationError,
64 Formatter,
65 FormatterFactory,
66 Location,
67 LocationFactory,
68 StorageClass,
69 StoredFileInfo,
70)
72from lsst.daf.butler import ddl
73from lsst.daf.butler.registry.interfaces import (
74 ReadOnlyDatabaseError,
75 DatastoreRegistryBridge,
76)
78from lsst.daf.butler.core.repoRelocation import replaceRoot
79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
80from .genericDatastore import GenericBaseDatastore
82if TYPE_CHECKING:  # coverage: 82 ↛ 83 (condition on line 82 was never true)
83 from lsst.daf.butler import LookupKey
84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
86log = logging.getLogger(__name__)
88# String to use when a Python None is encountered
89NULLSTR = "__NULL_STRING__"
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
100 def __init__(self, datasets: List[FileDataset]):
101 super().__init__(ref for dataset in datasets for ref in dataset.refs)
102 self.datasets = datasets
105@dataclass(frozen=True)
106class DatastoreFileGetInformation:
107 """Collection of useful parameters needed to retrieve a file from
108 a Datastore.
109 """
111 location: Location
112 """The location from which to read the dataset."""
114 formatter: Formatter
115 """The `Formatter` to use to deserialize the dataset."""
117 info: StoredFileInfo
118 """Stored information about this file and its formatter."""
120 assemblerParams: Dict[str, Any]
121 """Parameters to use for post-processing the retrieved dataset."""
123 formatterParams: Dict[str, Any]
124 """Parameters that were understood by the associated formatter."""
126 component: Optional[str]
127 """The component to be retrieved (can be `None`)."""
129 readStorageClass: StorageClass
130 """The `StorageClass` of the dataset being read."""
133class FileDatastore(GenericBaseDatastore):
134 """Generic Datastore for file-based implementations.
136 Should always be sub-classed since key abstract methods are missing.
138 Parameters
139 ----------
140 config : `DatastoreConfig` or `str`
141 Configuration as either a `Config` object or URI to file.
142 bridgeManager : `DatastoreRegistryBridgeManager`
143 Object that manages the interface between `Registry` and datastores.
144 butlerRoot : `str`, optional
145 New datastore root to use to override the configuration value.
147 Raises
148 ------
149 ValueError
150 If root location does not exist and ``create`` is `False` in the
151 configuration.
152 """
154 defaultConfigFile: ClassVar[Optional[str]] = None
155 """Path to configuration defaults. Accessed within the ``config`` resource
156 or relative to a search path. Can be None if no defaults specified.
157 """
159 root: ButlerURI
160 """Root directory URI of this `Datastore`."""
162 locationFactory: LocationFactory
163 """Factory for creating locations relative to the datastore root."""
165 formatterFactory: FormatterFactory
166 """Factory for creating instances of formatters."""
168 templates: FileTemplates
169 """File templates that can be used by this `Datastore`."""
171 composites: CompositesMap
172 """Determines whether a dataset should be disassembled on put."""
174 defaultConfigFile = "datastores/fileDatastore.yaml"
175 """Path to configuration defaults. Accessed within the ``config`` resource
176 or relative to a search path. Can be None if no defaults specified.
177 """
179 @classmethod
180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
181 """Set any filesystem-dependent config options for this Datastore to
182 be appropriate for a new empty repository with the given root.
184 Parameters
185 ----------
186 root : `str`
187 URI to the root of the data repository.
188 config : `Config`
189 A `Config` to update. Only the subset understood by
190 this component will be updated. Will not expand
191 defaults.
192 full : `Config`
193 A complete config with all defaults expanded that can be
194 converted to a `DatastoreConfig`. Read-only and will not be
195 modified by this method.
196 Repository-specific options that should not be obtained
197 from defaults when Butler instances are constructed
198 should be copied from ``full`` to ``config``.
199 overwrite : `bool`, optional
200 If `False`, do not modify a value in ``config`` if the value
201 already exists. Default is always to overwrite with the provided
202 ``root``.
204 Notes
205 -----
206 If a keyword is explicitly defined in the supplied ``config`` it
207 will not be overridden by this method if ``overwrite`` is `False`.
208 This allows explicit values set in external configs to be retained.
209 """
210 Config.updateParameters(DatastoreConfig, config, full,
211 toUpdate={"root": root},
212 toCopy=("cls", ("records", "table")), overwrite=overwrite)
214 @classmethod
215 def makeTableSpec(cls) -> ddl.TableSpec:
216 return ddl.TableSpec(
217 fields=[
218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
222 # Use empty string to indicate no component
223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
224 # TODO: should checksum be Base64Bytes instead?
225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
227 ],
228 unique=frozenset(),
229 )
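# Editor's note: an illustrative sketch (not part of the original source) of a
# record matching the spec above, mirroring the dicts built by
# addStoredItemInfo() further down.  Every value is a placeholder and the
# formatter path is hypothetical.
example_record = {
    "dataset_id": 42,
    "path": "relative/path/inside/datastore.fits",
    "formatter": "mypackage.formatters.MyFormatter",   # hypothetical fully-qualified class
    "storage_class": "StructuredDataDict",             # placeholder storage class name
    "component": "__NULL_STRING__",                    # NULLSTR sentinel meaning "no component"
    "checksum": None,                                  # only filled in when useChecksum is True
    "file_size": 12345,
}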
231 def __init__(self, config: Union[DatastoreConfig, str],
232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
233 super().__init__(config, bridgeManager)
 234 if "root" not in self.config:  # coverage: 234 ↛ 235 (condition on line 234 was never true)
235 raise ValueError("No root directory specified in configuration")
237 # Name ourselves either using an explicit name or a name
238 # derived from the (unexpanded) root
239 if "name" in self.config:
240 self.name = self.config["name"]
241 else:
242 # We use the unexpanded root in the name to indicate that this
243 # datastore can be moved without having to update registry.
244 self.name = "{}@{}".format(type(self).__name__,
245 self.config["root"])
247 # Support repository relocation in config
248 # Existence of self.root is checked in subclass
249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
250 forceDirectory=True, forceAbsolute=True)
252 self.locationFactory = LocationFactory(self.root)
253 self.formatterFactory = FormatterFactory()
255 # Now associate formatters with storage classes
256 self.formatterFactory.registerFormatters(self.config["formatters"],
257 universe=bridgeManager.universe)
259 # Read the file naming templates
260 self.templates = FileTemplates(self.config["templates"],
261 universe=bridgeManager.universe)
263 # See if composites should be disassembled
264 self.composites = CompositesMap(self.config["composites"],
265 universe=bridgeManager.universe)
267 tableName = self.config["records", "table"]
268 try:
269 # Storage of paths and formatters, keyed by dataset_id
270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
271 # Interface to Registry.
272 self._bridge = bridgeManager.register(self.name)
273 except ReadOnlyDatabaseError:
274 # If the database is read only and we just tried and failed to
275 # create a table, it means someone is trying to create a read-only
276 # butler client for an empty repo. That should be okay, as long
277 # as they then try to get any datasets before some other client
 278 # creates the table. Chances are they're just validating
279 # configuration.
280 pass
282 # Determine whether checksums should be used - default to False
283 self.useChecksum = self.config.get("checksum", False)
285 # Determine whether we can fall back to configuration if a
286 # requested dataset is not known to registry
287 self.trustGetRequest = self.config.get("trust_get_request", False)
289 # Check existence and create directory structure if necessary
290 if not self.root.exists():
 291 if "create" not in self.config or not self.config["create"]:  # coverage: 291 ↛ 292 (condition on line 291 was never true)
292 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
293 try:
294 self.root.mkdir()
295 except Exception as e:
296 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
297 f" Got error: {e}") from e
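# Editor's note: a hedged sketch of the configuration keys read by the
# constructor above, shown as a plain Python dict for illustration; a real
# repository would provide these through a DatastoreConfig (YAML), and all
# values here are placeholders.
_example_datastore_config = {
    "root": "/path/to/repo/datastore",               # required; may contain the repo-relocation token
    "create": True,                                   # allow the root directory to be created
    "records": {"table": "file_datastore_records"},   # opaque table name (placeholder)
    "formatters": {},                                 # storage class / dataset type -> formatter mappings
    "templates": {},                                  # file naming templates
    "composites": {},                                 # disassembly rules
    "checksum": False,                                # sets self.useChecksum
    "trust_get_request": False,                       # sets self.trustGetRequest
    # "name": "FileDatastore@<root>",                 # optional explicit datastore name
}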
299 def __str__(self) -> str:
300 return str(self.root)
302 @property
303 def bridge(self) -> DatastoreRegistryBridge:
304 return self._bridge
306 def _artifact_exists(self, location: Location) -> bool:
307 """Check that an artifact exists in this datastore at the specified
308 location.
310 Parameters
311 ----------
312 location : `Location`
313 Expected location of the artifact associated with this datastore.
315 Returns
316 -------
317 exists : `bool`
 318 `True` if the location can be found, `False` otherwise.
319 """
320 log.debug("Checking if resource exists: %s", location.uri)
321 return location.uri.exists()
323 def _delete_artifact(self, location: Location) -> None:
324 """Delete the artifact from the datastore.
326 Parameters
327 ----------
328 location : `Location`
329 Location of the artifact associated with this datastore.
330 """
 331 if location.pathInStore.isabs():  # coverage: 331 ↛ 332 (condition on line 331 was never true)
332 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
333 log.debug("Deleting file: %s", location.uri)
334 location.uri.remove()
335 log.debug("Successfully deleted file: %s", location.uri)
337 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
338 # Docstring inherited from GenericBaseDatastore
339 records = []
340 for ref, info in zip(refs, infos):
341 # Component should come from ref and fall back on info
342 component = ref.datasetType.component()
 343 if component is None and info.component is not None:  # coverage: 343 ↛ 344 (condition on line 343 was never true)
344 component = info.component
345 if component is None:
346 # Use empty string since we want this to be part of the
347 # primary key.
348 component = NULLSTR
349 records.append(
350 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
351 storage_class=info.storageClass.name, component=component,
352 checksum=info.checksum, file_size=info.file_size)
353 )
354 self._table.insert(*records)
356 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
357 # Docstring inherited from GenericBaseDatastore
359 # Look for the dataset_id -- there might be multiple matches
360 # if we have disassembled the dataset.
361 records = list(self._table.fetch(dataset_id=ref.id))
363 results = []
364 for record in records:
365 # Convert name of StorageClass to instance
366 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
367 component = record["component"] if (record["component"]
368 and record["component"] != NULLSTR) else None
370 info = StoredFileInfo(formatter=record["formatter"],
371 path=record["path"],
372 storageClass=storageClass,
373 component=component,
374 checksum=record["checksum"],
375 file_size=record["file_size"])
376 results.append(info)
378 return results
380 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]:
381 """Return all dataset refs associated with the supplied path.
383 Parameters
384 ----------
385 pathInStore : `ButlerURI`
386 Path of interest in the data store.
388 Returns
389 -------
390 ids : `set` of `int`
391 All `DatasetRef` IDs associated with this path.
392 """
393 records = list(self._table.fetch(path=str(pathInStore)))
394 ids = {r["dataset_id"] for r in records}
395 return ids
397 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
398 # Docstring inherited from GenericBaseDatastore
399 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
401 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
402 r"""Find all the `Location`\ s of the requested dataset in the
403 `Datastore` and the associated stored file information.
405 Parameters
406 ----------
407 ref : `DatasetRef`
408 Reference to the required `Dataset`.
410 Returns
411 -------
412 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
413 Location of the dataset within the datastore and
414 stored information about each file and its formatter.
415 """
416 # Get the file information (this will fail if no file)
417 records = self.getStoredItemsInfo(ref)
419 # Use the path to determine the location -- we need to take
420 # into account absolute URIs in the datastore record
421 locations: List[Tuple[Location, StoredFileInfo]] = []
422 for r in records:
423 uriInStore = ButlerURI(r.path, forceAbsolute=False)
 424 if uriInStore.isabs():  # coverage: 424 ↛ 425 (condition on line 424 was never true)
425 location = Location(None, uriInStore)
426 else:
427 location = self.locationFactory.fromPath(r.path)
428 locations.append((location, r))
429 return locations
431 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
432 """Check that there is only one dataset associated with the
433 specified artifact.
435 Parameters
436 ----------
437 ref : `DatasetRef` or `FakeDatasetRef`
438 Dataset to be removed.
439 location : `Location`
440 The location of the artifact to be removed.
442 Returns
443 -------
 444 can_remove : `bool`
445 True if the artifact can be safely removed.
446 """
447 # Can't ever delete absolute URIs.
 448 if location.pathInStore.isabs():  # coverage: 448 ↛ 449 (condition on line 448 was never true)
449 return False
451 # Get all entries associated with this path
452 allRefs = self._registered_refs_per_artifact(location.pathInStore)
 453 if not allRefs:  # coverage: 453 ↛ 454 (condition on line 453 was never true)
454 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
456 # Remove these refs from all the refs and if there is nothing left
457 # then we can delete
458 remainingRefs = allRefs - {ref.id}
460 if remainingRefs:
461 return False
462 return True
464 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
465 StoredFileInfo]]:
466 """Predict the location and related file information of the requested
467 dataset in this datastore.
469 Parameters
470 ----------
471 ref : `DatasetRef`
472 Reference to the required `Dataset`.
474 Returns
475 -------
476 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
477 Expected Location of the dataset within the datastore and
478 placeholder information about each file and its formatter.
480 Notes
481 -----
482 Uses the current configuration to determine how we would expect the
483 datastore files to have been written if we couldn't ask registry.
484 This is safe so long as there has been no change to datastore
485 configuration between writing the dataset and wanting to read it.
486 Will not work for files that have been ingested without using the
487 standard file template or default formatter.
488 """
490 # If we have a component ref we always need to ask the questions
491 # of the composite. If the composite is disassembled this routine
492 # should return all components. If the composite was not
493 # disassembled the composite is what is stored regardless of
494 # component request. Note that if the caller has disassembled
495 # a composite there is no way for this guess to know that
496 # without trying both the composite and component ref and seeing
497 # if there is something at the component Location even without
498 # disassembly being enabled.
499 if ref.datasetType.isComponent():
500 ref = ref.makeCompositeRef()
502 # See if the ref is a composite that should be disassembled
503 doDisassembly = self.composites.shouldBeDisassembled(ref)
505 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
507 if doDisassembly:
508 for component, componentStorage in ref.datasetType.storageClass.components.items():
509 compRef = ref.makeComponentRef(component)
510 location, formatter = self._determine_put_formatter_location(compRef)
511 all_info.append((location, formatter, componentStorage, component))
513 else:
514 # Always use the composite ref if no disassembly
515 location, formatter = self._determine_put_formatter_location(ref)
516 all_info.append((location, formatter, ref.datasetType.storageClass, None))
518 # Convert the list of tuples to have StoredFileInfo as second element
519 return [(location, StoredFileInfo(formatter=formatter,
520 path=location.pathInStore.path,
521 storageClass=storageClass,
522 component=component,
523 checksum=None,
524 file_size=-1))
525 for location, formatter, storageClass, component in all_info]
527 def _prepare_for_get(self, ref: DatasetRef,
528 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
529 """Check parameters for ``get`` and obtain formatter and
530 location.
532 Parameters
533 ----------
534 ref : `DatasetRef`
535 Reference to the required Dataset.
536 parameters : `dict`
537 `StorageClass`-specific parameters that specify, for example,
538 a slice of the dataset to be loaded.
540 Returns
541 -------
542 getInfo : `list` [`DatastoreFileGetInformation`]
543 Parameters needed to retrieve each file.
544 """
545 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
547 # Get file metadata and internal metadata
548 fileLocations = self._get_dataset_locations_info(ref)
549 if not fileLocations:
550 if not self.trustGetRequest:
551 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
552 # Assume the dataset is where we think it should be
553 fileLocations = self._get_expected_dataset_locations_info(ref)
555 # The storage class we want to use eventually
556 refStorageClass = ref.datasetType.storageClass
558 if len(fileLocations) > 1:
559 disassembled = True
560 else:
561 disassembled = False
563 # Is this a component request?
564 refComponent = ref.datasetType.component()
566 fileGetInfo = []
567 for location, storedFileInfo in fileLocations:
569 # The storage class used to write the file
570 writeStorageClass = storedFileInfo.storageClass
572 # If this has been disassembled we need read to match the write
573 if disassembled:
574 readStorageClass = writeStorageClass
575 else:
576 readStorageClass = refStorageClass
578 formatter = getInstanceOf(storedFileInfo.formatter,
579 FileDescriptor(location, readStorageClass=readStorageClass,
580 storageClass=writeStorageClass, parameters=parameters),
581 ref.dataId)
583 formatterParams, notFormatterParams = formatter.segregateParameters()
585 # Of the remaining parameters, extract the ones supported by
586 # this StorageClass (for components not all will be handled)
587 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
589 # The ref itself could be a component if the dataset was
590 # disassembled by butler, or we disassembled in datastore and
591 # components came from the datastore records
592 component = storedFileInfo.component if storedFileInfo.component else refComponent
594 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
595 assemblerParams, formatterParams,
596 component, readStorageClass))
598 return fileGetInfo
600 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
601 """Check the arguments for ``put`` and obtain formatter and
602 location.
604 Parameters
605 ----------
606 inMemoryDataset : `object`
607 The dataset to store.
608 ref : `DatasetRef`
609 Reference to the associated Dataset.
611 Returns
612 -------
613 location : `Location`
614 The location to write the dataset.
615 formatter : `Formatter`
616 The `Formatter` to use to write the dataset.
618 Raises
619 ------
620 TypeError
621 Supplied object and storage class are inconsistent.
622 DatasetTypeNotSupportedError
623 The associated `DatasetType` is not handled by this datastore.
624 """
625 self._validate_put_parameters(inMemoryDataset, ref)
626 return self._determine_put_formatter_location(ref)
628 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
629 """Calculate the formatter and output location to use for put.
631 Parameters
632 ----------
633 ref : `DatasetRef`
634 Reference to the associated Dataset.
636 Returns
637 -------
638 location : `Location`
639 The location to write the dataset.
640 formatter : `Formatter`
641 The `Formatter` to use to write the dataset.
642 """
643 # Work out output file name
644 try:
645 template = self.templates.getTemplate(ref)
646 except KeyError as e:
647 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
649 # Validate the template to protect against filenames from different
650 # dataIds returning the same and causing overwrite confusion.
651 template.validateTemplate(ref)
653 location = self.locationFactory.fromPath(template.format(ref))
655 # Get the formatter based on the storage class
656 storageClass = ref.datasetType.storageClass
657 try:
658 formatter = self.formatterFactory.getFormatter(ref,
659 FileDescriptor(location,
660 storageClass=storageClass),
661 ref.dataId)
662 except KeyError as e:
663 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
664 f"{self.name}") from e
666 # Now that we know the formatter, update the location
667 location = formatter.makeUpdatedLocation(location)
669 return location, formatter
671 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
672 # Docstring inherited from base class
673 if transfer != "auto":
674 return transfer
676 # See if the paths are within the datastore or not
677 inside = [self._pathInStore(d.path) is not None for d in datasets]
679 if all(inside):
680 transfer = None
 681 elif not any(inside):  # coverage: 681 ↛ 685 (condition on line 681 was never false)
682 # Allow ButlerURI to use its own knowledge
683 transfer = "auto"
684 else:
685 raise ValueError("Some datasets are inside the datastore and some are outside."
686 " Please use an explicit transfer mode and not 'auto'.")
688 return transfer
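# Editor's note: a minimal sketch of how "auto" is resolved by the method
# above; `datastore` is assumed to be an existing FileDatastore and the paths
# are placeholders.
from lsst.daf.butler import FileDataset

inside = FileDataset(path="/repo/datastore/raw/a.fits", refs=[])
outside = FileDataset(path="/staging/elsewhere/b.fits", refs=[])

datastore._overrideTransferMode(inside, transfer="auto")    # all inside the root -> None
datastore._overrideTransferMode(outside, transfer="auto")   # all outside -> "auto" passed through
# Mixing files inside and outside the datastore raises ValueError.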
690 def _pathInStore(self, path: str) -> Optional[str]:
691 """Return path relative to datastore root
693 Parameters
694 ----------
695 path : `str`
 696 Path to dataset. Can be absolute. If relative, it is
 697 assumed to be relative to the datastore root. A path outside
 698 the datastore results in `None` being returned.
700 Returns
701 -------
702 inStore : `str`
703 Path relative to datastore root. Returns `None` if the file is
704 outside the root.
705 """
706 # Relative path will always be relative to datastore
707 pathUri = ButlerURI(path, forceAbsolute=False)
708 return pathUri.relative_to(self.root)
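# Editor's note: small sketch of the ButlerURI.relative_to() behaviour this
# helper relies on; paths are placeholders and `root` stands in for self.root.
from lsst.daf.butler import ButlerURI

root = ButlerURI("/repo/datastore/", forceDirectory=True)
inside = ButlerURI("/repo/datastore/raw/file.fits")
outside = ButlerURI("/elsewhere/file.fits")

print(inside.relative_to(root))    # expected: "raw/file.fits"
print(outside.relative_to(root))   # expected: None (outside the datastore root)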
710 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
711 """Standardize the path of a to-be-ingested file.
713 Parameters
714 ----------
715 path : `str`
716 Path of a file to be ingested.
717 transfer : `str`, optional
718 How (and whether) the dataset should be added to the datastore.
719 See `ingest` for details of transfer modes.
720 This implementation is provided only so
721 `NotImplementedError` can be raised if the mode is not supported;
722 actual transfers are deferred to `_extractIngestInfo`.
724 Returns
725 -------
726 path : `str`
727 New path in what the datastore considers standard form.
729 Notes
730 -----
731 Subclasses of `FileDatastore` can implement this method instead
732 of `_prepIngest`. It should not modify the data repository or given
733 file in any way.
735 Raises
736 ------
737 NotImplementedError
738 Raised if the datastore does not support the given transfer mode
739 (including the case where ingest is not supported at all).
740 FileNotFoundError
741 Raised if one of the given files does not exist.
742 """
 743 if transfer not in (None, "direct") + self.root.transferModes:  # coverage: 743 ↛ 744 (condition on line 743 was never true)
744 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
746 # A relative URI indicates relative to datastore root
747 srcUri = ButlerURI(path, forceAbsolute=False)
748 if not srcUri.isabs():
749 srcUri = self.root.join(path)
751 if not srcUri.exists():
752 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
753 f"are assumed to be relative to {self.root} unless they are absolute.")
755 if transfer is None:
756 relpath = srcUri.relative_to(self.root)
757 if not relpath:
758 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
759 f"within datastore ({self.root})")
761 # Return the relative path within the datastore for internal
762 # transfer
763 path = relpath
765 return path
767 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
768 formatter: Union[Formatter, Type[Formatter]],
769 transfer: Optional[str] = None) -> StoredFileInfo:
770 """Relocate (if necessary) and extract `StoredFileInfo` from a
771 to-be-ingested file.
773 Parameters
774 ----------
775 path : `str` or `ButlerURI`
776 URI or path of a file to be ingested.
777 ref : `DatasetRef`
778 Reference for the dataset being ingested. Guaranteed to have
 779 a ``dataset_id`` that is not `None`.
780 formatter : `type` or `Formatter`
781 `Formatter` subclass to use for this dataset or an instance.
782 transfer : `str`, optional
783 How (and whether) the dataset should be added to the datastore.
784 See `ingest` for details of transfer modes.
786 Returns
787 -------
788 info : `StoredFileInfo`
789 Internal datastore record for this file. This will be inserted by
 790 the caller; `_extractIngestInfo` is only responsible for
791 creating and populating the struct.
793 Raises
794 ------
795 FileNotFoundError
796 Raised if one of the given files does not exist.
797 FileExistsError
798 Raised if transfer is not `None` but the (internal) location the
799 file would be moved to is already occupied.
800 """
 801 if self._transaction is None:  # coverage: 801 ↛ 802 (condition on line 801 was never true)
802 raise RuntimeError("Ingest called without transaction enabled")
804 # Create URI of the source path, do not need to force a relative
805 # path to absolute.
806 srcUri = ButlerURI(path, forceAbsolute=False)
808 # Track whether we have read the size of the source yet
809 have_sized = False
811 tgtLocation: Optional[Location]
812 if transfer is None:
813 # A relative path is assumed to be relative to the datastore
814 # in this context
815 if not srcUri.isabs():
816 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
817 else:
818 # Work out the path in the datastore from an absolute URI
819 # This is required to be within the datastore.
820 pathInStore = srcUri.relative_to(self.root)
 821 if pathInStore is None:  # coverage: 821 ↛ 822 (condition on line 821 was never true)
822 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
823 f"not within datastore {self.root}")
824 tgtLocation = self.locationFactory.fromPath(pathInStore)
 825 elif transfer == "direct":  # coverage: 825 ↛ 830 (condition on line 825 was never true)
826 # Want to store the full URI to the resource directly in
827 # datastore. This is useful for referring to permanent archive
828 # storage for raw data.
829 # Trust that people know what they are doing.
830 tgtLocation = None
831 else:
832 # Work out the name we want this ingested file to have
833 # inside the datastore
834 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
835 if not tgtLocation.uri.dirname().exists():
836 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
837 tgtLocation.uri.dirname().mkdir()
839 # if we are transferring from a local file to a remote location
840 # it may be more efficient to get the size and checksum of the
841 # local file rather than the transferred one
 842 if not srcUri.scheme or srcUri.scheme == "file":  # coverage: 842 ↛ 848 (condition on line 842 was never false)
843 size = srcUri.size()
844 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
845 have_sized = True
847 # transfer the resource to the destination
848 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
 850 if tgtLocation is None:  # coverage: 850 ↛ 852 (condition on line 850 was never true)
851 # This means we are using direct mode
852 targetUri = srcUri
853 targetPath = str(srcUri)
854 else:
855 targetUri = tgtLocation.uri
856 targetPath = tgtLocation.pathInStore.path
858 # the file should exist in the datastore now
859 if not have_sized:
860 size = targetUri.size()
861 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
863 return StoredFileInfo(formatter=formatter, path=targetPath,
864 storageClass=ref.datasetType.storageClass,
865 component=ref.datasetType.component(),
866 file_size=size, checksum=checksum)
868 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
869 # Docstring inherited from Datastore._prepIngest.
870 filtered = []
871 for dataset in datasets:
872 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
873 if not acceptable:
874 continue
875 else:
876 dataset.refs = acceptable
877 if dataset.formatter is None:
878 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
879 else:
880 assert isinstance(dataset.formatter, (type, str))
881 dataset.formatter = getClassOf(dataset.formatter)
882 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
883 filtered.append(dataset)
884 return _IngestPrepData(filtered)
886 @transactional
887 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
888 # Docstring inherited from Datastore._finishIngest.
889 refsAndInfos = []
890 for dataset in prepData.datasets:
891 # Do ingest as if the first dataset ref is associated with the file
892 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
893 transfer=transfer)
894 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
895 self._register_datasets(refsAndInfos)
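# Editor's note: hedged sketch of the two-phase ingest implemented above;
# `datastore` and `ref` are assumed to exist, the path is a placeholder, and
# in normal use both phases are driven by the public Datastore.ingest() entry
# point rather than called directly.
from lsst.daf.butler import FileDataset

dataset = FileDataset(path="/staging/exposure.fits", refs=[ref])
prep = datastore._prepIngest(dataset, transfer="copy")   # filter refs, standardize path, pick formatter
datastore._finishIngest(prep, transfer="copy")           # transfer the file and register records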
897 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
898 formatter: Union[Formatter, Type[Formatter]]) -> Location:
899 """Given a source URI and a DatasetRef, determine the name the
900 dataset will have inside datastore.
902 Parameters
903 ----------
904 srcUri : `ButlerURI`
905 URI to the source dataset file.
906 ref : `DatasetRef`
907 Ref associated with the newly-ingested dataset artifact. This
908 is used to determine the name within the datastore.
909 formatter : `Formatter` or Formatter class.
910 Formatter to use for validation. Can be a class or an instance.
912 Returns
913 -------
914 location : `Location`
915 Target location for the newly-ingested dataset.
916 """
917 # Ingesting a file from outside the datastore.
918 # This involves a new name.
919 template = self.templates.getTemplate(ref)
920 location = self.locationFactory.fromPath(template.format(ref))
922 # Get the extension
923 ext = srcUri.getExtension()
925 # Update the destination to include that extension
926 location.updateExtension(ext)
928 # Ask the formatter to validate this extension
929 formatter.validateExtension(location)
931 return location
933 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
934 """Write out in memory dataset to datastore.
936 Parameters
937 ----------
938 inMemoryDataset : `object`
939 Dataset to write to datastore.
940 ref : `DatasetRef`
941 Registry information associated with this dataset.
943 Returns
944 -------
945 info : `StoredFileInfo`
 946 Information describing the artifact written to the datastore.
947 """
948 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
949 uri = location.uri
951 if not uri.dirname().exists():
952 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
953 uri.dirname().mkdir()
 955 if self._transaction is None:  # coverage: 955 ↛ 956 (condition on line 955 was never true)
956 raise RuntimeError("Attempting to write artifact without transaction enabled")
958 def _removeFileExists(uri: ButlerURI) -> None:
959 """Remove a file and do not complain if it is not there.
961 This is important since a formatter might fail before the file
962 is written and we should not confuse people by writing spurious
963 error messages to the log.
964 """
965 try:
966 uri.remove()
967 except FileNotFoundError:
968 pass
970 # Register a callback to try to delete the uploaded data if
971 # something fails below
972 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
974 # For a local file, simply use the formatter directly
975 if uri.isLocal:
976 formatter.write(inMemoryDataset)
977 log.debug("Successfully wrote python object to local file at %s", uri)
978 else:
979 # This is a remote URI, so first try bytes and write directly else
980 # fallback to a temporary file
981 try:
982 serializedDataset = formatter.toBytes(inMemoryDataset)
983 log.debug("Writing bytes directly to %s", uri)
984 uri.write(serializedDataset, overwrite=True)
985 log.debug("Successfully wrote bytes directly to %s", uri)
986 except NotImplementedError:
987 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
988 # Need to configure the formatter to write to a different
989 # location and that needs us to overwrite internals
990 tmpLocation = Location(*os.path.split(tmpFile.name))
991 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
992 with formatter._updateLocation(tmpLocation):
993 formatter.write(inMemoryDataset)
994 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
995 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
997 # URI is needed to resolve what ingest case are we dealing with
998 return self._extractIngestInfo(uri, ref, formatter=formatter)
1000 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1001 ref: DatasetRef, isComponent: bool = False) -> Any:
1002 """Read the artifact from datastore into in memory object.
1004 Parameters
1005 ----------
1006 getInfo : `DatastoreFileGetInformation`
1007 Information about the artifact within the datastore.
1008 ref : `DatasetRef`
1009 The registry information associated with this artifact.
1010 isComponent : `bool`
1011 Flag to indicate if a component is being read from this artifact.
1013 Returns
1014 -------
1015 inMemoryDataset : `object`
1016 The artifact as a python object.
1017 """
1018 location = getInfo.location
1019 uri = location.uri
1020 log.debug("Accessing data from %s", uri)
1022 # Cannot recalculate checksum but can compare size as a quick check
1023 # Do not do this if the size is negative since that indicates
1024 # we do not know.
1025 recorded_size = getInfo.info.file_size
1026 resource_size = uri.size()
 1027 if recorded_size >= 0 and resource_size != recorded_size:  # coverage: 1027 ↛ 1028 (condition on line 1027 was never true)
1028 raise RuntimeError("Integrity failure in Datastore. "
1029 f"Size of file {uri} ({resource_size}) "
1030 f"does not match size recorded in registry of {recorded_size}")
1032 # For the general case we have choices for how to proceed.
1033 # 1. Always use a local file (downloading the remote resource to a
1034 # temporary file if needed).
1035 # 2. Use a threshold size and read into memory and use bytes.
1036 # Use both for now with an arbitrary hand off size.
1037 # This allows small datasets to be downloaded from remote object
1038 # stores without requiring a temporary file.
1040 formatter = getInfo.formatter
1041 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1042 if resource_size <= nbytes_max and formatter.can_read_bytes():
1043 serializedDataset = uri.read()
1044 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1045 f"component {getInfo.component}" if isComponent else "",
1046 len(serializedDataset), uri, formatter.name())
1047 try:
1048 result = formatter.fromBytes(serializedDataset,
1049 component=getInfo.component if isComponent else None)
1050 except Exception as e:
1051 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1052 f" ({ref.datasetType.name} from {uri}): {e}") from e
1053 else:
1054 # Read from file
1055 with uri.as_local() as local_uri:
1056 # Have to update the Location associated with the formatter
1057 # because formatter.read does not allow an override.
1058 # This could be improved.
1059 msg = ""
1060 newLocation = None
1061 if uri != local_uri:
1062 newLocation = Location(*local_uri.split())
1063 msg = "(via download to local file)"
1065 log.debug("Reading %s from location %s %s with formatter %s",
1066 f"component {getInfo.component}" if isComponent else "",
1067 uri, msg, formatter.name())
1068 try:
1069 with formatter._updateLocation(newLocation):
1070 result = formatter.read(component=getInfo.component if isComponent else None)
1071 except Exception as e:
1072 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1073 f" ({ref.datasetType.name} from {uri}): {e}") from e
1075 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1076 isComponent=isComponent)
1078 def exists(self, ref: DatasetRef) -> bool:
1079 """Check if the dataset exists in the datastore.
1081 Parameters
1082 ----------
1083 ref : `DatasetRef`
1084 Reference to the required dataset.
1086 Returns
1087 -------
1088 exists : `bool`
1089 `True` if the entity exists in the `Datastore`.
1090 """
1091 fileLocations = self._get_dataset_locations_info(ref)
1093 # if we are being asked to trust that registry might not be correct
1094 # we ask for the expected locations and check them explicitly
1095 if not fileLocations:
1096 if not self.trustGetRequest:
1097 return False
1098 fileLocations = self._get_expected_dataset_locations_info(ref)
1099 for location, _ in fileLocations:
1100 if not self._artifact_exists(location):
1101 return False
1103 return True
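# Editor's note: minimal usage sketch; `datastore` and `ref` are assumed to
# exist.  With trust_get_request enabled, a dataset missing from the registry
# records is still reported as existing if every predicted artifact is found.
if datastore.exists(ref):
    obj = datastore.get(ref)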
1105 def getURIs(self, ref: DatasetRef,
1106 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1107 """Return URIs associated with dataset.
1109 Parameters
1110 ----------
1111 ref : `DatasetRef`
1112 Reference to the required dataset.
1113 predict : `bool`, optional
1114 If the datastore does not know about the dataset, should it
1115 return a predicted URI or not?
1117 Returns
1118 -------
1119 primary : `ButlerURI`
1120 The URI to the primary artifact associated with this dataset.
1121 If the dataset was disassembled within the datastore this
1122 may be `None`.
1123 components : `dict`
1124 URIs to any components associated with the dataset artifact.
1125 Can be empty if there are no components.
1126 """
1128 primary: Optional[ButlerURI] = None
1129 components: Dict[str, ButlerURI] = {}
1131 # if this has never been written then we have to guess
1132 if not self.exists(ref):
1133 if not predict:
1134 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1136 doDisassembly = self.composites.shouldBeDisassembled(ref)
1138 if doDisassembly:
1140 for component, componentStorage in ref.datasetType.storageClass.components.items():
1141 compRef = ref.makeComponentRef(component)
1142 compLocation, _ = self._determine_put_formatter_location(compRef)
1144 # Add a URI fragment to indicate this is a guess
1145 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1147 else:
1149 location, _ = self._determine_put_formatter_location(ref)
1151 # Add a URI fragment to indicate this is a guess
1152 primary = ButlerURI(location.uri.geturl() + "#predicted")
1154 return primary, components
1156 # If this is a ref that we have written we can get the path.
1157 # Get file metadata and internal metadata
1158 fileLocations = self._get_dataset_locations_info(ref)
1160 guessing = False
1161 if not fileLocations:
 1162 if not self.trustGetRequest:  # coverage: 1162 ↛ 1163 (condition on line 1162 was never true)
1163 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1164 fileLocations = self._get_expected_dataset_locations_info(ref)
1165 guessing = True
1167 if len(fileLocations) == 1:
1168 # No disassembly so this is the primary URI
1169 uri = fileLocations[0][0].uri
 1170 if guessing and not uri.exists():  # coverage: 1170 ↛ 1171 (condition on line 1170 was never true)
1171 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1172 primary = uri
1174 else:
1175 for location, storedFileInfo in fileLocations:
 1176 if storedFileInfo.component is None:  # coverage: 1176 ↛ 1177 (condition on line 1176 was never true)
1177 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1178 uri = location.uri
 1179 if guessing and not uri.exists():  # coverage: 1179 ↛ 1180 (condition on line 1179 was never true)
1180 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1181 components[storedFileInfo.component] = uri
1183 return primary, components
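# Editor's note: usage sketch with placeholder names.  For a disassembled
# dataset the primary URI is None and each component has its own URI; with
# predict=True an unwritten dataset yields URIs carrying a "#predicted"
# fragment.
primary, components = datastore.getURIs(ref, predict=True)
if primary is not None:
    print("single artifact:", primary)
else:
    for comp, uri in components.items():
        print(f"component {comp}:", uri)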
1185 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1186 """URI to the Dataset.
1188 Parameters
1189 ----------
1190 ref : `DatasetRef`
1191 Reference to the required Dataset.
1192 predict : `bool`
1193 If `True`, allow URIs to be returned of datasets that have not
1194 been written.
1196 Returns
1197 -------
1198 uri : `str`
1199 URI pointing to the dataset within the datastore. If the
1200 dataset does not exist in the datastore, and if ``predict`` is
1201 `True`, the URI will be a prediction and will include a URI
1202 fragment "#predicted".
1203 If the datastore does not have entities that relate well
1204 to the concept of a URI the returned URI will be
1205 descriptive. The returned URI is not guaranteed to be obtainable.
1207 Raises
1208 ------
1209 FileNotFoundError
1210 Raised if a URI has been requested for a dataset that does not
1211 exist and guessing is not allowed.
1212 RuntimeError
1213 Raised if a request is made for a single URI but multiple URIs
1214 are associated with this dataset.
1216 Notes
1217 -----
1218 When a predicted URI is requested an attempt will be made to form
1219 a reasonable URI based on file templates and the expected formatter.
1220 """
1221 primary, components = self.getURIs(ref, predict)
 1222 if primary is None or components:  # coverage: 1222 ↛ 1223 (condition on line 1222 was never true)
1223 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
 1224 "Use Datastore.getURIs() instead.")
1225 return primary
1227 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1228 """Load an InMemoryDataset from the store.
1230 Parameters
1231 ----------
1232 ref : `DatasetRef`
1233 Reference to the required Dataset.
1234 parameters : `dict`
1235 `StorageClass`-specific parameters that specify, for example,
1236 a slice of the dataset to be loaded.
1238 Returns
1239 -------
1240 inMemoryDataset : `object`
1241 Requested dataset or slice thereof as an InMemoryDataset.
1243 Raises
1244 ------
1245 FileNotFoundError
1246 Requested dataset can not be retrieved.
1247 TypeError
1248 Return value from formatter has unexpected type.
1249 ValueError
1250 Formatter failed to process the dataset.
1251 """
1252 allGetInfo = self._prepare_for_get(ref, parameters)
1253 refComponent = ref.datasetType.component()
1255 # Supplied storage class for the component being read
1256 refStorageClass = ref.datasetType.storageClass
1258 # Create mapping from component name to related info
1259 allComponents = {i.component: i for i in allGetInfo}
1261 # By definition the dataset is disassembled if we have more
1262 # than one record for it.
1263 isDisassembled = len(allGetInfo) > 1
1265 # Look for the special case where we are disassembled but the
1266 # component is a derived component that was not written during
1267 # disassembly. For this scenario we need to check that the
1268 # component requested is listed as a derived component for the
1269 # composite storage class
1270 isDisassembledReadOnlyComponent = False
1271 if isDisassembled and refComponent:
1272 # The composite storage class should be accessible through
1273 # the component dataset type
1274 compositeStorageClass = ref.datasetType.parentStorageClass
1276 # In the unlikely scenario where the composite storage
1277 # class is not known, we can only assume that this is a
1278 # normal component. If that assumption is wrong then the
1279 # branch below that reads a persisted component will fail
1280 # so there is no need to complain here.
 1281 if compositeStorageClass is not None:  # coverage: 1281 ↛ 1284 (condition on line 1281 was never false)
1282 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1284 if isDisassembled and not refComponent:
1285 # This was a disassembled dataset spread over multiple files
1286 # and we need to put them all back together again.
1287 # Read into memory and then assemble
1289 # Check that the supplied parameters are suitable for the type read
1290 refStorageClass.validateParameters(parameters)
1292 # We want to keep track of all the parameters that were not used
1293 # by formatters. We assume that if any of the component formatters
1294 # use a parameter that we do not need to apply it again in the
1295 # assembler.
1296 usedParams = set()
1298 components: Dict[str, Any] = {}
1299 for getInfo in allGetInfo:
1300 # assemblerParams are parameters not understood by the
1301 # associated formatter.
1302 usedParams.update(set(getInfo.formatterParams))
1304 component = getInfo.component
 1306 if component is None:  # coverage: 1306 ↛ 1307 (condition on line 1306 was never true)
1307 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1309 # We do not want the formatter to think it's reading
1310 # a component though because it is really reading a
1311 # standalone dataset -- always tell reader it is not a
1312 # component.
1313 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1315 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1317 # Any unused parameters will have to be passed to the assembler
1318 if parameters:
1319 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1320 else:
1321 unusedParams = {}
1323 # Process parameters
1324 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1325 parameters=unusedParams)
1327 elif isDisassembledReadOnlyComponent:
1329 compositeStorageClass = ref.datasetType.parentStorageClass
 1330 if compositeStorageClass is None:  # coverage: 1330 ↛ 1331 (condition on line 1330 was never true)
1331 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
 1332 " no composite storage class is available.")
 1334 if refComponent is None:  # coverage: 1334 ↛ 1336 (condition on line 1334 was never true)
1335 # Mainly for mypy
1336 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1338 # Assume that every derived component can be calculated by
1339 # forwarding the request to a single read/write component.
1340 # Rather than guessing which rw component is the right one by
1341 # scanning each for a derived component of the same name,
1342 # we ask the storage class delegate directly which one is best to
1343 # use.
1344 compositeDelegate = compositeStorageClass.delegate()
1345 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1346 set(allComponents))
1348 # Select the relevant component
1349 rwInfo = allComponents[forwardedComponent]
1351 # For now assume that read parameters are validated against
1352 # the real component and not the requested component
1353 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1354 forwardedStorageClass.validateParameters(parameters)
1356 # Unfortunately the FileDescriptor inside the formatter will have
1357 # the wrong write storage class so we need to create a new one
1358 # given the immutability constraint.
1359 writeStorageClass = rwInfo.info.storageClass
1361 # We may need to put some thought into parameters for read
1362 # components but for now forward them on as is
1363 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1364 readStorageClass=refStorageClass,
1365 storageClass=writeStorageClass,
1366 parameters=parameters),
1367 ref.dataId)
1369 # The assembler can not receive any parameter requests for a
1370 # derived component at this time since the assembler will
1371 # see the storage class of the derived component and those
1372 # parameters will have to be handled by the formatter on the
1373 # forwarded storage class.
1374 assemblerParams: Dict[str, Any] = {}
1376 # Need to created a new info that specifies the derived
1377 # component and associated storage class
1378 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1379 rwInfo.info, assemblerParams, {},
1380 refComponent, refStorageClass)
1382 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1384 else:
1385 # Single file request or component from that composite file
 1386 for lookup in (refComponent, None):  # coverage: 1386 ↛ 1391 (loop on line 1386 didn't complete)
 1387 if lookup in allComponents:  # coverage: 1387 ↛ 1386 (condition on line 1387 was never false)
1388 getInfo = allComponents[lookup]
1389 break
1390 else:
1391 raise FileNotFoundError(f"Component {refComponent} not found "
1392 f"for ref {ref} in datastore {self.name}")
1394 # Do not need the component itself if already disassembled
1395 if isDisassembled:
1396 isComponent = False
1397 else:
1398 isComponent = getInfo.component is not None
 1400 # For a disassembled component we can validate parameters against
1401 # the component storage class directly
1402 if isDisassembled:
1403 refStorageClass.validateParameters(parameters)
1404 else:
1405 # For an assembled composite this could be a derived
1406 # component derived from a real component. The validity
1407 # of the parameters is not clear. For now validate against
1408 # the composite storage class
1409 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1411 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
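# Editor's note: usage sketch; `datastore`, `ref` and the parameter are
# placeholders.  StorageClass parameters are split between the formatter and
# the storage class delegate as described in the method above.
full = datastore.get(ref)
subset = datastore.get(ref, parameters={"bbox": some_bbox})  # hypothetical parameter and value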
1413 @transactional
1414 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1415 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1417 Parameters
1418 ----------
1419 inMemoryDataset : `object`
1420 The dataset to store.
1421 ref : `DatasetRef`
1422 Reference to the associated Dataset.
1424 Raises
1425 ------
1426 TypeError
1427 Supplied object and storage class are inconsistent.
1428 DatasetTypeNotSupportedError
1429 The associated `DatasetType` is not handled by this datastore.
1431 Notes
1432 -----
1433 If the datastore is configured to reject certain dataset types it
1434 is possible that the put will fail and raise a
1435 `DatasetTypeNotSupportedError`. The main use case for this is to
1436 allow `ChainedDatastore` to put to multiple datastores without
1437 requiring that every datastore accepts the dataset.
1438 """
1440 doDisassembly = self.composites.shouldBeDisassembled(ref)
1441 # doDisassembly = True
1443 artifacts = []
1444 if doDisassembly:
1445 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1446 for component, componentInfo in components.items():
1447 # Don't recurse because we want to take advantage of
1448 # bulk insert -- need a new DatasetRef that refers to the
1449 # same dataset_id but has the component DatasetType
1450 # DatasetType does not refer to the types of components
1451 # So we construct one ourselves.
1452 compRef = ref.makeComponentRef(component)
1453 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1454 artifacts.append((compRef, storedInfo))
1455 else:
1456 # Write the entire thing out
1457 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1458 artifacts.append((ref, storedInfo))
1460 self._register_datasets(artifacts)
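# Editor's note: usage sketch with assumed objects.  If the composites map
# marks the ref's storage class for disassembly, one artifact per component is
# written; otherwise a single artifact is written and registered.
datastore.put(in_memory_dataset, ref)    # object must match ref's storage class
uri = datastore.getURI(ref)              # URI of the stored artifact
round_tripped = datastore.get(ref)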
1462 @transactional
1463 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1464 """Indicate to the datastore that a dataset can be removed.
1466 Parameters
1467 ----------
1468 ref : `DatasetRef`
1469 Reference to the required Dataset.
1470 ignore_errors : `bool`
1471 If `True` return without error even if something went wrong.
1472 Problems could occur if another process is simultaneously trying
1473 to delete.
1475 Raises
1476 ------
1477 FileNotFoundError
1478 Attempt to remove a dataset that does not exist.
1479 """
1480 # Get file metadata and internal metadata
1481 log.debug("Trashing %s in datastore %s", ref, self.name)
1483 fileLocations = self._get_dataset_locations_info(ref)
1485 if not fileLocations:
1486 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1487 if ignore_errors:
1488 log.warning(err_msg)
1489 return
1490 else:
1491 raise FileNotFoundError(err_msg)
1493 for location, storedFileInfo in fileLocations:
 1494 if not self._artifact_exists(location):  # coverage: 1494 ↛ 1495 (condition on line 1494 was never true)
1495 err_msg = f"Dataset is known to datastore {self.name} but " \
1496 f"associated artifact ({location.uri}) is missing"
1497 if ignore_errors:
1498 log.warning(err_msg)
1499 return
1500 else:
1501 raise FileNotFoundError(err_msg)
1503 # Mark dataset as trashed
1504 try:
1505 self._move_to_trash_in_registry(ref)
1506 except Exception as e:
1507 if ignore_errors:
1508 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1509 f"but encountered an error: {e}")
1510 pass
1511 else:
1512 raise
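A minimal sketch of calling trash() with error checking enabled; `datastore` and `ref` are assumed to exist as in the earlier sketch.
# Hedged sketch: with ignore_errors=False a dataset unknown to the
# datastore (or a missing artifact) raises FileNotFoundError.
try:
    datastore.trash(ref, ignore_errors=False)
except FileNotFoundError as err:
    log.warning("Could not trash %s: %s", ref, err)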
1514 @transactional
1515 def emptyTrash(self, ignore_errors: bool = True) -> None:
1516 """Remove all datasets from the trash.
1518 Parameters
1519 ----------
1520 ignore_errors : `bool`
1521 If `True`, return without error even if something went wrong.
1522 Problems could occur if another process is simultaneously trying
1523 to delete.
1524 """
1525 log.debug("Emptying trash in datastore %s", self.name)
1526 # Context manager will empty trash iff we finish it without raising.
1527 with self.bridge.emptyTrash() as trashed:
1528 for ref in trashed:
1529 fileLocations = self._get_dataset_locations_info(ref)
1531 if not fileLocations: 1531 ↛ 1532line 1531 didn't jump to line 1532, because the condition on line 1531 was never true
1532 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1533 if ignore_errors:
1534 log.warning(err_msg)
1535 continue
1536 else:
1537 raise FileNotFoundError(err_msg)
1539 for location, _ in fileLocations:
1541 if not self._artifact_exists(location): 1541 ↛ 1542line 1541 didn't jump to line 1542, because the condition on line 1541 was never true
1542 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1543 if ignore_errors:
1544 log.warning(err_msg)
1545 continue
1546 else:
1547 raise FileNotFoundError(err_msg)
1549 # Can only delete the artifact if there are no references
1550 # to the file from untrashed dataset refs.
1551 if self._can_remove_dataset_artifact(ref, location):
1552 # Point of no return for this artifact
1553 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1554 try:
1555 self._delete_artifact(location)
1556 except Exception as e:
1557 if ignore_errors:
1558 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1559 location.uri, self.name, e)
1560 else:
1561 raise
1563 # We must now remove the entry from the internal registry even if
1564 # the artifact removal failed and was ignored;
1565 # otherwise the removal check above will never succeed.
1566 try:
1567 # There may be multiple rows associated with this ref
1568 # depending on disassembly
1569 self.removeStoredItemInfo(ref)
1570 except Exception as e:
1571 if ignore_errors:
1572 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1573 ref.id, location.uri, self.name, e)
1574 continue
1575 else:
1576 raise FileNotFoundError(
1577 f"Error removing dataset {ref.id} ({location.uri}) from internal registry "
1578 f"of {self.name}"
1579 ) from e
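Taken together, removal is a two-step operation: trash() records the intent and emptyTrash() actually deletes the artifacts. A minimal sketch with the same assumed names:
# Hedged sketch of the two-step removal flow.
datastore.trash(ref)    # mark the dataset as removable
datastore.emptyTrash()  # purge everything currently in the trash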
1581 @transactional
1582 def forget(self, refs: Iterable[DatasetRef]) -> None:
1583 # Docstring inherited.
1584 refs = list(refs)
1585 self.bridge.forget(refs)
1586 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
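forget() removes this datastore's records for the given refs (the bridge entries and the internal table rows) without, in this method itself, touching the artifacts. A minimal sketch with an assumed `datastore` and `ref`:
# Hedged sketch: drop the datastore's knowledge of a dataset while
# leaving the underlying file artifact in place.
datastore.forget([ref])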
1588 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1589 logFailures: bool = False) -> None:
1590 """Validate some of the configuration for this datastore.
1592 Parameters
1593 ----------
1594 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1595 Entities to test against this configuration. Can be differing
1596 types.
1597 logFailures : `bool`, optional
1598 If `True`, output a log message for every validation error
1599 detected.
1601 Raises
1602 ------
1603 DatastoreValidationError
1604 Raised if there is a validation problem with a configuration.
1605 All the problems are reported in a single exception.
1607 Notes
1608 -----
1609 This method checks that all the supplied entities have valid file
1610 templates and also have formatters defined.
1611 """
1613 templateFailed = None
1614 try:
1615 self.templates.validateTemplates(entities, logFailures=logFailures)
1616 except FileTemplateValidationError as e:
1617 templateFailed = str(e)
1619 formatterFailed = []
1620 for entity in entities:
1621 try:
1622 self.formatterFactory.getFormatterClass(entity)
1623 except KeyError as e:
1624 formatterFailed.append(str(e))
1625 if logFailures: 1625 ↛ 1620line 1625 didn't jump to line 1620, because the condition on line 1625 was never false
1626 log.critical("Formatter failure: %s", e)
1628 if templateFailed or formatterFailed:
1629 messages = []
1630 if templateFailed: 1630 ↛ 1631line 1630 didn't jump to line 1631, because the condition on line 1630 was never true
1631 messages.append(templateFailed)
1632 if formatterFailed: 1632 ↛ 1634line 1632 didn't jump to line 1634, because the condition on line 1632 was never false
1633 messages.append(",".join(formatterFailed))
1634 msg = ";\n".join(messages)
1635 raise DatastoreValidationError(msg)
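A minimal sketch of running this validation up front; `datastore` is an assumed FileDatastore and `dataset_type` an assumed DatasetType, both names purely illustrative:
# Hedged sketch: surface template and formatter problems before first use.
try:
    datastore.validateConfiguration([dataset_type], logFailures=True)
except DatastoreValidationError as err:
    log.critical("Datastore configuration problem: %s", err)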
1637 def getLookupKeys(self) -> Set[LookupKey]:
1638 # Docstring is inherited from base class
1639 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1640 self.constraints.getLookupKeys()
1642 def validateKey(self, lookupKey: LookupKey,
1643 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1644 # Docstring is inherited from base class
1645 # The key can be valid in either formatters or templates so we can
1646 # only check the template if it exists
1647 if lookupKey in self.templates:
1648 try:
1649 self.templates[lookupKey].validateTemplate(entity)
1650 except FileTemplateValidationError as e:
1651 raise DatastoreValidationError(e) from e
1653 def export(self, refs: Iterable[DatasetRef], *,
1654 directory: Optional[Union[ButlerURI, str]] = None,
1655 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1656 # Docstring inherited from Datastore.export.
1657 if transfer is not None and directory is None: 1657 ↛ 1658line 1657 didn't jump to line 1658, because the condition on line 1657 was never true
1658 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1659 "export directory given")
1661 # Force the directory to be a URI object
1662 directoryUri: Optional[ButlerURI] = None
1663 if directory is not None: 1663 ↛ 1666line 1663 didn't jump to line 1666, because the condition on line 1663 was never false
1664 directoryUri = ButlerURI(directory, forceDirectory=True)
1666 if transfer is not None and directoryUri is not None: 1666 ↛ 1671line 1666 didn't jump to line 1671, because the condition on line 1666 was never false
1667 # mypy needs the second test
1668 if not directoryUri.exists(): 1668 ↛ 1669line 1668 didn't jump to line 1669, because the condition on line 1668 was never true
1669 raise FileNotFoundError(f"Export location {directory} does not exist")
1671 for ref in refs:
1672 fileLocations = self._get_dataset_locations_info(ref)
1673 if not fileLocations: 1673 ↛ 1674line 1673 didn't jump to line 1674, because the condition on line 1673 was never true
1674 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1675 # For now we cannot export disassembled datasets
1676 if len(fileLocations) > 1:
1677 raise NotImplementedError(f"Cannot export disassembled datasets such as {ref}")
1678 location, storedFileInfo = fileLocations[0]
1680 pathInStore = location.pathInStore.path
1681 if transfer is None: 1681 ↛ 1684line 1681 didn't jump to line 1684, because the condition on line 1681 was never true
1682 # TODO: do we also need to return the readStorageClass somehow?
1683 # We will use the path in store directly
1684 pass
1685 elif transfer == "direct": 1685 ↛ 1687line 1685 didn't jump to line 1687, because the condition on line 1685 was never true
1686 # Use full URIs to the remote store in the export
1687 pathInStore = str(location.uri)
1688 else:
1689 # mypy needs help
1690 assert directoryUri is not None, "directoryUri must be defined to get here"
1691 storeUri = ButlerURI(location.uri)
1693 # if the datastore has an absolute URI to a resource, we
1694 # have two options:
1695 # 1. Keep the absolute URI in the exported YAML
1696 # 2. Allocate a new name in the local datastore and transfer
1697 # it.
1698 # For now go with option 2
1699 if location.pathInStore.isabs(): 1699 ↛ 1700line 1699 didn't jump to line 1700, because the condition on line 1699 was never true
1700 template = self.templates.getTemplate(ref)
1701 pathInStore = template.format(ref)
1703 exportUri = directoryUri.join(pathInStore)
1704 exportUri.transfer_from(storeUri, transfer=transfer)
1706 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
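A minimal sketch of driving this generator, assuming `datastore` and `ref` as before; the export directory is a purely illustrative path:
# Hedged sketch: export transfers each artifact into the target directory
# and yields a FileDataset record per exported dataset.
for dataset in datastore.export([ref], directory="/tmp/export", transfer="auto"):
    log.info("Exported %s to %s", dataset.refs, dataset.path)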
1708 @staticmethod
1709 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1710 """Compute the checksum of the supplied file.
1712 Parameters
1713 ----------
1714 uri : `ButlerURI`
1715 Name of resource to calculate checksum from.
1716 algorithm : `str`, optional
1717 Name of algorithm to use. Must be one of the algorithms supported
1718 by :py:mod:`hashlib`.
1719 block_size : `int`, optional
1720 Number of bytes to read from the file at one time.
1722 Returns
1723 -------
1724 hexdigest : `str`
1725 Hex digest of the file.
1727 Notes
1728 -----
1729 Currently returns `None` if the URI is for a remote resource.
1730 """
1731 if algorithm not in hashlib.algorithms_guaranteed: 1731 ↛ 1732line 1731 didn't jump to line 1732, because the condition on line 1731 was never true
1732 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1734 if not uri.isLocal: 1734 ↛ 1735line 1734 didn't jump to line 1735, because the condition on line 1734 was never true
1735 return None
1737 hasher = hashlib.new(algorithm)
1739 with uri.as_local() as local_uri:
1740 with open(local_uri.ospath, "rb") as f:
1741 for chunk in iter(lambda: f.read(block_size), b""):
1742 hasher.update(chunk)
1744 return hasher.hexdigest()
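Because computeChecksum() is a staticmethod it can be exercised without a datastore instance; the path below is purely illustrative and "sha256" is one of the hashlib-guaranteed algorithms:
# Hedged sketch of a standalone checksum computation.
from lsst.daf.butler import ButlerURI

digest = FileDatastore.computeChecksum(ButlerURI("file:///tmp/example.fits"),
                                       algorithm="sha256")
if digest is None:
    # Remote (non-local) resources currently return None.
    log.info("Checksum skipped for remote resource")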