Coverage for python/lsst/daf/butler/datastores/fileDatastore.py : 84%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from dataclasses import dataclass
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreConfig,
60 DatastoreValidationError,
61 FileDescriptor,
62 FileTemplates,
63 FileTemplateValidationError,
64 Formatter,
65 FormatterFactory,
66 Location,
67 LocationFactory,
68 StorageClass,
69 StoredFileInfo,
70)
72from lsst.daf.butler import ddl
73from lsst.daf.butler.registry.interfaces import (
74 ReadOnlyDatabaseError,
75 DatastoreRegistryBridge,
76)
78from lsst.daf.butler.core.repoRelocation import replaceRoot
79from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
80from .genericDatastore import GenericBaseDatastore
82if TYPE_CHECKING:
83 from lsst.daf.butler import LookupKey
84 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
86log = logging.getLogger(__name__)
88# String to use when a Python None is encountered
89NULLSTR = "__NULL_STRING__"
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
100 def __init__(self, datasets: List[FileDataset]):
101 super().__init__(ref for dataset in datasets for ref in dataset.refs)
102 self.datasets = datasets
105@dataclass(frozen=True)
106class DatastoreFileGetInformation:
107 """Collection of useful parameters needed to retrieve a file from
108 a Datastore.
109 """
111 location: Location
112 """The location from which to read the dataset."""
114 formatter: Formatter
115 """The `Formatter` to use to deserialize the dataset."""
117 info: StoredFileInfo
118 """Stored information about this file and its formatter."""
120 assemblerParams: Dict[str, Any]
121 """Parameters to use for post-processing the retrieved dataset."""
123 formatterParams: Dict[str, Any]
124 """Parameters that were understood by the associated formatter."""
126 component: Optional[str]
127 """The component to be retrieved (can be `None`)."""
129 readStorageClass: StorageClass
130 """The `StorageClass` of the dataset being read."""
133class FileDatastore(GenericBaseDatastore):
134 """Generic Datastore for file-based implementations.
136 Should always be sub-classed since key abstract methods are missing.
138 Parameters
139 ----------
140 config : `DatastoreConfig` or `str`
141 Configuration as either a `Config` object or URI to file.
142 bridgeManager : `DatastoreRegistryBridgeManager`
143 Object that manages the interface between `Registry` and datastores.
144 butlerRoot : `str`, optional
145 New datastore root to use to override the configuration value.
147 Raises
148 ------
149 ValueError
150 If root location does not exist and ``create`` is `False` in the
151 configuration.
152 """
154 defaultConfigFile: ClassVar[Optional[str]] = None
155 """Path to configuration defaults. Accessed within the ``config`` resource
156 or relative to a search path. Can be None if no defaults specified.
157 """
159 root: ButlerURI
160 """Root directory URI of this `Datastore`."""
162 locationFactory: LocationFactory
163 """Factory for creating locations relative to the datastore root."""
165 formatterFactory: FormatterFactory
166 """Factory for creating instances of formatters."""
168 templates: FileTemplates
169 """File templates that can be used by this `Datastore`."""
171 composites: CompositesMap
172 """Determines whether a dataset should be disassembled on put."""
174 defaultConfigFile = "datastores/fileDatastore.yaml"
175 """Path to configuration defaults. Accessed within the ``config`` resource
176 or relative to a search path. Can be None if no defaults specified.
177 """
179 @classmethod
180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
181 """Set any filesystem-dependent config options for this Datastore to
182 be appropriate for a new empty repository with the given root.
184 Parameters
185 ----------
186 root : `str`
187 URI to the root of the data repository.
188 config : `Config`
189 A `Config` to update. Only the subset understood by
190 this component will be updated. Will not expand
191 defaults.
192 full : `Config`
193 A complete config with all defaults expanded that can be
194 converted to a `DatastoreConfig`. Read-only and will not be
195 modified by this method.
196 Repository-specific options that should not be obtained
197 from defaults when Butler instances are constructed
198 should be copied from ``full`` to ``config``.
199 overwrite : `bool`, optional
200 If `False`, do not modify a value in ``config`` if the value
201 already exists. Default is always to overwrite with the provided
202 ``root``.
204 Notes
205 -----
206 If a keyword is explicitly defined in the supplied ``config`` it
207 will not be overridden by this method if ``overwrite`` is `False`.
208 This allows explicit values set in external configs to be retained.
209 """
210 Config.updateParameters(DatastoreConfig, config, full,
211 toUpdate={"root": root},
212 toCopy=("cls", ("records", "table")), overwrite=overwrite)
214 @classmethod
215 def makeTableSpec(cls) -> ddl.TableSpec:
216 return ddl.TableSpec(
217 fields=[
218 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
222 # Use empty string to indicate no component
223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
224 # TODO: should checksum be Base64Bytes instead?
225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
227 ],
228 unique=frozenset(),
229 )
231 def __init__(self, config: Union[DatastoreConfig, str],
232 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
233 super().__init__(config, bridgeManager)
234 if "root" not in self.config:
235 raise ValueError("No root directory specified in configuration")
237 # Name ourselves either using an explicit name or a name
238 # derived from the (unexpanded) root
239 if "name" in self.config:
240 self.name = self.config["name"]
241 else:
242 # We use the unexpanded root in the name to indicate that this
243 # datastore can be moved without having to update registry.
244 self.name = "{}@{}".format(type(self).__name__,
245 self.config["root"])
247 # Support repository relocation in config
248 # Existence of self.root is checked in subclass
249 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
250 forceDirectory=True, forceAbsolute=True)
252 self.locationFactory = LocationFactory(self.root)
253 self.formatterFactory = FormatterFactory()
255 # Now associate formatters with storage classes
256 self.formatterFactory.registerFormatters(self.config["formatters"],
257 universe=bridgeManager.universe)
259 # Read the file naming templates
260 self.templates = FileTemplates(self.config["templates"],
261 universe=bridgeManager.universe)
263 # See if composites should be disassembled
264 self.composites = CompositesMap(self.config["composites"],
265 universe=bridgeManager.universe)
267 tableName = self.config["records", "table"]
268 try:
269 # Storage of paths and formatters, keyed by dataset_id
270 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
271 # Interface to Registry.
272 self._bridge = bridgeManager.register(self.name)
273 except ReadOnlyDatabaseError:
274 # If the database is read only and we just tried and failed to
275 # create a table, it means someone is trying to create a read-only
276 # butler client for an empty repo. That should be okay, as long
277 # as they then try to get any datasets before some other client
278 # creates the table. Chances are they're just validating
279 # configuration.
280 pass
282 # Determine whether checksums should be used - default to False
283 self.useChecksum = self.config.get("checksum", False)
285 # Determine whether we can fall back to configuration if a
286 # requested dataset is not known to registry
287 self.trustGetRequest = self.config.get("trust_get_request", False)
289 # Check existence and create directory structure if necessary
290 if not self.root.exists():
291 if "create" not in self.config or not self.config["create"]:
292 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
293 try:
294 self.root.mkdir()
295 except Exception as e:
296 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
297 f" Got error: {e}") from e
299 def __str__(self) -> str:
300 return str(self.root)
302 @property
303 def bridge(self) -> DatastoreRegistryBridge:
304 return self._bridge
306 def _artifact_exists(self, location: Location) -> bool:
307 """Check that an artifact exists in this datastore at the specified
308 location.
310 Parameters
311 ----------
312 location : `Location`
313 Expected location of the artifact associated with this datastore.
315 Returns
316 -------
317 exists : `bool`
318 `True` if the location can be found, `False` otherwise.
319 """
320 log.debug("Checking if resource exists: %s", location.uri)
321 return location.uri.exists()
323 def _delete_artifact(self, location: Location) -> None:
324 """Delete the artifact from the datastore.
326 Parameters
327 ----------
328 location : `Location`
329 Location of the artifact associated with this datastore.
330 """
331 if location.pathInStore.isabs():
332 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
333 log.debug("Deleting file: %s", location.uri)
334 location.uri.remove()
335 log.debug("Successfully deleted file: %s", location.uri)
337 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
338 # Docstring inherited from GenericBaseDatastore
339 records = []
340 for ref, info in zip(refs, infos):
341 # Component should come from ref and fall back on info
342 component = ref.datasetType.component()
343 if component is None and info.component is not None:
344 component = info.component
345 if component is None:
346 # Use empty string since we want this to be part of the
347 # primary key.
348 component = NULLSTR
349 records.append(
350 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
351 storage_class=info.storageClass.name, component=component,
352 checksum=info.checksum, file_size=info.file_size)
353 )
354 self._table.insert(*records)
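# Example (hypothetical values): for a dataset stored without disassembly
# the record built above might look like
#
#     dict(dataset_id=42,
#          formatter="lsst.daf.butler.formatters.yaml.YamlFormatter",
#          path="run/datasetType/datasetType_v42.yaml",
#          storage_class="StructuredDataDict",
#          component=NULLSTR,      # no component, so the sentinel is used
#          checksum=None,          # checksums are disabled by default
#          file_size=1024)
#
# where the formatter, path and size are illustrative only.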
356 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
357 # Docstring inherited from GenericBaseDatastore
359 # Look for the dataset_id -- there might be multiple matches
360 # if we have disassembled the dataset.
361 records = list(self._table.fetch(dataset_id=ref.id))
363 results = []
364 for record in records:
365 # Convert name of StorageClass to instance
366 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
367 component = record["component"] if (record["component"]
368 and record["component"] != NULLSTR) else None
370 info = StoredFileInfo(formatter=record["formatter"],
371 path=record["path"],
372 storageClass=storageClass,
373 component=component,
374 checksum=record["checksum"],
375 file_size=record["file_size"])
376 results.append(info)
378 return results
380 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]:
381 """Return all dataset refs associated with the supplied path.
383 Parameters
384 ----------
385 pathInStore : `ButlerURI`
386 Path of interest in the data store.
388 Returns
389 -------
390 ids : `set` of `int`
391 All `DatasetRef` IDs associated with this path.
392 """
393 records = list(self._table.fetch(path=str(pathInStore)))
394 ids = {r["dataset_id"] for r in records}
395 return ids
397 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
398 # Docstring inherited from GenericBaseDatastore
399 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
401 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
402 r"""Find all the `Location`\ s of the requested dataset in the
403 `Datastore` and the associated stored file information.
405 Parameters
406 ----------
407 ref : `DatasetRef`
408 Reference to the required `Dataset`.
410 Returns
411 -------
412 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
413 Location of the dataset within the datastore and
414 stored information about each file and its formatter.
415 """
416 # Get the file information (this will fail if no file)
417 records = self.getStoredItemsInfo(ref)
419 # Use the path to determine the location -- we need to take
420 # into account absolute URIs in the datastore record
421 locations: List[Tuple[Location, StoredFileInfo]] = []
422 for r in records:
423 uriInStore = ButlerURI(r.path, forceAbsolute=False)
424 if uriInStore.isabs():
425 location = Location(None, uriInStore)
426 else:
427 location = self.locationFactory.fromPath(r.path)
428 locations.append((location, r))
429 return locations
431 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
432 """Check that there is only one dataset associated with the
433 specified artifact.
435 Parameters
436 ----------
437 ref : `DatasetRef` or `FakeDatasetRef`
438 Dataset to be removed.
439 location : `Location`
440 The location of the artifact to be removed.
442 Returns
443 -------
444 can_remove : `bool`
445 True if the artifact can be safely removed.
446 """
447 # Can't ever delete absolute URIs.
448 if location.pathInStore.isabs():
449 return False
451 # Get all entries associated with this path
452 allRefs = self._registered_refs_per_artifact(location.pathInStore)
453 if not allRefs:
454 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
456 # Remove these refs from all the refs and if there is nothing left
457 # then we can delete
458 remainingRefs = allRefs - {ref.id}
460 if remainingRefs:
461 return False
462 return True
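# Worked example (hypothetical ids): if the artifact at this path is
# registered for dataset_ids {7, 8} and ``ref.id`` is 7, then
# ``remainingRefs == {8}`` and the method returns `False`; only when the
# last remaining ref for the path is the one being removed does it
# return `True` and the file may be deleted.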
464 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
465 StoredFileInfo]]:
466 """Predict the location and related file information of the requested
467 dataset in this datastore.
469 Parameters
470 ----------
471 ref : `DatasetRef`
472 Reference to the required `Dataset`.
474 Returns
475 -------
476 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
477 Expected Location of the dataset within the datastore and
478 placeholder information about each file and its formatter.
480 Notes
481 -----
482 Uses the current configuration to determine how we would expect the
483 datastore files to have been written if we couldn't ask registry.
484 This is safe so long as there has been no change to datastore
485 configuration between writing the dataset and wanting to read it.
486 Will not work for files that have been ingested without using the
487 standard file template or default formatter.
488 """
490 # If we have a component ref we always need to ask the questions
491 # of the composite. If the composite is disassembled this routine
492 # should return all components. If the composite was not
493 # disassembled the composite is what is stored regardless of
494 # component request. Note that if the caller has disassembled
495 # a composite there is no way for this guess to know that
496 # without trying both the composite and component ref and seeing
497 # if there is something at the component Location even without
498 # disassembly being enabled.
499 if ref.datasetType.isComponent():
500 ref = ref.makeCompositeRef()
502 # See if the ref is a composite that should be disassembled
503 doDisassembly = self.composites.shouldBeDisassembled(ref)
505 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
507 if doDisassembly:
508 for component, componentStorage in ref.datasetType.storageClass.components.items():
509 compRef = ref.makeComponentRef(component)
510 location, formatter = self._determine_put_formatter_location(compRef)
511 all_info.append((location, formatter, componentStorage, component))
513 else:
514 # Always use the composite ref if no disassembly
515 location, formatter = self._determine_put_formatter_location(ref)
516 all_info.append((location, formatter, ref.datasetType.storageClass, None))
518 # Convert the list of tuples to have StoredFileInfo as second element
519 return [(location, StoredFileInfo(formatter=formatter,
520 path=location.pathInStore.path,
521 storageClass=storageClass,
522 component=component,
523 checksum=None,
524 file_size=-1))
525 for location, formatter, storageClass, component in all_info]
527 def _prepare_for_get(self, ref: DatasetRef,
528 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
529 """Check parameters for ``get`` and obtain formatter and
530 location.
532 Parameters
533 ----------
534 ref : `DatasetRef`
535 Reference to the required Dataset.
536 parameters : `dict`
537 `StorageClass`-specific parameters that specify, for example,
538 a slice of the dataset to be loaded.
540 Returns
541 -------
542 getInfo : `list` [`DatastoreFileGetInformation`]
543 Parameters needed to retrieve each file.
544 """
545 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
547 # Get file metadata and internal metadata
548 fileLocations = self._get_dataset_locations_info(ref)
549 if not fileLocations:
550 if not self.trustGetRequest:
551 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
552 # Assume the dataset is where we think it should be
553 fileLocations = self._get_expected_dataset_locations_info(ref)
555 # The storage class we want to use eventually
556 refStorageClass = ref.datasetType.storageClass
558 if len(fileLocations) > 1:
559 disassembled = True
560 else:
561 disassembled = False
563 # Is this a component request?
564 refComponent = ref.datasetType.component()
566 fileGetInfo = []
567 for location, storedFileInfo in fileLocations:
569 # The storage class used to write the file
570 writeStorageClass = storedFileInfo.storageClass
572 # If this has been disassembled we need read to match the write
573 if disassembled:
574 readStorageClass = writeStorageClass
575 else:
576 readStorageClass = refStorageClass
578 formatter = getInstanceOf(storedFileInfo.formatter,
579 FileDescriptor(location, readStorageClass=readStorageClass,
580 storageClass=writeStorageClass, parameters=parameters),
581 ref.dataId)
583 formatterParams, notFormatterParams = formatter.segregateParameters()
585 # Of the remaining parameters, extract the ones supported by
586 # this StorageClass (for components not all will be handled)
587 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
589 # The ref itself could be a component if the dataset was
590 # disassembled by butler, or we disassembled in datastore and
591 # components came from the datastore records
592 component = storedFileInfo.component if storedFileInfo.component else refComponent
594 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
595 assemblerParams, formatterParams,
596 component, readStorageClass))
598 return fileGetInfo
600 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
601 """Check the arguments for ``put`` and obtain formatter and
602 location.
604 Parameters
605 ----------
606 inMemoryDataset : `object`
607 The dataset to store.
608 ref : `DatasetRef`
609 Reference to the associated Dataset.
611 Returns
612 -------
613 location : `Location`
614 The location to write the dataset.
615 formatter : `Formatter`
616 The `Formatter` to use to write the dataset.
618 Raises
619 ------
620 TypeError
621 Supplied object and storage class are inconsistent.
622 DatasetTypeNotSupportedError
623 The associated `DatasetType` is not handled by this datastore.
624 """
625 self._validate_put_parameters(inMemoryDataset, ref)
626 return self._determine_put_formatter_location(ref)
628 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
629 """Calculate the formatter and output location to use for put.
631 Parameters
632 ----------
633 ref : `DatasetRef`
634 Reference to the associated Dataset.
636 Returns
637 -------
638 location : `Location`
639 The location to write the dataset.
640 formatter : `Formatter`
641 The `Formatter` to use to write the dataset.
642 """
643 # Work out output file name
644 try:
645 template = self.templates.getTemplate(ref)
646 except KeyError as e:
647 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
649 # Validate the template to protect against filenames from different
650 # dataIds returning the same filename and causing overwrite confusion.
651 template.validateTemplate(ref)
653 location = self.locationFactory.fromPath(template.format(ref))
655 # Get the formatter based on the storage class
656 storageClass = ref.datasetType.storageClass
657 try:
658 formatter = self.formatterFactory.getFormatter(ref,
659 FileDescriptor(location,
660 storageClass=storageClass),
661 ref.dataId)
662 except KeyError as e:
663 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
664 f"{self.name}") from e
666 # Now that we know the formatter, update the location
667 location = formatter.makeUpdatedLocation(location)
669 return location, formatter
671 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
672 # Docstring inherited from base class
673 if transfer != "auto":
674 return transfer
676 # See if the paths are within the datastore or not
677 inside = [self._pathInStore(d.path) is not None for d in datasets]
679 if all(inside):
680 transfer = None
681 elif not any(inside):
682 # Allow ButlerURI to use its own knowledge
683 transfer = "auto"
684 else:
685 raise ValueError("Some datasets are inside the datastore and some are outside."
686 " Please use an explicit transfer mode and not 'auto'.")
688 return transfer
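# How "auto" is resolved above:
#   * every dataset path is inside the datastore root -> transfer = None
#   * no dataset path is inside the datastore root    -> transfer = "auto"
#   * a mixture of inside and outside                 -> ValueError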
690 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
691 """Return path relative to datastore root
693 Parameters
694 ----------
695 path : `str` or `ButlerURI`
696 Path to dataset. Can be absolute URI. If relative assumed to
697 be relative to the datastore. Returns path in datastore
698 or `None` if the path is outside.
700 Returns
701 -------
702 inStore : `str` or `None`
703 Path relative to datastore root. Returns `None` if the file is
704 outside the root.
705 """
706 # Relative path will always be relative to datastore
707 pathUri = ButlerURI(path, forceAbsolute=False)
708 return pathUri.relative_to(self.root)
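# Example (hypothetical root and paths), assuming a datastore root of
# file:///repo/:
#   _pathInStore("raw/visit1.fits")               -> "raw/visit1.fits"
#   _pathInStore("file:///repo/raw/visit1.fits")  -> "raw/visit1.fits"
#   _pathInStore("file:///elsewhere/visit1.fits") -> None (outside the root)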
710 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
711 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
712 """Standardize the path of a to-be-ingested file.
714 Parameters
715 ----------
716 path : `str` or `ButlerURI`
717 Path of a file to be ingested.
718 transfer : `str`, optional
719 How (and whether) the dataset should be added to the datastore.
720 See `ingest` for details of transfer modes.
721 This implementation is provided only so
722 `NotImplementedError` can be raised if the mode is not supported;
723 actual transfers are deferred to `_extractIngestInfo`.
725 Returns
726 -------
727 path : `str` or `ButlerURI`
728 New path in what the datastore considers standard form. If an
729 absolute URI was given that will be returned unchanged.
731 Notes
732 -----
733 Subclasses of `FileDatastore` can implement this method instead
734 of `_prepIngest`. It should not modify the data repository or given
735 file in any way.
737 Raises
738 ------
739 NotImplementedError
740 Raised if the datastore does not support the given transfer mode
741 (including the case where ingest is not supported at all).
742 FileNotFoundError
743 Raised if one of the given files does not exist.
744 """
745 if transfer not in (None, "direct") + self.root.transferModes:
746 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
748 # A relative URI indicates relative to datastore root
749 srcUri = ButlerURI(path, forceAbsolute=False)
750 if not srcUri.isabs():
751 srcUri = self.root.join(path)
753 if not srcUri.exists():
754 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
755 f"are assumed to be relative to {self.root} unless they are absolute.")
757 if transfer is None:
758 relpath = srcUri.relative_to(self.root)
759 if not relpath:
760 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
761 f"within datastore ({self.root})")
763 # Return the relative path within the datastore for internal
764 # transfer
765 path = relpath
767 return path
769 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
770 formatter: Union[Formatter, Type[Formatter]],
771 transfer: Optional[str] = None) -> StoredFileInfo:
772 """Relocate (if necessary) and extract `StoredFileInfo` from a
773 to-be-ingested file.
775 Parameters
776 ----------
777 path : `str` or `ButlerURI`
778 URI or path of a file to be ingested.
779 ref : `DatasetRef`
780 Reference for the dataset being ingested. Guaranteed to have
781 ``dataset_id`` not `None`.
782 formatter : `type` or `Formatter`
783 `Formatter` subclass to use for this dataset or an instance.
784 transfer : `str`, optional
785 How (and whether) the dataset should be added to the datastore.
786 See `ingest` for details of transfer modes.
788 Returns
789 -------
790 info : `StoredFileInfo`
791 Internal datastore record for this file. This will be inserted by
792 the caller; `_extractIngestInfo` is only responsible for
793 creating and populating the struct.
795 Raises
796 ------
797 FileNotFoundError
798 Raised if one of the given files does not exist.
799 FileExistsError
800 Raised if transfer is not `None` but the (internal) location the
801 file would be moved to is already occupied.
802 """
803 if self._transaction is None:
804 raise RuntimeError("Ingest called without transaction enabled")
806 # Create URI of the source path, do not need to force a relative
807 # path to absolute.
808 srcUri = ButlerURI(path, forceAbsolute=False)
810 # Track whether we have read the size of the source yet
811 have_sized = False
813 tgtLocation: Optional[Location]
814 if transfer is None:
815 # A relative path is assumed to be relative to the datastore
816 # in this context
817 if not srcUri.isabs():
818 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
819 else:
820 # Work out the path in the datastore from an absolute URI
821 # This is required to be within the datastore.
822 pathInStore = srcUri.relative_to(self.root)
823 if pathInStore is None:
824 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
825 f"not within datastore {self.root}")
826 tgtLocation = self.locationFactory.fromPath(pathInStore)
827 elif transfer == "direct":
828 # Want to store the full URI to the resource directly in
829 # datastore. This is useful for referring to permanent archive
830 # storage for raw data.
831 # Trust that people know what they are doing.
832 tgtLocation = None
833 else:
834 # Work out the name we want this ingested file to have
835 # inside the datastore
836 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
837 if not tgtLocation.uri.dirname().exists():
838 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
839 tgtLocation.uri.dirname().mkdir()
841 # if we are transferring from a local file to a remote location
842 # it may be more efficient to get the size and checksum of the
843 # local file rather than the transferred one
844 if not srcUri.scheme or srcUri.scheme == "file":
845 size = srcUri.size()
846 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
847 have_sized = True
849 # transfer the resource to the destination
850 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
852 if tgtLocation is None:
853 # This means we are using direct mode
854 targetUri = srcUri
855 targetPath = str(srcUri)
856 else:
857 targetUri = tgtLocation.uri
858 targetPath = tgtLocation.pathInStore.path
860 # the file should exist in the datastore now
861 if not have_sized:
862 size = targetUri.size()
863 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
865 return StoredFileInfo(formatter=formatter, path=targetPath,
866 storageClass=ref.datasetType.storageClass,
867 component=ref.datasetType.component(),
868 file_size=size, checksum=checksum)
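# Summary of the transfer handling above:
#   * transfer is None: the file must already be inside the datastore root
#     and only its relative path is recorded;
#   * transfer == "direct": the absolute source URI is stored as-is and
#     nothing is moved;
#   * any other mode: the file is transferred to a name derived from the
#     file template, and size/checksum are computed from whichever copy
#     is local.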
870 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
871 # Docstring inherited from Datastore._prepIngest.
872 filtered = []
873 for dataset in datasets:
874 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
875 if not acceptable:
876 continue
877 else:
878 dataset.refs = acceptable
879 if dataset.formatter is None:
880 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
881 else:
882 assert isinstance(dataset.formatter, (type, str))
883 dataset.formatter = getClassOf(dataset.formatter)
884 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
885 filtered.append(dataset)
886 return _IngestPrepData(filtered)
888 @transactional
889 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
890 # Docstring inherited from Datastore._finishIngest.
891 refsAndInfos = []
892 for dataset in prepData.datasets:
893 # Do ingest as if the first dataset ref is associated with the file
894 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
895 transfer=transfer)
896 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
897 self._register_datasets(refsAndInfos)
899 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
900 formatter: Union[Formatter, Type[Formatter]]) -> Location:
901 """Given a source URI and a DatasetRef, determine the name the
902 dataset will have inside datastore.
904 Parameters
905 ----------
906 srcUri : `ButlerURI`
907 URI to the source dataset file.
908 ref : `DatasetRef`
909 Ref associated with the newly-ingested dataset artifact. This
910 is used to determine the name within the datastore.
911 formatter : `Formatter` or Formatter class.
912 Formatter to use for validation. Can be a class or an instance.
914 Returns
915 -------
916 location : `Location`
917 Target location for the newly-ingested dataset.
918 """
919 # Ingesting a file from outside the datastore.
920 # This involves a new name.
921 template = self.templates.getTemplate(ref)
922 location = self.locationFactory.fromPath(template.format(ref))
924 # Get the extension
925 ext = srcUri.getExtension()
927 # Update the destination to include that extension
928 location.updateExtension(ext)
930 # Ask the formatter to validate this extension
931 formatter.validateExtension(location)
933 return location
935 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
936 """Write out in memory dataset to datastore.
938 Parameters
939 ----------
940 inMemoryDataset : `object`
941 Dataset to write to datastore.
942 ref : `DatasetRef`
943 Registry information associated with this dataset.
945 Returns
946 -------
947 info : `StoredFileInfo`
948 Information describing the artifact written to the datastore.
949 """
950 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
951 uri = location.uri
953 if not uri.dirname().exists():
954 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
955 uri.dirname().mkdir()
957 if self._transaction is None:
958 raise RuntimeError("Attempting to write artifact without transaction enabled")
960 def _removeFileExists(uri: ButlerURI) -> None:
961 """Remove a file and do not complain if it is not there.
963 This is important since a formatter might fail before the file
964 is written and we should not confuse people by writing spurious
965 error messages to the log.
966 """
967 try:
968 uri.remove()
969 except FileNotFoundError:
970 pass
972 # Register a callback to try to delete the uploaded data if
973 # something fails below
974 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
976 # For a local file, simply use the formatter directly
977 if uri.isLocal:
978 formatter.write(inMemoryDataset)
979 log.debug("Successfully wrote python object to local file at %s", uri)
980 else:
981 # This is a remote URI, so first try bytes and write directly else
982 # fallback to a temporary file
983 try:
984 serializedDataset = formatter.toBytes(inMemoryDataset)
985 log.debug("Writing bytes directly to %s", uri)
986 uri.write(serializedDataset, overwrite=True)
987 log.debug("Successfully wrote bytes directly to %s", uri)
988 except NotImplementedError:
989 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
990 # Need to configure the formatter to write to a different
991 # location and that needs us to overwrite internals
992 tmpLocation = Location(*os.path.split(tmpFile.name))
993 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
994 with formatter._updateLocation(tmpLocation):
995 formatter.write(inMemoryDataset)
996 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
997 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
999 # URI is needed to resolve which ingest case we are dealing with
1000 return self._extractIngestInfo(uri, ref, formatter=formatter)
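# Note on the write path above: local URIs are written by the formatter
# directly; for remote URIs the dataset is first serialized with
# formatter.toBytes() and uploaded, falling back to a local temporary
# file plus uri.transfer_from(..., transfer="copy") when the formatter
# cannot produce bytes (NotImplementedError).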
1002 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1003 ref: DatasetRef, isComponent: bool = False) -> Any:
1004 """Read the artifact from datastore into in memory object.
1006 Parameters
1007 ----------
1008 getInfo : `DatastoreFileGetInformation`
1009 Information about the artifact within the datastore.
1010 ref : `DatasetRef`
1011 The registry information associated with this artifact.
1012 isComponent : `bool`
1013 Flag to indicate if a component is being read from this artifact.
1015 Returns
1016 -------
1017 inMemoryDataset : `object`
1018 The artifact as a python object.
1019 """
1020 location = getInfo.location
1021 uri = location.uri
1022 log.debug("Accessing data from %s", uri)
1024 # Cannot recalculate checksum but can compare size as a quick check
1025 # Do not do this if the size is negative since that indicates
1026 # we do not know.
1027 recorded_size = getInfo.info.file_size
1028 resource_size = uri.size()
1029 if recorded_size >= 0 and resource_size != recorded_size:
1030 raise RuntimeError("Integrity failure in Datastore. "
1031 f"Size of file {uri} ({resource_size}) "
1032 f"does not match size recorded in registry of {recorded_size}")
1034 # For the general case we have choices for how to proceed.
1035 # 1. Always use a local file (downloading the remote resource to a
1036 # temporary file if needed).
1037 # 2. Use a threshold size and read into memory and use bytes.
1038 # Use both for now with an arbitrary hand off size.
1039 # This allows small datasets to be downloaded from remote object
1040 # stores without requiring a temporary file.
1042 formatter = getInfo.formatter
1043 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1044 if resource_size <= nbytes_max and formatter.can_read_bytes():
1045 serializedDataset = uri.read()
1046 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1047 f"component {getInfo.component}" if isComponent else "",
1048 len(serializedDataset), uri, formatter.name())
1049 try:
1050 result = formatter.fromBytes(serializedDataset,
1051 component=getInfo.component if isComponent else None)
1052 except Exception as e:
1053 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1054 f" ({ref.datasetType.name} from {uri}): {e}") from e
1055 else:
1056 # Read from file
1057 with uri.as_local() as local_uri:
1058 # Have to update the Location associated with the formatter
1059 # because formatter.read does not allow an override.
1060 # This could be improved.
1061 msg = ""
1062 newLocation = None
1063 if uri != local_uri:
1064 newLocation = Location(*local_uri.split())
1065 msg = "(via download to local file)"
1067 log.debug("Reading %s from location %s %s with formatter %s",
1068 f"component {getInfo.component}" if isComponent else "",
1069 uri, msg, formatter.name())
1070 try:
1071 with formatter._updateLocation(newLocation):
1072 result = formatter.read(component=getInfo.component if isComponent else None)
1073 except Exception as e:
1074 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1075 f" ({ref.datasetType.name} from {uri}): {e}") from e
1077 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1078 isComponent=isComponent)
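# Note on the read path above: artifacts no larger than nbytes_max (10 MB)
# whose formatter supports fromBytes() are read straight from the URI into
# memory; anything else is materialized as a local file via uri.as_local()
# (a download for remote stores) and read with formatter.read().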
1080 def exists(self, ref: DatasetRef) -> bool:
1081 """Check if the dataset exists in the datastore.
1083 Parameters
1084 ----------
1085 ref : `DatasetRef`
1086 Reference to the required dataset.
1088 Returns
1089 -------
1090 exists : `bool`
1091 `True` if the entity exists in the `Datastore`.
1092 """
1093 fileLocations = self._get_dataset_locations_info(ref)
1095 # if we are being asked to trust that registry might not be correct
1096 # we ask for the expected locations and check them explicitly
1097 if not fileLocations:
1098 if not self.trustGetRequest:
1099 return False
1100 fileLocations = self._get_expected_dataset_locations_info(ref)
1101 for location, _ in fileLocations:
1102 if not self._artifact_exists(location):
1103 return False
1105 return True
1107 def getURIs(self, ref: DatasetRef,
1108 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1109 """Return URIs associated with dataset.
1111 Parameters
1112 ----------
1113 ref : `DatasetRef`
1114 Reference to the required dataset.
1115 predict : `bool`, optional
1116 If the datastore does not know about the dataset, should it
1117 return a predicted URI or not?
1119 Returns
1120 -------
1121 primary : `ButlerURI`
1122 The URI to the primary artifact associated with this dataset.
1123 If the dataset was disassembled within the datastore this
1124 may be `None`.
1125 components : `dict`
1126 URIs to any components associated with the dataset artifact.
1127 Can be empty if there are no components.
1128 """
1130 primary: Optional[ButlerURI] = None
1131 components: Dict[str, ButlerURI] = {}
1133 # if this has never been written then we have to guess
1134 if not self.exists(ref):
1135 if not predict:
1136 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1138 doDisassembly = self.composites.shouldBeDisassembled(ref)
1140 if doDisassembly:
1142 for component, componentStorage in ref.datasetType.storageClass.components.items():
1143 compRef = ref.makeComponentRef(component)
1144 compLocation, _ = self._determine_put_formatter_location(compRef)
1146 # Add a URI fragment to indicate this is a guess
1147 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1149 else:
1151 location, _ = self._determine_put_formatter_location(ref)
1153 # Add a URI fragment to indicate this is a guess
1154 primary = ButlerURI(location.uri.geturl() + "#predicted")
1156 return primary, components
1158 # If this is a ref that we have written we can get the path.
1159 # Get file metadata and internal metadata
1160 fileLocations = self._get_dataset_locations_info(ref)
1162 guessing = False
1163 if not fileLocations:
1164 if not self.trustGetRequest:
1165 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1166 fileLocations = self._get_expected_dataset_locations_info(ref)
1167 guessing = True
1169 if len(fileLocations) == 1:
1170 # No disassembly so this is the primary URI
1171 uri = fileLocations[0][0].uri
1172 if guessing and not uri.exists():
1173 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1174 primary = uri
1176 else:
1177 for location, storedFileInfo in fileLocations:
1178 if storedFileInfo.component is None:
1179 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1180 uri = location.uri
1181 if guessing and not uri.exists():
1182 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1183 components[storedFileInfo.component] = uri
1185 return primary, components
1187 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1188 """URI to the Dataset.
1190 Parameters
1191 ----------
1192 ref : `DatasetRef`
1193 Reference to the required Dataset.
1194 predict : `bool`
1195 If `True`, allow URIs to be returned of datasets that have not
1196 been written.
1198 Returns
1199 -------
1200 uri : `ButlerURI`
1201 URI pointing to the dataset within the datastore. If the
1202 dataset does not exist in the datastore, and if ``predict`` is
1203 `True`, the URI will be a prediction and will include a URI
1204 fragment "#predicted".
1205 If the datastore does not have entities that relate well
1206 to the concept of a URI the returned URI will be
1207 descriptive. The returned URI is not guaranteed to be obtainable.
1209 Raises
1210 ------
1211 FileNotFoundError
1212 Raised if a URI has been requested for a dataset that does not
1213 exist and guessing is not allowed.
1214 RuntimeError
1215 Raised if a request is made for a single URI but multiple URIs
1216 are associated with this dataset.
1218 Notes
1219 -----
1220 When a predicted URI is requested an attempt will be made to form
1221 a reasonable URI based on file templates and the expected formatter.
1222 """
1223 primary, components = self.getURIs(ref, predict)
1224 if primary is None or components:
1225 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1226 "Use Datastore.getURIs() instead.")
1227 return primary
1229 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1230 """Load an InMemoryDataset from the store.
1232 Parameters
1233 ----------
1234 ref : `DatasetRef`
1235 Reference to the required Dataset.
1236 parameters : `dict`
1237 `StorageClass`-specific parameters that specify, for example,
1238 a slice of the dataset to be loaded.
1240 Returns
1241 -------
1242 inMemoryDataset : `object`
1243 Requested dataset or slice thereof as an InMemoryDataset.
1245 Raises
1246 ------
1247 FileNotFoundError
1248 Requested dataset can not be retrieved.
1249 TypeError
1250 Return value from formatter has unexpected type.
1251 ValueError
1252 Formatter failed to process the dataset.
1253 """
1254 allGetInfo = self._prepare_for_get(ref, parameters)
1255 refComponent = ref.datasetType.component()
1257 # Supplied storage class for the component being read
1258 refStorageClass = ref.datasetType.storageClass
1260 # Create mapping from component name to related info
1261 allComponents = {i.component: i for i in allGetInfo}
1263 # By definition the dataset is disassembled if we have more
1264 # than one record for it.
1265 isDisassembled = len(allGetInfo) > 1
1267 # Look for the special case where we are disassembled but the
1268 # component is a derived component that was not written during
1269 # disassembly. For this scenario we need to check that the
1270 # component requested is listed as a derived component for the
1271 # composite storage class
1272 isDisassembledReadOnlyComponent = False
1273 if isDisassembled and refComponent:
1274 # The composite storage class should be accessible through
1275 # the component dataset type
1276 compositeStorageClass = ref.datasetType.parentStorageClass
1278 # In the unlikely scenario where the composite storage
1279 # class is not known, we can only assume that this is a
1280 # normal component. If that assumption is wrong then the
1281 # branch below that reads a persisted component will fail
1282 # so there is no need to complain here.
1283 if compositeStorageClass is not None:
1284 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1286 if isDisassembled and not refComponent:
1287 # This was a disassembled dataset spread over multiple files
1288 # and we need to put them all back together again.
1289 # Read into memory and then assemble
1291 # Check that the supplied parameters are suitable for the type read
1292 refStorageClass.validateParameters(parameters)
1294 # We want to keep track of all the parameters that were not used
1295 # by formatters. We assume that if any of the component formatters
1296 # use a parameter then we do not need to apply it again in the
1297 # assembler.
1298 usedParams = set()
1300 components: Dict[str, Any] = {}
1301 for getInfo in allGetInfo:
1302 # assemblerParams are parameters not understood by the
1303 # associated formatter.
1304 usedParams.update(set(getInfo.formatterParams))
1306 component = getInfo.component
1308 if component is None:
1309 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1311 # We do not want the formatter to think it's reading
1312 # a component though because it is really reading a
1313 # standalone dataset -- always tell reader it is not a
1314 # component.
1315 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1317 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1319 # Any unused parameters will have to be passed to the assembler
1320 if parameters:
1321 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1322 else:
1323 unusedParams = {}
1325 # Process parameters
1326 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1327 parameters=unusedParams)
1329 elif isDisassembledReadOnlyComponent:
1331 compositeStorageClass = ref.datasetType.parentStorageClass
1332 if compositeStorageClass is None:
1333 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1334 "no composite storage class is available.")
1336 if refComponent is None:
1337 # Mainly for mypy
1338 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1340 # Assume that every derived component can be calculated by
1341 # forwarding the request to a single read/write component.
1342 # Rather than guessing which rw component is the right one by
1343 # scanning each for a derived component of the same name,
1344 # we ask the storage class delegate directly which one is best to
1345 # use.
1346 compositeDelegate = compositeStorageClass.delegate()
1347 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1348 set(allComponents))
1350 # Select the relevant component
1351 rwInfo = allComponents[forwardedComponent]
1353 # For now assume that read parameters are validated against
1354 # the real component and not the requested component
1355 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1356 forwardedStorageClass.validateParameters(parameters)
1358 # Unfortunately the FileDescriptor inside the formatter will have
1359 # the wrong write storage class so we need to create a new one
1360 # given the immutability constraint.
1361 writeStorageClass = rwInfo.info.storageClass
1363 # We may need to put some thought into parameters for read
1364 # components but for now forward them on as is
1365 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1366 readStorageClass=refStorageClass,
1367 storageClass=writeStorageClass,
1368 parameters=parameters),
1369 ref.dataId)
1371 # The assembler can not receive any parameter requests for a
1372 # derived component at this time since the assembler will
1373 # see the storage class of the derived component and those
1374 # parameters will have to be handled by the formatter on the
1375 # forwarded storage class.
1376 assemblerParams: Dict[str, Any] = {}
1378 # Need to create a new info that specifies the derived
1379 # component and associated storage class
1380 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1381 rwInfo.info, assemblerParams, {},
1382 refComponent, refStorageClass)
1384 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1386 else:
1387 # Single file request or component from that composite file
1388 for lookup in (refComponent, None):
1389 if lookup in allComponents:
1390 getInfo = allComponents[lookup]
1391 break
1392 else:
1393 raise FileNotFoundError(f"Component {refComponent} not found "
1394 f"for ref {ref} in datastore {self.name}")
1396 # Do not need the component itself if already disassembled
1397 if isDisassembled:
1398 isComponent = False
1399 else:
1400 isComponent = getInfo.component is not None
1402 # For a disassembled component we can validate parameters against
1403 # the component storage class directly
1404 if isDisassembled:
1405 refStorageClass.validateParameters(parameters)
1406 else:
1407 # For an assembled composite this could be a derived
1408 # component derived from a real component. The validity
1409 # of the parameters is not clear. For now validate against
1410 # the composite storage class
1411 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1413 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
1415 @transactional
1416 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1417 """Write an InMemoryDataset with a given `DatasetRef` to the store.
1419 Parameters
1420 ----------
1421 inMemoryDataset : `object`
1422 The dataset to store.
1423 ref : `DatasetRef`
1424 Reference to the associated Dataset.
1426 Raises
1427 ------
1428 TypeError
1429 Supplied object and storage class are inconsistent.
1430 DatasetTypeNotSupportedError
1431 The associated `DatasetType` is not handled by this datastore.
1433 Notes
1434 -----
1435 If the datastore is configured to reject certain dataset types it
1436 is possible that the put will fail and raise a
1437 `DatasetTypeNotSupportedError`. The main use case for this is to
1438 allow `ChainedDatastore` to put to multiple datastores without
1439 requiring that every datastore accepts the dataset.
1440 """
1442 doDisassembly = self.composites.shouldBeDisassembled(ref)
1443 # doDisassembly = True
1445 artifacts = []
1446 if doDisassembly:
1447 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1448 for component, componentInfo in components.items():
1449 # Don't recurse because we want to take advantage of
1450 # bulk insert -- need a new DatasetRef that refers to the
1451 # same dataset_id but has the component DatasetType
1452 # DatasetType does not refer to the types of components
1453 # So we construct one ourselves.
1454 compRef = ref.makeComponentRef(component)
1455 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1456 artifacts.append((compRef, storedInfo))
1457 else:
1458 # Write the entire thing out
1459 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1460 artifacts.append((ref, storedInfo))
1462 self._register_datasets(artifacts)
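# Example of the disassembly branch above (hypothetical storage class): an
# Exposure-like composite whose storage class defines components such as
# "image", "mask" and "variance" is written as one artifact per component,
# each registered against a component ref that shares the parent
# dataset_id; a non-composite put writes a single artifact for the
# supplied ref.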
1464 @transactional
1465 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1466 """Indicate to the datastore that a dataset can be removed.
1468 Parameters
1469 ----------
1470 ref : `DatasetRef`
1471 Reference to the required Dataset.
1472 ignore_errors : `bool`
1473 If `True` return without error even if something went wrong.
1474 Problems could occur if another process is simultaneously trying
1475 to delete.
1477 Raises
1478 ------
1479 FileNotFoundError
1480 Attempt to remove a dataset that does not exist.
1481 """
1482 # Get file metadata and internal metadata
1483 log.debug("Trashing %s in datastore %s", ref, self.name)
1485 fileLocations = self._get_dataset_locations_info(ref)
1487 if not fileLocations:
1488 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1489 if ignore_errors:
1490 log.warning(err_msg)
1491 return
1492 else:
1493 raise FileNotFoundError(err_msg)
1495 for location, storedFileInfo in fileLocations:
1496 if not self._artifact_exists(location):
1497 err_msg = f"Dataset is known to datastore {self.name} but " \
1498 f"associated artifact ({location.uri}) is missing"
1499 if ignore_errors:
1500 log.warning(err_msg)
1501 return
1502 else:
1503 raise FileNotFoundError(err_msg)
1505 # Mark dataset as trashed
1506 try:
1507 self._move_to_trash_in_registry(ref)
1508 except Exception as e:
1509 if ignore_errors:
1510 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1511 f"but encountered an error: {e}")
1512 pass
1513 else:
1514 raise
1516 @transactional
1517 def emptyTrash(self, ignore_errors: bool = True) -> None:
1518 """Remove all datasets from the trash.
1520 Parameters
1521 ----------
1522 ignore_errors : `bool`
1523 If `True`, return without error even if something went wrong.
1524 Problems could occur if another process is simultaneously trying
1525 to delete.
1526 """
1527 log.debug("Emptying trash in datastore %s", self.name)
1528 # Context manager will empty trash iff we finish it without raising.
1529 with self.bridge.emptyTrash() as trashed:
1530 for ref in trashed:
1531 fileLocations = self._get_dataset_locations_info(ref)
1533 if not fileLocations: 1533 ↛ 1534line 1533 didn't jump to line 1534, because the condition on line 1533 was never true
1534 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1535 if ignore_errors:
1536 log.warning(err_msg)
1537 continue
1538 else:
1539 raise FileNotFoundError(err_msg)
1541 for location, _ in fileLocations:
1543 if not self._artifact_exists(location): 1543 ↛ 1544line 1543 didn't jump to line 1544, because the condition on line 1543 was never true
1544 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1545 if ignore_errors:
1546 log.warning(err_msg)
1547 continue
1548 else:
1549 raise FileNotFoundError(err_msg)
1551 # Can only delete the artifact if there are no references
1552 # to the file from untrashed dataset refs.
1553 if self._can_remove_dataset_artifact(ref, location):
1554 # Point of no return for this artifact
1555 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1556 try:
1557 self._delete_artifact(location)
1558 except Exception as e:
1559 if ignore_errors:
1560 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1561 location.uri, self.name, e)
1562 else:
1563 raise
1565 # We must now remove the entry from the internal registry even if
1566 # the artifact removal failed and was ignored;
1567 # otherwise the removal check above will never be true.
1568 try:
1569 # There may be multiple rows associated with this ref
1570 # depending on disassembly
1571 self.removeStoredItemInfo(ref)
1572 except Exception as e:
1573 if ignore_errors:
1574 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1575 ref.id, location.uri, self.name, e)
1576 continue
1577 else:
1578 raise FileNotFoundError(
1579 f"Error removing dataset {ref.id} ({location.uri}) from internal registry "
1580 f"of {self.name}"
1581 ) from e
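# --- Editorial sketch (not part of the original source) ---------------------
# Deletion is two-phase: trash() only marks a dataset as removable, and the
# artifact plus its internal-registry rows are deleted when emptyTrash() runs,
# provided no untrashed ref still points at the same file.  ``datastore`` and
# ``ref`` below are hypothetical stand-ins for a configured FileDatastore and
# a resolved DatasetRef.

def remove_dataset_sketch(datastore, ref):
    datastore.trash(ref, ignore_errors=False)  # raise if the dataset is unknown
    datastore.emptyTrash(ignore_errors=True)   # log and continue on artifact errors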
1583 @transactional
1584 def forget(self, refs: Iterable[DatasetRef]) -> None:
1585 # Docstring inherited.
1586 refs = list(refs)
1587 self.bridge.forget(refs)
1588 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
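# --- Editorial sketch (not part of the original source) ---------------------
# forget() drops the datastore's knowledge of the refs (the bridge entries and
# the dataset_id rows in the internal table) without touching the artifacts on
# disk, unlike the trash()/emptyTrash() pair above.  ``datastore`` and
# ``refs`` are hypothetical stand-ins.

def forget_sketch(datastore, refs):
    datastore.forget(refs)  # records are removed; files stay where they are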
1590 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1591 logFailures: bool = False) -> None:
1592 """Validate some of the configuration for this datastore.
1594 Parameters
1595 ----------
1596 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1597 Entities to test against this configuration. Can be differing
1598 types.
1599 logFailures : `bool`, optional
1600 If `True`, output a log message for every validation error
1601 detected.
1603 Raises
1604 ------
1605 DatastoreValidationError
1606 Raised if there is a validation problem with a configuration.
1607 All the problems are reported in a single exception.
1609 Notes
1610 -----
1611 This method checks that all the supplied entities have valid file
1612 templates and also have formatters defined.
1613 """
1615 templateFailed = None
1616 try:
1617 self.templates.validateTemplates(entities, logFailures=logFailures)
1618 except FileTemplateValidationError as e:
1619 templateFailed = str(e)
1621 formatterFailed = []
1622 for entity in entities:
1623 try:
1624 self.formatterFactory.getFormatterClass(entity)
1625 except KeyError as e:
1626 formatterFailed.append(str(e))
1627 if logFailures: 1627 ↛ 1622line 1627 didn't jump to line 1622, because the condition on line 1627 was never false
1628 log.critical("Formatter failure: %s", e)
1630 if templateFailed or formatterFailed:
1631 messages = []
1632 if templateFailed: 1632 ↛ 1633line 1632 didn't jump to line 1633, because the condition on line 1632 was never true
1633 messages.append(templateFailed)
1634 if formatterFailed: 1634 ↛ 1636line 1634 didn't jump to line 1636, because the condition on line 1634 was never false
1635 messages.append(",".join(formatterFailed))
1636 msg = ";\n".join(messages)
1637 raise DatastoreValidationError(msg)
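# --- Editorial sketch (not part of the original source) ---------------------
# validateConfiguration() collects template failures and formatter failures
# for every supplied entity and reports them in one DatastoreValidationError.
# ``datastore`` and ``dataset_type`` are hypothetical stand-ins.

def check_config_sketch(datastore, dataset_type):
    try:
        datastore.validateConfiguration([dataset_type], logFailures=True)
    except DatastoreValidationError as err:
        # All template and formatter problems arrive in a single exception,
        # joined with ";\n" as built above.
        print(f"Configuration problems:\n{err}")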
1639 def getLookupKeys(self) -> Set[LookupKey]:
1640 # Docstring is inherited from base class
1641 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1642 self.constraints.getLookupKeys()
1644 def validateKey(self, lookupKey: LookupKey,
1645 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1646 # Docstring is inherited from base class
1647 # The key can be valid in either formatters or templates so we can
1648 # only check the template if it exists
1649 if lookupKey in self.templates:
1650 try:
1651 self.templates[lookupKey].validateTemplate(entity)
1652 except FileTemplateValidationError as e:
1653 raise DatastoreValidationError(e) from e
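# --- Editorial sketch (not part of the original source) ---------------------
# getLookupKeys() returns the union of template, formatter and constraint
# keys; each one can then be checked against an entity with validateKey(),
# which only validates the file template when the key has one.  ``datastore``
# and ``entity`` are hypothetical stand-ins.

def check_keys_sketch(datastore, entity):
    for key in datastore.getLookupKeys():
        # Raises DatastoreValidationError if the entity does not match the
        # file template registered for this key.
        datastore.validateKey(key, entity)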
1655 def export(self, refs: Iterable[DatasetRef], *,
1656 directory: Optional[Union[ButlerURI, str]] = None,
1657 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1658 # Docstring inherited from Datastore.export.
1659 if transfer is not None and directory is None: 1659 ↛ 1660line 1659 didn't jump to line 1660, because the condition on line 1659 was never true
1660 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1661 "export directory given")
1663 # Force the directory to be a URI object
1664 directoryUri: Optional[ButlerURI] = None
1665 if directory is not None: 1665 ↛ 1668line 1665 didn't jump to line 1668, because the condition on line 1665 was never false
1666 directoryUri = ButlerURI(directory, forceDirectory=True)
1668 if transfer is not None and directoryUri is not None: 1668 ↛ 1673line 1668 didn't jump to line 1673, because the condition on line 1668 was never false
1669 # mypy needs the second test
1670 if not directoryUri.exists(): 1670 ↛ 1671line 1670 didn't jump to line 1671, because the condition on line 1670 was never true
1671 raise FileNotFoundError(f"Export location {directory} does not exist")
1673 for ref in refs:
1674 fileLocations = self._get_dataset_locations_info(ref)
1675 if not fileLocations: 1675 ↛ 1676line 1675 didn't jump to line 1676, because the condition on line 1675 was never true
1676 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1677 # For now we cannot export disassembled datasets.
1678 if len(fileLocations) > 1:
1679 raise NotImplementedError(f"Cannot export disassembled datasets such as {ref}")
1680 location, storedFileInfo = fileLocations[0]
1682 pathInStore = location.pathInStore.path
1683 if transfer is None: 1683 ↛ 1686line 1683 didn't jump to line 1686, because the condition on line 1683 was never true
1684 # TODO: do we also need to return the readStorageClass somehow?
1685 # We will use the path in store directly
1686 pass
1687 elif transfer == "direct": 1687 ↛ 1689line 1687 didn't jump to line 1689, because the condition on line 1687 was never true
1688 # Use full URIs to the remote store in the export
1689 pathInStore = str(location.uri)
1690 else:
1691 # mypy needs help
1692 assert directoryUri is not None, "directoryUri must be defined to get here"
1693 storeUri = ButlerURI(location.uri)
1695 # If the datastore has an absolute URI to a resource, we
1696 # have two options:
1697 # 1. Keep the absolute URI in the exported YAML.
1698 # 2. Allocate a new name in the local datastore and transfer
1699 # it.
1700 # For now, go with option 2.
1701 if location.pathInStore.isabs(): 1701 ↛ 1702line 1701 didn't jump to line 1702, because the condition on line 1701 was never true
1702 template = self.templates.getTemplate(ref)
1703 pathInStore = template.format(ref)
1705 exportUri = directoryUri.join(pathInStore)
1706 exportUri.transfer_from(storeUri, transfer=transfer)
1708 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
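# --- Editorial sketch (not part of the original source) ---------------------
# export() yields one FileDataset per (non-disassembled) ref and, unless
# transfer is None or "direct", copies each artifact into the export
# directory.  ``datastore`` and ``refs`` are hypothetical stand-ins for a
# configured FileDatastore and an iterable of resolved DatasetRefs.

def export_sketch(datastore, refs, export_dir):
    for dataset in datastore.export(refs, directory=export_dir, transfer="auto"):
        # dataset.path is relative to export_dir except in "direct" mode,
        # where it is the full URI into the original datastore.
        print(dataset.path, dataset.formatter)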
1710 @staticmethod
1711 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1712 """Compute the checksum of the supplied file.
1714 Parameters
1715 ----------
1716 uri : `ButlerURI`
1717 Name of resource to calculate checksum from.
1718 algorithm : `str`, optional
1719 Name of algorithm to use. Must be one of the algorithms supported
1720 by :py:mod:`hashlib`.
1721 block_size : `int`, optional
1722 Number of bytes to read from file at one time.
1724 Returns
1725 -------
1726 hexdigest : `str`
1727 Hex digest of the file.
1729 Notes
1730 -----
1731 Currently returns `None` if the URI is for a remote resource.
1732 """
1733 if algorithm not in hashlib.algorithms_guaranteed: 1733 ↛ 1734line 1733 didn't jump to line 1734, because the condition on line 1733 was never true
1734 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1736 if not uri.isLocal: 1736 ↛ 1737line 1736 didn't jump to line 1737, because the condition on line 1736 was never true
1737 return None
1739 hasher = hashlib.new(algorithm)
1741 with uri.as_local() as local_uri:
1742 with open(local_uri.ospath, "rb") as f:
1743 for chunk in iter(lambda: f.read(block_size), b""):
1744 hasher.update(chunk)
1746 return hasher.hexdigest()
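# --- Editorial sketch (not part of the original source) ---------------------
# computeChecksum() above streams the file through hashlib in block_size
# chunks.  The stand-alone stdlib snippet below reproduces that pattern for a
# local path.

import hashlib

def checksum_sketch(path: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    hasher = hashlib.new(algorithm)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

# e.g. checksum_sketch("/etc/hostname")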