Coverage for python/lsst/daf/butler/datastores/fileDatastore.py : 83%

1 # This file is part of daf_butler.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
23 """Generic file-based datastore code."""
25 __all__ = ("FileDatastore", )
27 import hashlib
28 import logging
29 import os
30 import tempfile
32 from sqlalchemy import BigInteger, String
34 from dataclasses import dataclass
35 from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48 )
50 from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetId,
56 DatasetRef,
57 DatasetType,
58 DatasetTypeNotSupportedError,
59 Datastore,
60 DatastoreCacheManager,
61 DatastoreDisabledCacheManager,
62 DatastoreConfig,
63 DatastoreValidationError,
64 FileDescriptor,
65 FileTemplates,
66 FileTemplateValidationError,
67 Formatter,
68 FormatterFactory,
69 Location,
70 LocationFactory,
71 Progress,
72 StorageClass,
73 StoredFileInfo,
74 )
76 from lsst.daf.butler import ddl
77 from lsst.daf.butler.registry.interfaces import (
78 ReadOnlyDatabaseError,
79 DatastoreRegistryBridge,
80 )
82 from lsst.daf.butler.core.repoRelocation import replaceRoot
83 from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
84 from .genericDatastore import GenericBaseDatastore
86 if TYPE_CHECKING:  86 ↛ 87
87 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager
88 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
90 log = logging.getLogger(__name__)
92 # String to use when a Python None is encountered
93 NULLSTR = "__NULL_STRING__"
96 class _IngestPrepData(Datastore.IngestPrepData):
97 """Helper class for FileDatastore ingest implementation.
99 Parameters
100 ----------
101 datasets : `list` of `FileDataset`
102 Files to be ingested by this datastore.
103 """
104 def __init__(self, datasets: List[FileDataset]):
105 super().__init__(ref for dataset in datasets for ref in dataset.refs)
106 self.datasets = datasets
109 @dataclass(frozen=True)
110 class DatastoreFileGetInformation:
111 """Collection of useful parameters needed to retrieve a file from
112 a Datastore.
113 """
115 location: Location
116 """The location from which to read the dataset."""
118 formatter: Formatter
119 """The `Formatter` to use to deserialize the dataset."""
121 info: StoredFileInfo
122 """Stored information about this file and its formatter."""
124 assemblerParams: Dict[str, Any]
125 """Parameters to use for post-processing the retrieved dataset."""
127 formatterParams: Dict[str, Any]
128 """Parameters that were understood by the associated formatter."""
130 component: Optional[str]
131 """The component to be retrieved (can be `None`)."""
133 readStorageClass: StorageClass
134 """The `StorageClass` of the dataset being read."""
137 class FileDatastore(GenericBaseDatastore):
138 """Generic Datastore for file-based implementations.
140 Should always be sub-classed since key abstract methods are missing.
142 Parameters
143 ----------
144 config : `DatastoreConfig` or `str`
145 Configuration as either a `Config` object or URI to file.
146 bridgeManager : `DatastoreRegistryBridgeManager`
147 Object that manages the interface between `Registry` and datastores.
148 butlerRoot : `str`, optional
149 New datastore root to use to override the configuration value.
151 Raises
152 ------
153 ValueError
154 If root location does not exist and ``create`` is `False` in the
155 configuration.
156 """
158 defaultConfigFile: ClassVar[Optional[str]] = None
159 """Path to configuration defaults. Accessed within the ``config`` resource
160 or relative to a search path. Can be None if no defaults specified.
161 """
163 root: ButlerURI
164 """Root directory URI of this `Datastore`."""
166 locationFactory: LocationFactory
167 """Factory for creating locations relative to the datastore root."""
169 formatterFactory: FormatterFactory
170 """Factory for creating instances of formatters."""
172 templates: FileTemplates
173 """File templates that can be used by this `Datastore`."""
175 composites: CompositesMap
176 """Determines whether a dataset should be disassembled on put."""
178 defaultConfigFile = "datastores/fileDatastore.yaml"
179 """Path to configuration defaults. Accessed within the ``config`` resource
180 or relative to a search path. Can be None if no defaults specified.
181 """
183 @classmethod
184 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
185 """Set any filesystem-dependent config options for this Datastore to
186 be appropriate for a new empty repository with the given root.
188 Parameters
189 ----------
190 root : `str`
191 URI to the root of the data repository.
192 config : `Config`
193 A `Config` to update. Only the subset understood by
194 this component will be updated. Will not expand
195 defaults.
196 full : `Config`
197 A complete config with all defaults expanded that can be
198 converted to a `DatastoreConfig`. Read-only and will not be
199 modified by this method.
200 Repository-specific options that should not be obtained
201 from defaults when Butler instances are constructed
202 should be copied from ``full`` to ``config``.
203 overwrite : `bool`, optional
204 If `False`, do not modify a value in ``config`` if the value
205 already exists. Default is always to overwrite with the provided
206 ``root``.
208 Notes
209 -----
210 If a keyword is explicitly defined in the supplied ``config`` it
211 will not be overridden by this method if ``overwrite`` is `False`.
212 This allows explicit values set in external configs to be retained.
213 """
214 Config.updateParameters(DatastoreConfig, config, full,
215 toUpdate={"root": root},
216 toCopy=("cls", ("records", "table")), overwrite=overwrite)
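# Illustrative sketch of how setConfigRoot is typically exercised, assuming
# ``full`` is a fully expanded datastore config (construction elided); the
# root value is hypothetical:
#
#     config = Config()
#     FileDatastore.setConfigRoot("file:///data/repo", config, full)
#     # ``config`` now holds "root" plus the "cls" and ("records", "table")
#     # entries copied from ``full``; nothing else is modified.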
218 @classmethod
219 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
220 return ddl.TableSpec(
221 fields=[
222 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
223 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
224 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
225 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
226 # Use empty string to indicate no component
227 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
228 # TODO: should checksum be Base64Bytes instead?
229 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
230 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
231 ],
232 unique=frozenset(),
233 )
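# Illustrative sketch of a row in the opaque table defined above; it matches
# the dictionaries built in ``addStoredItemInfo`` below, with hypothetical
# values:
#
#     dict(dataset_id=ref.id,
#          path="run/datasetType/file.fits",        # hypothetical path in store
#          formatter="full.path.to.SomeFormatter",  # hypothetical formatter name
#          storage_class="StructuredDataDict",      # hypothetical storage class
#          component=NULLSTR,                       # sentinel when no component
#          checksum=None,
#          file_size=1024)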
235 def __init__(self, config: Union[DatastoreConfig, str],
236 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
237 super().__init__(config, bridgeManager)
238 if "root" not in self.config:  238 ↛ 239
239 raise ValueError("No root directory specified in configuration")
241 # Name ourselves either using an explicit name or a name
242 # derived from the (unexpanded) root
243 if "name" in self.config:
244 self.name = self.config["name"]
245 else:
246 # We use the unexpanded root in the name to indicate that this
247 # datastore can be moved without having to update registry.
248 self.name = "{}@{}".format(type(self).__name__,
249 self.config["root"])
251 # Support repository relocation in config
252 # Existence of self.root is checked in subclass
253 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
254 forceDirectory=True, forceAbsolute=True)
256 self.locationFactory = LocationFactory(self.root)
257 self.formatterFactory = FormatterFactory()
259 # Now associate formatters with storage classes
260 self.formatterFactory.registerFormatters(self.config["formatters"],
261 universe=bridgeManager.universe)
263 # Read the file naming templates
264 self.templates = FileTemplates(self.config["templates"],
265 universe=bridgeManager.universe)
267 # See if composites should be disassembled
268 self.composites = CompositesMap(self.config["composites"],
269 universe=bridgeManager.universe)
271 tableName = self.config["records", "table"]
272 try:
273 # Storage of paths and formatters, keyed by dataset_id
274 self._table = bridgeManager.opaque.register(
275 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType))
276 # Interface to Registry.
277 self._bridge = bridgeManager.register(self.name)
278 except ReadOnlyDatabaseError:
279 # If the database is read only and we just tried and failed to
280 # create a table, it means someone is trying to create a read-only
281 # butler client for an empty repo. That should be okay, as long
282 # as they then try to get any datasets before some other client
283 # creates the table. Chances are they're just validating
284 # configuration.
285 pass
287 # Determine whether checksums should be used - default to False
288 self.useChecksum = self.config.get("checksum", False)
290 # Determine whether we can fall back to configuration if a
291 # requested dataset is not known to registry
292 self.trustGetRequest = self.config.get("trust_get_request", False)
294 # Create a cache manager
295 self.cacheManager: AbstractDatastoreCacheManager
296 if "cached" in self.config:  296 ↛ 300
297 self.cacheManager = DatastoreCacheManager(self.config["cached"],
298 universe=bridgeManager.universe)
299 else:
300 self.cacheManager = DatastoreDisabledCacheManager("",
301 universe=bridgeManager.universe)
303 # Check existence and create directory structure if necessary
304 if not self.root.exists():
305 if "create" not in self.config or not self.config["create"]:  305 ↛ 306
306 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
307 try:
308 self.root.mkdir()
309 except Exception as e:
310 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
311 f" Got error: {e}") from e
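# Illustrative sketch of the configuration keys consumed by the constructor
# above; the layout and values shown are hypothetical and abbreviated:
#
#     datastore:
#       cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
#       root: <butlerRoot>/datastore
#       create: true
#       checksum: false
#       trust_get_request: false
#       records:
#         table: file_datastore_records
#       templates: {...}
#       formatters: {...}
#       composites: {...}
#       cached: {...}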
313 def __str__(self) -> str:
314 return str(self.root)
316 @property
317 def bridge(self) -> DatastoreRegistryBridge:
318 return self._bridge
320 def _artifact_exists(self, location: Location) -> bool:
321 """Check that an artifact exists in this datastore at the specified
322 location.
324 Parameters
325 ----------
326 location : `Location`
327 Expected location of the artifact associated with this datastore.
329 Returns
330 -------
331 exists : `bool`
332 `True` if the location can be found, `False` otherwise.
333 """
334 log.debug("Checking if resource exists: %s", location.uri)
335 return location.uri.exists()
337 def _delete_artifact(self, location: Location) -> None:
338 """Delete the artifact from the datastore.
340 Parameters
341 ----------
342 location : `Location`
343 Location of the artifact associated with this datastore.
344 """
345 if location.pathInStore.isabs():  345 ↛ 346
346 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
347 log.debug("Deleting file: %s", location.uri)
348 location.uri.remove()
349 log.debug("Successfully deleted file: %s", location.uri)
351 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
352 # Docstring inherited from GenericBaseDatastore
353 records = []
354 for ref, info in zip(refs, infos):
355 # Component should come from ref and fall back on info
356 component = ref.datasetType.component()
357 if component is None and info.component is not None:  357 ↛ 358
358 component = info.component
359 if component is None:
360 # Use empty string since we want this to be part of the
361 # primary key.
362 component = NULLSTR
363 records.append(
364 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
365 storage_class=info.storageClass.name, component=component,
366 checksum=info.checksum, file_size=info.file_size)
367 )
368 self._table.insert(*records)
370 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
371 # Docstring inherited from GenericBaseDatastore
373 # Look for the dataset_id -- there might be multiple matches
374 # if we have disassembled the dataset.
375 records = list(self._table.fetch(dataset_id=ref.id))
377 results = []
378 for record in records:
379 # Convert name of StorageClass to instance
380 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
381 component = record["component"] if (record["component"]
382 and record["component"] != NULLSTR) else None
384 info = StoredFileInfo(formatter=record["formatter"],
385 path=record["path"],
386 storageClass=storageClass,
387 component=component,
388 checksum=record["checksum"],
389 file_size=record["file_size"])
390 results.append(info)
392 return results
394 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]:
395 """Return all dataset refs associated with the supplied path.
397 Parameters
398 ----------
399 pathInStore : `ButlerURI`
400 Path of interest in the data store.
402 Returns
403 -------
404 ids : `set` of `DatasetId`
405 All `DatasetRef` IDs associated with this path.
406 """
407 records = list(self._table.fetch(path=str(pathInStore)))
408 ids = {r["dataset_id"] for r in records}
409 return ids
411 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
412 # Docstring inherited from GenericBaseDatastore
413 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
415 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
416 r"""Find all the `Location`\ s of the requested dataset in the
417 `Datastore` and the associated stored file information.
419 Parameters
420 ----------
421 ref : `DatasetRef`
422 Reference to the required `Dataset`.
424 Returns
425 -------
426 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
427 Location of the dataset within the datastore and
428 stored information about each file and its formatter.
429 """
430 # Get the file information (this will fail if no file)
431 records = self.getStoredItemsInfo(ref)
433 # Use the path to determine the location -- we need to take
434 # into account absolute URIs in the datastore record
435 locations: List[Tuple[Location, StoredFileInfo]] = []
436 for r in records:
437 uriInStore = ButlerURI(r.path, forceAbsolute=False)
438 if uriInStore.isabs():  438 ↛ 439
439 location = Location(None, uriInStore)
440 else:
441 location = self.locationFactory.fromPath(r.path)
442 locations.append((location, r))
443 return locations
445 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
446 """Check that there is only one dataset associated with the
447 specified artifact.
449 Parameters
450 ----------
451 ref : `DatasetRef` or `FakeDatasetRef`
452 Dataset to be removed.
453 location : `Location`
454 The location of the artifact to be removed.
456 Returns
457 -------
458 can_remove : `bool`
459 True if the artifact can be safely removed.
460 """
461 # Can't ever delete absolute URIs.
462 if location.pathInStore.isabs():  462 ↛ 463
463 return False
465 # Get all entries associated with this path
466 allRefs = self._registered_refs_per_artifact(location.pathInStore)
467 if not allRefs:  467 ↛ 468
468 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
470 # Remove these refs from all the refs and if there is nothing left
471 # then we can delete
472 remainingRefs = allRefs - {ref.id}
474 if remainingRefs:
475 return False
476 return True
478 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
479 StoredFileInfo]]:
480 """Predict the location and related file information of the requested
481 dataset in this datastore.
483 Parameters
484 ----------
485 ref : `DatasetRef`
486 Reference to the required `Dataset`.
488 Returns
489 -------
490 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
491 Expected Location of the dataset within the datastore and
492 placeholder information about each file and its formatter.
494 Notes
495 -----
496 Uses the current configuration to determine how we would expect the
497 datastore files to have been written if we couldn't ask registry.
498 This is safe so long as there has been no change to datastore
499 configuration between writing the dataset and wanting to read it.
500 Will not work for files that have been ingested without using the
501 standard file template or default formatter.
502 """
504 # If we have a component ref we always need to ask the questions
505 # of the composite. If the composite is disassembled this routine
506 # should return all components. If the composite was not
507 # disassembled the composite is what is stored regardless of
508 # component request. Note that if the caller has disassembled
509 # a composite there is no way for this guess to know that
510 # without trying both the composite and component ref and seeing
511 # if there is something at the component Location even without
512 # disassembly being enabled.
513 if ref.datasetType.isComponent():
514 ref = ref.makeCompositeRef()
516 # See if the ref is a composite that should be disassembled
517 doDisassembly = self.composites.shouldBeDisassembled(ref)
519 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
521 if doDisassembly:
522 for component, componentStorage in ref.datasetType.storageClass.components.items():
523 compRef = ref.makeComponentRef(component)
524 location, formatter = self._determine_put_formatter_location(compRef)
525 all_info.append((location, formatter, componentStorage, component))
527 else:
528 # Always use the composite ref if no disassembly
529 location, formatter = self._determine_put_formatter_location(ref)
530 all_info.append((location, formatter, ref.datasetType.storageClass, None))
532 # Convert the list of tuples to have StoredFileInfo as second element
533 return [(location, StoredFileInfo(formatter=formatter,
534 path=location.pathInStore.path,
535 storageClass=storageClass,
536 component=component,
537 checksum=None,
538 file_size=-1))
539 for location, formatter, storageClass, component in all_info]
541 def _prepare_for_get(self, ref: DatasetRef,
542 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
543 """Check parameters for ``get`` and obtain formatter and
544 location.
546 Parameters
547 ----------
548 ref : `DatasetRef`
549 Reference to the required Dataset.
550 parameters : `dict`
551 `StorageClass`-specific parameters that specify, for example,
552 a slice of the dataset to be loaded.
554 Returns
555 -------
556 getInfo : `list` [`DatastoreFileGetInformation`]
557 Parameters needed to retrieve each file.
558 """
559 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
561 # Get file metadata and internal metadata
562 fileLocations = self._get_dataset_locations_info(ref)
563 if not fileLocations:
564 if not self.trustGetRequest:
565 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
566 # Assume the dataset is where we think it should be
567 fileLocations = self._get_expected_dataset_locations_info(ref)
569 # The storage class we want to use eventually
570 refStorageClass = ref.datasetType.storageClass
572 if len(fileLocations) > 1:
573 disassembled = True
574 else:
575 disassembled = False
577 # Is this a component request?
578 refComponent = ref.datasetType.component()
580 fileGetInfo = []
581 for location, storedFileInfo in fileLocations:
583 # The storage class used to write the file
584 writeStorageClass = storedFileInfo.storageClass
586 # If this has been disassembled we need read to match the write
587 if disassembled:
588 readStorageClass = writeStorageClass
589 else:
590 readStorageClass = refStorageClass
592 formatter = getInstanceOf(storedFileInfo.formatter,
593 FileDescriptor(location, readStorageClass=readStorageClass,
594 storageClass=writeStorageClass, parameters=parameters),
595 ref.dataId)
597 formatterParams, notFormatterParams = formatter.segregateParameters()
599 # Of the remaining parameters, extract the ones supported by
600 # this StorageClass (for components not all will be handled)
601 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
603 # The ref itself could be a component if the dataset was
604 # disassembled by butler, or we disassembled in datastore and
605 # components came from the datastore records
606 component = storedFileInfo.component if storedFileInfo.component else refComponent
608 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
609 assemblerParams, formatterParams,
610 component, readStorageClass))
612 return fileGetInfo
614 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
615 """Check the arguments for ``put`` and obtain formatter and
616 location.
618 Parameters
619 ----------
620 inMemoryDataset : `object`
621 The dataset to store.
622 ref : `DatasetRef`
623 Reference to the associated Dataset.
625 Returns
626 -------
627 location : `Location`
628 The location to write the dataset.
629 formatter : `Formatter`
630 The `Formatter` to use to write the dataset.
632 Raises
633 ------
634 TypeError
635 Supplied object and storage class are inconsistent.
636 DatasetTypeNotSupportedError
637 The associated `DatasetType` is not handled by this datastore.
638 """
639 self._validate_put_parameters(inMemoryDataset, ref)
640 return self._determine_put_formatter_location(ref)
642 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
643 """Calculate the formatter and output location to use for put.
645 Parameters
646 ----------
647 ref : `DatasetRef`
648 Reference to the associated Dataset.
650 Returns
651 -------
652 location : `Location`
653 The location to write the dataset.
654 formatter : `Formatter`
655 The `Formatter` to use to write the dataset.
656 """
657 # Work out output file name
658 try:
659 template = self.templates.getTemplate(ref)
660 except KeyError as e:
661 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
663 # Validate the template to protect against filenames from different
664 # dataIds returning the same and causing overwrite confusion.
665 template.validateTemplate(ref)
667 location = self.locationFactory.fromPath(template.format(ref))
669 # Get the formatter based on the storage class
670 storageClass = ref.datasetType.storageClass
671 try:
672 formatter = self.formatterFactory.getFormatter(ref,
673 FileDescriptor(location,
674 storageClass=storageClass),
675 ref.dataId)
676 except KeyError as e:
677 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
678 f"{self.name}") from e
680 # Now that we know the formatter, update the location
681 location = formatter.makeUpdatedLocation(location)
683 return location, formatter
685 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
686 # Docstring inherited from base class
687 if transfer != "auto":
688 return transfer
690 # See if the paths are within the datastore or not
691 inside = [self._pathInStore(d.path) is not None for d in datasets]
693 if all(inside):
694 transfer = None
695 elif not any(inside):  695 ↛ 699
696 # Allow ButlerURI to use its own knowledge
697 transfer = "auto"
698 else:
699 raise ValueError("Some datasets are inside the datastore and some are outside."
700 " Please use an explicit transfer mode and not 'auto'.")
702 return transfer
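# Illustrative sketch of how "auto" resolves above, assuming hypothetical
# FileDataset instances whose paths are inside or outside self.root:
#
#     self._overrideTransferMode(inside_ds, transfer="auto")              # -> None
#     self._overrideTransferMode(outside_ds, transfer="auto")             # -> "auto"
#     self._overrideTransferMode(inside_ds, outside_ds, transfer="auto")  # ValueError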
704 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
705 """Return path relative to datastore root
707 Parameters
708 ----------
709 path : `str` or `ButlerURI`
710 Path to dataset. Can be absolute URI. If relative assumed to
711 be relative to the datastore. Returns path in datastore
712 or `None` if the path is outside.
714 Returns
715 -------
716 inStore : `str`
717 Path relative to datastore root. Returns `None` if the file is
718 outside the root.
719 """
720 # Relative path will always be relative to datastore
721 pathUri = ButlerURI(path, forceAbsolute=False)
722 return pathUri.relative_to(self.root)
724 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
725 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
726 """Standardize the path of a to-be-ingested file.
728 Parameters
729 ----------
730 path : `str` or `ButlerURI`
731 Path of a file to be ingested.
732 transfer : `str`, optional
733 How (and whether) the dataset should be added to the datastore.
734 See `ingest` for details of transfer modes.
735 This implementation is provided only so
736 `NotImplementedError` can be raised if the mode is not supported;
737 actual transfers are deferred to `_extractIngestInfo`.
739 Returns
740 -------
741 path : `str` or `ButlerURI`
742 New path in what the datastore considers standard form. If an
743 absolute URI was given that will be returned unchanged.
745 Notes
746 -----
747 Subclasses of `FileDatastore` can implement this method instead
748 of `_prepIngest`. It should not modify the data repository or given
749 file in any way.
751 Raises
752 ------
753 NotImplementedError
754 Raised if the datastore does not support the given transfer mode
755 (including the case where ingest is not supported at all).
756 FileNotFoundError
757 Raised if one of the given files does not exist.
758 """
759 if transfer not in (None, "direct") + self.root.transferModes:  759 ↛ 760
760 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
762 # A relative URI indicates relative to datastore root
763 srcUri = ButlerURI(path, forceAbsolute=False)
764 if not srcUri.isabs():
765 srcUri = self.root.join(path)
767 if not srcUri.exists():
768 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
769 f"are assumed to be relative to {self.root} unless they are absolute.")
771 if transfer is None:
772 relpath = srcUri.relative_to(self.root)
773 if not relpath:
774 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
775 f"within datastore ({self.root})")
777 # Return the relative path within the datastore for internal
778 # transfer
779 path = relpath
781 return path
783 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
784 formatter: Union[Formatter, Type[Formatter]],
785 transfer: Optional[str] = None) -> StoredFileInfo:
786 """Relocate (if necessary) and extract `StoredFileInfo` from a
787 to-be-ingested file.
789 Parameters
790 ----------
791 path : `str` or `ButlerURI`
792 URI or path of a file to be ingested.
793 ref : `DatasetRef`
794 Reference for the dataset being ingested. Guaranteed to have
795 ``dataset_id is not None``.
796 formatter : `type` or `Formatter`
797 `Formatter` subclass to use for this dataset or an instance.
798 transfer : `str`, optional
799 How (and whether) the dataset should be added to the datastore.
800 See `ingest` for details of transfer modes.
802 Returns
803 -------
804 info : `StoredFileInfo`
805 Internal datastore record for this file. This will be inserted by
806 the caller; `_extractIngestInfo` is only responsible for
807 creating and populating the struct.
809 Raises
810 ------
811 FileNotFoundError
812 Raised if one of the given files does not exist.
813 FileExistsError
814 Raised if transfer is not `None` but the (internal) location the
815 file would be moved to is already occupied.
816 """
817 if self._transaction is None:  817 ↛ 818
818 raise RuntimeError("Ingest called without transaction enabled")
820 # Create URI of the source path, do not need to force a relative
821 # path to absolute.
822 srcUri = ButlerURI(path, forceAbsolute=False)
824 # Track whether we have read the size of the source yet
825 have_sized = False
827 tgtLocation: Optional[Location]
828 if transfer is None:
829 # A relative path is assumed to be relative to the datastore
830 # in this context
831 if not srcUri.isabs():
832 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
833 else:
834 # Work out the path in the datastore from an absolute URI
835 # This is required to be within the datastore.
836 pathInStore = srcUri.relative_to(self.root)
837 if pathInStore is None:  837 ↛ 838
838 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
839 f"not within datastore {self.root}")
840 tgtLocation = self.locationFactory.fromPath(pathInStore)
841 elif transfer == "direct":  841 ↛ 846
842 # Want to store the full URI to the resource directly in
843 # datastore. This is useful for referring to permanent archive
844 # storage for raw data.
845 # Trust that people know what they are doing.
846 tgtLocation = None
847 else:
848 # Work out the name we want this ingested file to have
849 # inside the datastore
850 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
851 if not tgtLocation.uri.dirname().exists():
852 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
853 tgtLocation.uri.dirname().mkdir()
855 # if we are transferring from a local file to a remote location
856 # it may be more efficient to get the size and checksum of the
857 # local file rather than the transferred one
858 if not srcUri.scheme or srcUri.scheme == "file":  858 ↛ 864
859 size = srcUri.size()
860 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
861 have_sized = True
863 # transfer the resource to the destination
864 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
866 if tgtLocation is None:  866 ↛ 868
867 # This means we are using direct mode
868 targetUri = srcUri
869 targetPath = str(srcUri)
870 else:
871 targetUri = tgtLocation.uri
872 targetPath = tgtLocation.pathInStore.path
874 # the file should exist in the datastore now
875 if not have_sized:
876 size = targetUri.size()
877 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
879 return StoredFileInfo(formatter=formatter, path=targetPath,
880 storageClass=ref.datasetType.storageClass,
881 component=ref.datasetType.component(),
882 file_size=size, checksum=checksum)
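# Summary sketch of how ``transfer`` is handled above:
#
#     transfer=None     -> the file must already be inside self.root; only the
#                          relative path is recorded.
#     transfer="direct" -> tgtLocation is None; the full source URI is stored
#                          verbatim and the artifact is not moved.
#     anything else     -> the file is transferred to a name derived from the
#                          file template via _calculate_ingested_datastore_name.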
884 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
885 # Docstring inherited from Datastore._prepIngest.
886 filtered = []
887 for dataset in datasets:
888 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
889 if not acceptable:
890 continue
891 else:
892 dataset.refs = acceptable
893 if dataset.formatter is None:
894 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
895 else:
896 assert isinstance(dataset.formatter, (type, str))
897 dataset.formatter = getClassOf(dataset.formatter)
898 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
899 filtered.append(dataset)
900 return _IngestPrepData(filtered)
902 @transactional
903 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
904 # Docstring inherited from Datastore._finishIngest.
905 refsAndInfos = []
906 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
907 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
908 # Do ingest as if the first dataset ref is associated with the file
909 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
910 transfer=transfer)
911 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
912 self._register_datasets(refsAndInfos)
914 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
915 formatter: Union[Formatter, Type[Formatter]]) -> Location:
916 """Given a source URI and a DatasetRef, determine the name the
917 dataset will have inside datastore.
919 Parameters
920 ----------
921 srcUri : `ButlerURI`
922 URI to the source dataset file.
923 ref : `DatasetRef`
924 Ref associated with the newly-ingested dataset artifact. This
925 is used to determine the name within the datastore.
926 formatter : `Formatter` or Formatter class.
927 Formatter to use for validation. Can be a class or an instance.
929 Returns
930 -------
931 location : `Location`
932 Target location for the newly-ingested dataset.
933 """
934 # Ingesting a file from outside the datastore.
935 # This involves a new name.
936 template = self.templates.getTemplate(ref)
937 location = self.locationFactory.fromPath(template.format(ref))
939 # Get the extension
940 ext = srcUri.getExtension()
942 # Update the destination to include that extension
943 location.updateExtension(ext)
945 # Ask the formatter to validate this extension
946 formatter.validateExtension(location)
948 return location
950 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
951 """Write out in memory dataset to datastore.
953 Parameters
954 ----------
955 inMemoryDataset : `object`
956 Dataset to write to datastore.
957 ref : `DatasetRef`
958 Registry information associated with this dataset.
960 Returns
961 -------
962 info : `StoredFileInfo`
963 Information describing the artifact written to the datastore.
964 """
965 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
966 uri = location.uri
968 if not uri.dirname().exists():
969 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
970 uri.dirname().mkdir()
972 if self._transaction is None:  972 ↛ 973
973 raise RuntimeError("Attempting to write artifact without transaction enabled")
975 def _removeFileExists(uri: ButlerURI) -> None:
976 """Remove a file and do not complain if it is not there.
978 This is important since a formatter might fail before the file
979 is written and we should not confuse people by writing spurious
980 error messages to the log.
981 """
982 try:
983 uri.remove()
984 except FileNotFoundError:
985 pass
987 # Register a callback to try to delete the uploaded data if
988 # something fails below
989 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
991 # For a local file, simply use the formatter directly
992 if uri.isLocal:
993 try:
994 formatter.write(inMemoryDataset)
995 except Exception as e:
996 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} "
997 f"to location {uri}") from e
998 log.debug("Successfully wrote python object to local file at %s", uri)
999 else:
1000 # This is a remote URI, so first try to serialize to bytes and write
1001 # directly, else fall back to a temporary file
1002 try:
1003 serializedDataset = formatter.toBytes(inMemoryDataset)
1004 except NotImplementedError:  1004 ↛ 1023
1005 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
1006 # Need to configure the formatter to write to a different
1007 # location and that needs us to overwrite internals
1008 tmpLocation = Location(*os.path.split(tmpFile.name))
1009 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
1010 with formatter._updateLocation(tmpLocation):
1011 try:
1012 formatter.write(inMemoryDataset)
1013 except Exception as e:
1014 raise RuntimeError(f"Failed to serialize dataset {ref} of type"
1015 f" {type(inMemoryDataset)} to "
1016 f"temporary location {tmpLocation.uri}") from e
1017 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
1019 # Cache if required
1020 self.cacheManager.move_to_cache(tmpLocation.uri, ref)
1022 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1023 except Exception as e:
1024 raise RuntimeError(f"Failed to serialize dataset {ref} to bytes.") from e
1025 else:
1026 log.debug("Writing bytes directly to %s", uri)
1027 uri.write(serializedDataset, overwrite=True)
1028 log.debug("Successfully wrote bytes directly to %s", uri)
1030 # URI is needed to resolve which ingest case we are dealing with
1031 return self._extractIngestInfo(uri, ref, formatter=formatter)
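# Summary sketch of the write strategy used above:
#
#     if uri.isLocal:                    formatter.write() straight to uri
#     elif formatter.toBytes() succeeds: uri.write(serializedDataset, overwrite=True)
#     else:                              formatter.write() to a temporary file,
#                                        uri.transfer_from(..., transfer="copy"),
#                                        then cacheManager.move_to_cache() if enabled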
1033 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1034 ref: DatasetRef, isComponent: bool = False) -> Any:
1035 """Read the artifact from datastore into in memory object.
1037 Parameters
1038 ----------
1039 getInfo : `DatastoreFileGetInformation`
1040 Information about the artifact within the datastore.
1041 ref : `DatasetRef`
1042 The registry information associated with this artifact.
1043 isComponent : `bool`
1044 Flag to indicate if a component is being read from this artifact.
1046 Returns
1047 -------
1048 inMemoryDataset : `object`
1049 The artifact as a python object.
1050 """
1051 location = getInfo.location
1052 uri = location.uri
1053 log.debug("Accessing data from %s", uri)
1055 # Cannot recalculate checksum but can compare size as a quick check
1056 # Do not do this if the size is negative since that indicates
1057 # we do not know.
1058 recorded_size = getInfo.info.file_size
1059 resource_size = uri.size()
1060 if recorded_size >= 0 and resource_size != recorded_size:  1060 ↛ 1061
1061 raise RuntimeError("Integrity failure in Datastore. "
1062 f"Size of file {uri} ({resource_size}) "
1063 f"does not match size recorded in registry of {recorded_size}")
1065 # For the general case we have choices for how to proceed.
1066 # 1. Always use a local file (downloading the remote resource to a
1067 # temporary file if needed).
1068 # 2. Use a threshold size and read into memory and use bytes.
1069 # Use both for now with an arbitrary hand off size.
1070 # This allows small datasets to be downloaded from remote object
1071 # stores without requiring a temporary file.
1073 formatter = getInfo.formatter
1074 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1075 if resource_size <= nbytes_max and formatter.can_read_bytes():
1076 serializedDataset = uri.read()
1077 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1078 f"component {getInfo.component}" if isComponent else "",
1079 len(serializedDataset), uri, formatter.name())
1080 try:
1081 result = formatter.fromBytes(serializedDataset,
1082 component=getInfo.component if isComponent else None)
1083 except Exception as e:
1084 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1085 f" ({ref.datasetType.name} from {uri}): {e}") from e
1086 else:
1087 # Read from file.
1089 # Have to update the Location associated with the formatter
1090 # because formatter.read does not allow an override.
1091 # This could be improved.
1092 location_updated = False
1093 msg = ""
1095 # First check in cache for local version.
1096 # The cache will only be relevant for remote resources.
1097 if not uri.isLocal:
1098 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension())
1099 if cached_file is not None:  1099 ↛ 1100
1100 msg = f"(via cache read of remote file {uri})"
1101 uri = cached_file
1102 location_updated = True
1104 with uri.as_local() as local_uri:
1106 # URI was remote and file was downloaded
1107 if uri != local_uri:
1108 cache_msg = ""
1109 location_updated = True
1111 # Cache the downloaded file if needed.
1112 cached_uri = self.cacheManager.move_to_cache(local_uri, ref)
1113 if cached_uri is not None:  1113 ↛ 1114
1114 local_uri = cached_uri
1115 cache_msg = " and cached"
1117 msg = f"(via download to local file{cache_msg})"
1119 # Calculate the (possibly) new location for the formatter
1120 # to use.
1121 newLocation = Location(*local_uri.split()) if location_updated else None
1123 log.debug("Reading%s from location %s %s with formatter %s",
1124 f" component {getInfo.component}" if isComponent else "",
1125 uri, msg, formatter.name())
1126 try:
1127 with formatter._updateLocation(newLocation):
1128 result = formatter.read(component=getInfo.component if isComponent else None)
1129 except Exception as e:
1130 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1131 f" ({ref.datasetType.name} from {uri}): {e}") from e
1133 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1134 isComponent=isComponent)
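# Summary sketch of the read strategy used above: resources no larger than
# nbytes_max whose formatter reports can_read_bytes() are read via uri.read()
# and formatter.fromBytes(); everything else goes through uri.as_local(),
# consulting the datastore cache first for remote resources and populating it
# after download.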
1136 def exists(self, ref: DatasetRef) -> bool:
1137 """Check if the dataset exists in the datastore.
1139 Parameters
1140 ----------
1141 ref : `DatasetRef`
1142 Reference to the required dataset.
1144 Returns
1145 -------
1146 exists : `bool`
1147 `True` if the entity exists in the `Datastore`.
1148 """
1149 fileLocations = self._get_dataset_locations_info(ref)
1151 # if we are being asked to trust that registry might not be correct
1152 # we ask for the expected locations and check them explicitly
1153 if not fileLocations:
1154 if not self.trustGetRequest:
1155 return False
1156 fileLocations = self._get_expected_dataset_locations_info(ref)
1157 for location, _ in fileLocations:
1158 if not self._artifact_exists(location):
1159 return False
1161 return True
1163 def getURIs(self, ref: DatasetRef,
1164 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1165 """Return URIs associated with dataset.
1167 Parameters
1168 ----------
1169 ref : `DatasetRef`
1170 Reference to the required dataset.
1171 predict : `bool`, optional
1172 If the datastore does not know about the dataset, should it
1173 return a predicted URI or not?
1175 Returns
1176 -------
1177 primary : `ButlerURI`
1178 The URI to the primary artifact associated with this dataset.
1179 If the dataset was disassembled within the datastore this
1180 may be `None`.
1181 components : `dict`
1182 URIs to any components associated with the dataset artifact.
1183 Can be empty if there are no components.
1184 """
1186 primary: Optional[ButlerURI] = None
1187 components: Dict[str, ButlerURI] = {}
1189 # if this has never been written then we have to guess
1190 if not self.exists(ref):
1191 if not predict:
1192 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1194 doDisassembly = self.composites.shouldBeDisassembled(ref)
1196 if doDisassembly:
1198 for component, componentStorage in ref.datasetType.storageClass.components.items():
1199 compRef = ref.makeComponentRef(component)
1200 compLocation, _ = self._determine_put_formatter_location(compRef)
1202 # Add a URI fragment to indicate this is a guess
1203 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1205 else:
1207 location, _ = self._determine_put_formatter_location(ref)
1209 # Add a URI fragment to indicate this is a guess
1210 primary = ButlerURI(location.uri.geturl() + "#predicted")
1212 return primary, components
1214 # If this is a ref that we have written we can get the path.
1215 # Get file metadata and internal metadata
1216 fileLocations = self._get_dataset_locations_info(ref)
1218 guessing = False
1219 if not fileLocations:
1220 if not self.trustGetRequest:  1220 ↛ 1221
1221 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1222 fileLocations = self._get_expected_dataset_locations_info(ref)
1223 guessing = True
1225 if len(fileLocations) == 1:
1226 # No disassembly so this is the primary URI
1227 uri = fileLocations[0][0].uri
1228 if guessing and not uri.exists():  1228 ↛ 1229
1229 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1230 primary = uri
1232 else:
1233 for location, storedFileInfo in fileLocations:
1234 if storedFileInfo.component is None:  1234 ↛ 1235
1235 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1236 uri = location.uri
1237 if guessing and not uri.exists():  1237 ↛ 1238
1238 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1239 components[storedFileInfo.component] = uri
1241 return primary, components
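# Illustrative sketch of the two return shapes of getURIs(); component names
# are hypothetical:
#
#     primary, components = self.getURIs(ref)
#     # assembled dataset:    primary is the single ButlerURI, components == {}
#     # disassembled dataset: primary is None, components == {"wcs": ButlerURI(...), ...}
#
#     primary, components = self.getURIs(unstored_ref, predict=True)
#     # predicted URIs carry a "#predicted" fragment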
1243 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1244 """URI to the Dataset.
1246 Parameters
1247 ----------
1248 ref : `DatasetRef`
1249 Reference to the required Dataset.
1250 predict : `bool`
1251 If `True`, allow URIs to be returned of datasets that have not
1252 been written.
1254 Returns
1255 -------
1256 uri : `ButlerURI`
1257 URI pointing to the dataset within the datastore. If the
1258 dataset does not exist in the datastore, and if ``predict`` is
1259 `True`, the URI will be a prediction and will include a URI
1260 fragment "#predicted".
1261 If the datastore does not have entities that relate well
1262 to the concept of a URI the returned URI will be
1263 descriptive. The returned URI is not guaranteed to be obtainable.
1265 Raises
1266 ------
1267 FileNotFoundError
1268 Raised if a URI has been requested for a dataset that does not
1269 exist and guessing is not allowed.
1270 RuntimeError
1271 Raised if a request is made for a single URI but multiple URIs
1272 are associated with this dataset.
1274 Notes
1275 -----
1276 When a predicted URI is requested an attempt will be made to form
1277 a reasonable URI based on file templates and the expected formatter.
1278 """
1279 primary, components = self.getURIs(ref, predict)
1280 if primary is None or components:  1280 ↛ 1281
1281 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1282 "Use Datastore.getURIs() instead.")
1283 return primary
1285 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1286 """Load an InMemoryDataset from the store.
1288 Parameters
1289 ----------
1290 ref : `DatasetRef`
1291 Reference to the required Dataset.
1292 parameters : `dict`
1293 `StorageClass`-specific parameters that specify, for example,
1294 a slice of the dataset to be loaded.
1296 Returns
1297 -------
1298 inMemoryDataset : `object`
1299 Requested dataset or slice thereof as an InMemoryDataset.
1301 Raises
1302 ------
1303 FileNotFoundError
1304 Requested dataset can not be retrieved.
1305 TypeError
1306 Return value from formatter has unexpected type.
1307 ValueError
1308 Formatter failed to process the dataset.
1309 """
1310 allGetInfo = self._prepare_for_get(ref, parameters)
1311 refComponent = ref.datasetType.component()
1313 # Supplied storage class for the component being read
1314 refStorageClass = ref.datasetType.storageClass
1316 # Create mapping from component name to related info
1317 allComponents = {i.component: i for i in allGetInfo}
1319 # By definition the dataset is disassembled if we have more
1320 # than one record for it.
1321 isDisassembled = len(allGetInfo) > 1
1323 # Look for the special case where we are disassembled but the
1324 # component is a derived component that was not written during
1325 # disassembly. For this scenario we need to check that the
1326 # component requested is listed as a derived component for the
1327 # composite storage class
1328 isDisassembledReadOnlyComponent = False
1329 if isDisassembled and refComponent:
1330 # The composite storage class should be accessible through
1331 # the component dataset type
1332 compositeStorageClass = ref.datasetType.parentStorageClass
1334 # In the unlikely scenario where the composite storage
1335 # class is not known, we can only assume that this is a
1336 # normal component. If that assumption is wrong then the
1337 # branch below that reads a persisted component will fail
1338 # so there is no need to complain here.
1339 if compositeStorageClass is not None:  1339 ↛ 1342
1340 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1342 if isDisassembled and not refComponent:
1343 # This was a disassembled dataset spread over multiple files
1344 # and we need to put them all back together again.
1345 # Read into memory and then assemble
1347 # Check that the supplied parameters are suitable for the type read
1348 refStorageClass.validateParameters(parameters)
1350 # We want to keep track of all the parameters that were not used
1351 # by formatters. We assume that if any of the component formatters
1352 # use a parameter then we do not need to apply it again in the
1353 # assembler.
1354 usedParams = set()
1356 components: Dict[str, Any] = {}
1357 for getInfo in allGetInfo:
1358 # assemblerParams are parameters not understood by the
1359 # associated formatter.
1360 usedParams.update(set(getInfo.formatterParams))
1362 component = getInfo.component
1364 if component is None:  1364 ↛ 1365
1365 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1367 # We do not want the formatter to think it's reading
1368 # a component though because it is really reading a
1369 # standalone dataset -- always tell reader it is not a
1370 # component.
1371 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1373 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1375 # Any unused parameters will have to be passed to the assembler
1376 if parameters:
1377 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1378 else:
1379 unusedParams = {}
1381 # Process parameters
1382 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1383 parameters=unusedParams)
1385 elif isDisassembledReadOnlyComponent:
1387 compositeStorageClass = ref.datasetType.parentStorageClass
1388 if compositeStorageClass is None:  1388 ↛ 1389
1389 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since "
1390 "no composite storage class is available.")
1392 if refComponent is None:  1392 ↛ 1394
1393 # Mainly for mypy
1394 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1396 # Assume that every derived component can be calculated by
1397 # forwarding the request to a single read/write component.
1398 # Rather than guessing which rw component is the right one by
1399 # scanning each for a derived component of the same name,
1400 # we ask the storage class delegate directly which one is best to
1401 # use.
1402 compositeDelegate = compositeStorageClass.delegate()
1403 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1404 set(allComponents))
1406 # Select the relevant component
1407 rwInfo = allComponents[forwardedComponent]
1409 # For now assume that read parameters are validated against
1410 # the real component and not the requested component
1411 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1412 forwardedStorageClass.validateParameters(parameters)
1414 # Unfortunately the FileDescriptor inside the formatter will have
1415 # the wrong write storage class so we need to create a new one
1416 # given the immutability constraint.
1417 writeStorageClass = rwInfo.info.storageClass
1419 # We may need to put some thought into parameters for read
1420 # components but for now forward them on as is
1421 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1422 readStorageClass=refStorageClass,
1423 storageClass=writeStorageClass,
1424 parameters=parameters),
1425 ref.dataId)
1427 # The assembler can not receive any parameter requests for a
1428 # derived component at this time since the assembler will
1429 # see the storage class of the derived component and those
1430 # parameters will have to be handled by the formatter on the
1431 # forwarded storage class.
1432 assemblerParams: Dict[str, Any] = {}
1434 # Need to create a new info that specifies the derived
1435 # component and associated storage class
1436 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1437 rwInfo.info, assemblerParams, {},
1438 refComponent, refStorageClass)
1440 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1442 else:
1443 # Single file request or component from that composite file
1444 for lookup in (refComponent, None):  1444 ↛ 1449
1445 if lookup in allComponents:  1445 ↛ 1444
1446 getInfo = allComponents[lookup]
1447 break
1448 else:
1449 raise FileNotFoundError(f"Component {refComponent} not found "
1450 f"for ref {ref} in datastore {self.name}")
1452 # Do not need the component itself if already disassembled
1453 if isDisassembled:
1454 isComponent = False
1455 else:
1456 isComponent = getInfo.component is not None
1458 # For a disassembled component we can validate parameters against
1459 # the component storage class directly
1460 if isDisassembled:
1461 refStorageClass.validateParameters(parameters)
1462 else:
1463 # For an assembled composite this could be a derived
1464 # component derived from a real component. The validity
1465 # of the parameters is not clear. For now validate against
1466 # the composite storage class
1467 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1469 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
1471 @transactional
1472 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1473 """Write an InMemoryDataset with a given `DatasetRef` to the store.
1475 Parameters
1476 ----------
1477 inMemoryDataset : `object`
1478 The dataset to store.
1479 ref : `DatasetRef`
1480 Reference to the associated Dataset.
1482 Raises
1483 ------
1484 TypeError
1485 Supplied object and storage class are inconsistent.
1486 DatasetTypeNotSupportedError
1487 The associated `DatasetType` is not handled by this datastore.
1489 Notes
1490 -----
1491 If the datastore is configured to reject certain dataset types it
1492 is possible that the put will fail and raise a
1493 `DatasetTypeNotSupportedError`. The main use case for this is to
1494 allow `ChainedDatastore` to put to multiple datastores without
1495 requiring that every datastore accepts the dataset.
1496 """
1498 doDisassembly = self.composites.shouldBeDisassembled(ref)
1499 # doDisassembly = True
1501 artifacts = []
1502 if doDisassembly:
1503 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1504 for component, componentInfo in components.items():
1505 # Don't recurse because we want to take advantage of
1506 # bulk insert -- we need a new DatasetRef that refers to the
1507 # same dataset_id but has the component DatasetType.
1508 # The parent DatasetType does not describe its component types,
1509 # so we construct the component ref ourselves.
1510 compRef = ref.makeComponentRef(component)
1511 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1512 artifacts.append((compRef, storedInfo))
1513 else:
1514 # Write the entire thing out
1515 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1516 artifacts.append((ref, storedInfo))
1518 self._register_datasets(artifacts)
1520 @transactional
1521 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1522 """Indicate to the datastore that a dataset can be removed.
1524 Parameters
1525 ----------
1526 ref : `DatasetRef`
1527 Reference to the required Dataset.
1528 ignore_errors : `bool`
1529 If `True`, return without error even if something went wrong.
1530 Problems could occur if another process is simultaneously trying
1531 to delete.
1533 Raises
1534 ------
1535 FileNotFoundError
1536 Attempt to remove a dataset that does not exist.
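
Examples
--------
A minimal sketch; ``datastore`` and ``ref`` are illustrative names
assumed to come from an existing repository. Trashing only marks the
dataset; the artifact is removed when the trash is emptied::

    datastore.trash(ref)
    # Nothing is deleted on disk until the trash is emptied.
    datastore.emptyTrash()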
1537 """
1538 # Get file metadata and internal metadata
1539 log.debug("Trashing %s in datastore %s", ref, self.name)
1541 fileLocations = self._get_dataset_locations_info(ref)
1543 if not fileLocations:
1544 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1545 if ignore_errors:
1546 log.warning(err_msg)
1547 return
1548 else:
1549 raise FileNotFoundError(err_msg)
1551 for location, storedFileInfo in fileLocations:
1552 if not self._artifact_exists(location): 1552 ↛ 1553
1553 err_msg = f"Dataset is known to datastore {self.name} but " \
1554 f"associated artifact ({location.uri}) is missing"
1555 if ignore_errors:
1556 log.warning(err_msg)
1557 return
1558 else:
1559 raise FileNotFoundError(err_msg)
1561 # Mark dataset as trashed
1562 try:
1563 self._move_to_trash_in_registry(ref)
1564 except Exception as e:
1565 if ignore_errors:
1566 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1567 f"but encountered an error: {e}")
1568 pass
1569 else:
1570 raise
1572 @transactional
1573 def emptyTrash(self, ignore_errors: bool = True) -> None:
1574 """Remove all datasets from the trash.
1576 Parameters
1577 ----------
1578 ignore_errors : `bool`
1579 If `True` return without error even if something went wrong.
1580 Problems could occur if another process is simultaneously trying
1581 to delete.
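
Examples
--------
A minimal sketch with an illustrative ``datastore``; passing
``ignore_errors=False`` turns problems into exceptions instead of
logged warnings::

    datastore.emptyTrash(ignore_errors=False)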
1582 """
1583 log.debug("Emptying trash in datastore %s", self.name)
1584 # Context manager will empty trash iff we finish it without raising.
1585 with self.bridge.emptyTrash() as trashed:
1586 for ref in trashed:
1587 fileLocations = self._get_dataset_locations_info(ref)
1589 if not fileLocations: 1589 ↛ 1590
1590 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1591 if ignore_errors:
1592 log.warning(err_msg)
1593 continue
1594 else:
1595 raise FileNotFoundError(err_msg)
1597 for location, _ in fileLocations:
1599 if not self._artifact_exists(location): 1599 ↛ 1600
1600 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1601 if ignore_errors:
1602 log.warning(err_msg)
1603 continue
1604 else:
1605 raise FileNotFoundError(err_msg)
1607 # Can only delete the artifact if there are no references
1608 # to the file from untrashed dataset refs.
1609 if self._can_remove_dataset_artifact(ref, location):
1610 # Point of no return for this artifact
1611 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1612 try:
1613 self._delete_artifact(location)
1614 except Exception as e:
1615 if ignore_errors:
1616 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1617 location.uri, self.name, e)
1618 else:
1619 raise
1621 # We must now remove the entry from the internal registry even
1622 # if the artifact removal failed and was ignored; otherwise
1623 # the removal check above will never be true.
1624 try:
1625 # There may be multiple rows associated with this ref
1626 # depending on disassembly
1627 self.removeStoredItemInfo(ref)
1628 except Exception as e:
1629 if ignore_errors:
1630 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1631 ref.id, location.uri, self.name, e)
1632 continue
1633 else:
1634 raise FileNotFoundError(
1635 f"Error removing dataset {ref.id} ({location.uri}) from internal registry "
1636 f"of {self.name}"
1637 ) from e
1639 @transactional
1640 def forget(self, refs: Iterable[DatasetRef]) -> None:
1641 # Docstring inherited.
1642 refs = list(refs)
1643 self.bridge.forget(refs)
1644 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
1646 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1647 logFailures: bool = False) -> None:
1648 """Validate some of the configuration for this datastore.
1650 Parameters
1651 ----------
1652 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1653 Entities to test against this configuration. Can be differing
1654 types.
1655 logFailures : `bool`, optional
1656 If `True`, output a log message for every validation error
1657 detected.
1659 Raises
1660 ------
1661 DatastoreValidationError
1662 Raised if there is a validation problem with a configuration.
1663 All the problems are reported in a single exception.
1665 Notes
1666 -----
1667 This method checks that all the supplied entities have valid file
1668 templates and also have formatters defined.
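
Examples
--------
A minimal sketch; ``datastore`` and ``datasetType`` are illustrative
names assumed to come from an existing repository::

    from lsst.daf.butler import DatastoreValidationError

    try:
        datastore.validateConfiguration([datasetType], logFailures=True)
    except DatastoreValidationError as e:
        # All template and formatter problems are reported together.
        print(f"Configuration problems: {e}")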
1669 """
1671 templateFailed = None
1672 try:
1673 self.templates.validateTemplates(entities, logFailures=logFailures)
1674 except FileTemplateValidationError as e:
1675 templateFailed = str(e)
1677 formatterFailed = []
1678 for entity in entities:
1679 try:
1680 self.formatterFactory.getFormatterClass(entity)
1681 except KeyError as e:
1682 formatterFailed.append(str(e))
1683 if logFailures: 1683 ↛ 1678
1684 log.critical("Formatter failure: %s", e)
1686 if templateFailed or formatterFailed:
1687 messages = []
1688 if templateFailed: 1688 ↛ 1689
1689 messages.append(templateFailed)
1690 if formatterFailed: 1690 ↛ 1692
1691 messages.append(",".join(formatterFailed))
1692 msg = ";\n".join(messages)
1693 raise DatastoreValidationError(msg)
1695 def getLookupKeys(self) -> Set[LookupKey]:
1696 # Docstring is inherited from base class
1697 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
1698 self.constraints.getLookupKeys()
1700 def validateKey(self, lookupKey: LookupKey,
1701 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1702 # Docstring is inherited from base class
1703 # The key can be valid in either formatters or templates so we can
1704 # only check the template if it exists
1705 if lookupKey in self.templates:
1706 try:
1707 self.templates[lookupKey].validateTemplate(entity)
1708 except FileTemplateValidationError as e:
1709 raise DatastoreValidationError(e) from e
1711 def export(self, refs: Iterable[DatasetRef], *,
1712 directory: Optional[Union[ButlerURI, str]] = None,
1713 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1714 # Docstring inherited from Datastore.export.
1715 if transfer is not None and directory is None: 1715 ↛ 1716
1716 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1717 "export directory given")
1719 # Force the directory to be a URI object
1720 directoryUri: Optional[ButlerURI] = None
1721 if directory is not None: 1721 ↛ 1724
1722 directoryUri = ButlerURI(directory, forceDirectory=True)
1724 if transfer is not None and directoryUri is not None: 1724 ↛ 1729
1725 # mypy needs the second test
1726 if not directoryUri.exists(): 1726 ↛ 1727
1727 raise FileNotFoundError(f"Export location {directory} does not exist")
1729 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
1730 for ref in progress.wrap(refs, "Exporting dataset files"):
1731 fileLocations = self._get_dataset_locations_info(ref)
1732 if not fileLocations: 1732 ↛ 1733
1733 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1734 # For now we cannot export disassembled datasets
1735 if len(fileLocations) > 1:
1736 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
1737 location, storedFileInfo = fileLocations[0]
1739 pathInStore = location.pathInStore.path
1740 if transfer is None: 1740 ↛ 1743
1741 # TODO: do we also need to return the readStorageClass somehow?
1742 # We will use the path in store directly
1743 pass
1744 elif transfer == "direct": 1744 ↛ 1746
1745 # Use full URIs to the remote store in the export
1746 pathInStore = str(location.uri)
1747 else:
1748 # mypy needs help
1749 assert directoryUri is not None, "directoryUri must be defined to get here"
1750 storeUri = ButlerURI(location.uri)
1752 # if the datastore has an absolute URI to a resource, we
1753 # have two options:
1754 # 1. Keep the absolute URI in the exported YAML
1755 # 2. Allocate a new name in the local datastore and transfer
1756 # it.
1757 # For now go with option 2
1758 if location.pathInStore.isabs(): 1758 ↛ 1759
1759 template = self.templates.getTemplate(ref)
1760 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
1761 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
1763 exportUri = directoryUri.join(pathInStore)
1764 exportUri.transfer_from(storeUri, transfer=transfer)
1766 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
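    # A minimal usage sketch for ``export`` (illustrative only: ``datastore``
    # and ``refs`` are assumed to exist, and the export directory must already
    # exist or a FileNotFoundError is raised):
    #
    #     for dataset in datastore.export(refs, directory="/tmp/export",
    #                                     transfer="copy"):
    #         print(dataset.path, dataset.refs)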
1768 @staticmethod
1769 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1770 """Compute the checksum of the supplied file.
1772 Parameters
1773 ----------
1774 uri : `ButlerURI`
1775 Name of resource to calculate checksum from.
1776 algorithm : `str`, optional
1777 Name of algorithm to use. Must be one of the algorithms supported
1778 by :py:mod:`hashlib`.
1779 block_size : `int`, optional
1780 Number of bytes to read from the file at one time.
1782 Returns
1783 -------
1784 hexdigest : `str` or `None`
1785 Hex digest of the file.
1787 Notes
1788 -----
1789 Currently returns `None` if the URI is for a remote resource.
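
Examples
--------
A minimal sketch using a local temporary file; the default ``blake2b``
digest can be compared directly against `hashlib`::

    import hashlib
    import os
    import tempfile

    from lsst.daf.butler import ButlerURI
    from lsst.daf.butler.datastores.fileDatastore import FileDatastore

    with tempfile.NamedTemporaryFile(delete=False) as fh:
        fh.write(b"some bytes")
        path = fh.name
    digest = FileDatastore.computeChecksum(ButlerURI(path))
    assert digest == hashlib.blake2b(b"some bytes").hexdigest()
    os.remove(path)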
1790 """
1791 if algorithm not in hashlib.algorithms_guaranteed: 1791 ↛ 1792
1792 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
1794 if not uri.isLocal: 1794 ↛ 1795
1795 return None
1797 hasher = hashlib.new(algorithm)
1799 with uri.as_local() as local_uri:
1800 with open(local_uri.ospath, "rb") as f:
1801 for chunk in iter(lambda: f.read(block_size), b""):
1802 hasher.update(chunk)
1804 return hasher.hexdigest()
1806 def needs_expanded_data_ids(
1807 self,
1808 transfer: Optional[str],
1809 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
1810 ) -> bool:
1811 # Docstring inherited.
1812 # This _could_ also use entity to inspect whether the filename template
1813 # involves placeholders other than the required dimensions for its
1814 # dataset type, but that's not necessary for correctness; it just
1815 # enables more optimizations (perhaps only in theory).
1816 return transfer not in ("direct", None)