Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 83%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from dataclasses import dataclass
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 ClassVar,
39 Dict,
40 Iterable,
41 List,
42 Mapping,
43 Optional,
44 Set,
45 Tuple,
46 Type,
47 Union,
48)
50from lsst.daf.butler import (
51 ButlerURI,
52 CompositesMap,
53 Config,
54 FileDataset,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreCacheManager,
60 DatastoreDisabledCacheManager,
61 DatastoreConfig,
62 DatastoreValidationError,
63 FileDescriptor,
64 FileTemplates,
65 FileTemplateValidationError,
66 Formatter,
67 FormatterFactory,
68 Location,
69 LocationFactory,
70 Progress,
71 StorageClass,
72 StoredFileInfo,
73)
75from lsst.daf.butler import ddl
76from lsst.daf.butler.registry.interfaces import (
77 ReadOnlyDatabaseError,
78 DatastoreRegistryBridge,
79)
81from lsst.daf.butler.core.repoRelocation import replaceRoot
82from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
83from .genericDatastore import GenericBaseDatastore
85if TYPE_CHECKING:  # 85 ↛ 86: condition was never true
86 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager
87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
89log = logging.getLogger(__name__)
91# String to use when a Python None is encountered
92NULLSTR = "__NULL_STRING__"
95class _IngestPrepData(Datastore.IngestPrepData):
96 """Helper class for FileDatastore ingest implementation.
98 Parameters
99 ----------
100 datasets : `list` of `FileDataset`
101 Files to be ingested by this datastore.
102 """
103 def __init__(self, datasets: List[FileDataset]):
104 super().__init__(ref for dataset in datasets for ref in dataset.refs)
105 self.datasets = datasets
108@dataclass(frozen=True)
109class DatastoreFileGetInformation:
110 """Collection of useful parameters needed to retrieve a file from
111 a Datastore.
112 """
114 location: Location
115 """The location from which to read the dataset."""
117 formatter: Formatter
118 """The `Formatter` to use to deserialize the dataset."""
120 info: StoredFileInfo
121 """Stored information about this file and its formatter."""
123 assemblerParams: Dict[str, Any]
124 """Parameters to use for post-processing the retrieved dataset."""
126 formatterParams: Dict[str, Any]
127 """Parameters that were understood by the associated formatter."""
129 component: Optional[str]
130 """The component to be retrieved (can be `None`)."""
132 readStorageClass: StorageClass
133 """The `StorageClass` of the dataset being read."""
136class FileDatastore(GenericBaseDatastore):
137 """Generic Datastore for file-based implementations.
139 Should always be sub-classed since key abstract methods are missing.
141 Parameters
142 ----------
143 config : `DatastoreConfig` or `str`
144 Configuration as either a `Config` object or URI to file.
145 bridgeManager : `DatastoreRegistryBridgeManager`
146 Object that manages the interface between `Registry` and datastores.
147 butlerRoot : `str`, optional
148 New datastore root to use to override the configuration value.
150 Raises
151 ------
152 ValueError
153 If root location does not exist and ``create`` is `False` in the
154 configuration.
155 """
157 defaultConfigFile: ClassVar[Optional[str]] = None
158 """Path to configuration defaults. Accessed within the ``config`` resource
159 or relative to a search path. Can be None if no defaults specified.
160 """
162 root: ButlerURI
163 """Root directory URI of this `Datastore`."""
165 locationFactory: LocationFactory
166 """Factory for creating locations relative to the datastore root."""
168 formatterFactory: FormatterFactory
169 """Factory for creating instances of formatters."""
171 templates: FileTemplates
172 """File templates that can be used by this `Datastore`."""
174 composites: CompositesMap
175 """Determines whether a dataset should be disassembled on put."""
177 defaultConfigFile = "datastores/fileDatastore.yaml"
178 """Path to configuration defaults. Accessed within the ``config`` resource
179 or relative to a search path. Can be None if no defaults specified.
180 """
182 @classmethod
183 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
184 """Set any filesystem-dependent config options for this Datastore to
185 be appropriate for a new empty repository with the given root.
187 Parameters
188 ----------
189 root : `str`
190 URI to the root of the data repository.
191 config : `Config`
192 A `Config` to update. Only the subset understood by
193 this component will be updated. Will not expand
194 defaults.
195 full : `Config`
196 A complete config with all defaults expanded that can be
197 converted to a `DatastoreConfig`. Read-only and will not be
198 modified by this method.
199 Repository-specific options that should not be obtained
200 from defaults when Butler instances are constructed
201 should be copied from ``full`` to ``config``.
202 overwrite : `bool`, optional
203 If `False`, do not modify a value in ``config`` if the value
204 already exists. Default is always to overwrite with the provided
205 ``root``.
207 Notes
208 -----
209 If a keyword is explicitly defined in the supplied ``config`` it
210 will not be overridden by this method if ``overwrite`` is `False`.
211 This allows explicit values set in external configs to be retained.
212 """
213 Config.updateParameters(DatastoreConfig, config, full,
214 toUpdate={"root": root},
215 toCopy=("cls", ("records", "table")), overwrite=overwrite)
217 @classmethod
218 def makeTableSpec(cls) -> ddl.TableSpec:
219 return ddl.TableSpec(
220 fields=[
221 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True),
222 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
223 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
224 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
225 # Use the NULLSTR sentinel to indicate no component
226 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
227 # TODO: should checksum be Base64Bytes instead?
228 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
229 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
230 ],
231 unique=frozenset(),
232 )
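# A minimal illustrative sketch of the kind of row this spec describes,
# mirroring the dicts built in addStoredItemInfo() below; all field values
# here are hypothetical:
#
#     {
#         "dataset_id": 42,
#         "path": "some/path/inside/datastore.fits",
#         "formatter": "some.package.SomeFormatter",
#         "storage_class": "ExposureF",
#         "component": NULLSTR,   # NULLSTR stands in for "no component" so the
#                                 # column can still be part of the primary key
#         "checksum": None,       # only populated when useChecksum is enabled
#         "file_size": 123456,
#     }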
234 def __init__(self, config: Union[DatastoreConfig, str],
235 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
236 super().__init__(config, bridgeManager)
237 if "root" not in self.config:  # 237 ↛ 238: condition was never true
238 raise ValueError("No root directory specified in configuration")
240 # Name ourselves either using an explicit name or a name
241 # derived from the (unexpanded) root
242 if "name" in self.config:
243 self.name = self.config["name"]
244 else:
245 # We use the unexpanded root in the name to indicate that this
246 # datastore can be moved without having to update registry.
247 self.name = "{}@{}".format(type(self).__name__,
248 self.config["root"])
250 # Support repository relocation in config
251 # Existence of self.root is checked in subclass
252 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
253 forceDirectory=True, forceAbsolute=True)
255 self.locationFactory = LocationFactory(self.root)
256 self.formatterFactory = FormatterFactory()
258 # Now associate formatters with storage classes
259 self.formatterFactory.registerFormatters(self.config["formatters"],
260 universe=bridgeManager.universe)
262 # Read the file naming templates
263 self.templates = FileTemplates(self.config["templates"],
264 universe=bridgeManager.universe)
266 # See if composites should be disassembled
267 self.composites = CompositesMap(self.config["composites"],
268 universe=bridgeManager.universe)
270 tableName = self.config["records", "table"]
271 try:
272 # Storage of paths and formatters, keyed by dataset_id
273 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
274 # Interface to Registry.
275 self._bridge = bridgeManager.register(self.name)
276 except ReadOnlyDatabaseError:
277 # If the database is read only and we just tried and failed to
278 # create a table, it means someone is trying to create a read-only
279 # butler client for an empty repo. That should be okay, as long
280 # as they then try to get any datasets before some other client
281 # creates the table. Chances are they're just validating
282 # configuration.
283 pass
285 # Determine whether checksums should be used - default to False
286 self.useChecksum = self.config.get("checksum", False)
288 # Determine whether we can fall back to configuration if a
289 # requested dataset is not known to registry
290 self.trustGetRequest = self.config.get("trust_get_request", False)
292 # Create a cache manager
293 self.cacheManager: AbstractDatastoreCacheManager
294 if "cached" in self.config:  # 294 ↛ 298: condition was never false
295 self.cacheManager = DatastoreCacheManager(self.config["cached"],
296 universe=bridgeManager.universe)
297 else:
298 self.cacheManager = DatastoreDisabledCacheManager("",
299 universe=bridgeManager.universe)
301 # Check existence and create directory structure if necessary
302 if not self.root.exists():
303 if "create" not in self.config or not self.config["create"]:  # 303 ↛ 304: condition was never true
304 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
305 try:
306 self.root.mkdir()
307 except Exception as e:
308 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
309 f" Got error: {e}") from e
311 def __str__(self) -> str:
312 return str(self.root)
314 @property
315 def bridge(self) -> DatastoreRegistryBridge:
316 return self._bridge
318 def _artifact_exists(self, location: Location) -> bool:
319 """Check that an artifact exists in this datastore at the specified
320 location.
322 Parameters
323 ----------
324 location : `Location`
325 Expected location of the artifact associated with this datastore.
327 Returns
328 -------
329 exists : `bool`
330 `True` if the location can be found, `False` otherwise.
331 """
332 log.debug("Checking if resource exists: %s", location.uri)
333 return location.uri.exists()
335 def _delete_artifact(self, location: Location) -> None:
336 """Delete the artifact from the datastore.
338 Parameters
339 ----------
340 location : `Location`
341 Location of the artifact associated with this datastore.
342 """
343 if location.pathInStore.isabs():  # 343 ↛ 344: condition was never true
344 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
345 log.debug("Deleting file: %s", location.uri)
346 location.uri.remove()
347 log.debug("Successfully deleted file: %s", location.uri)
349 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
350 # Docstring inherited from GenericBaseDatastore
351 records = []
352 for ref, info in zip(refs, infos):
353 # Component should come from ref and fall back on info
354 component = ref.datasetType.component()
355 if component is None and info.component is not None:  # 355 ↛ 356: condition was never true
356 component = info.component
357 if component is None:
358 # Use the NULLSTR sentinel since we want this to be part
359 # of the primary key.
360 component = NULLSTR
361 records.append(
362 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
363 storage_class=info.storageClass.name, component=component,
364 checksum=info.checksum, file_size=info.file_size)
365 )
366 self._table.insert(*records)
368 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
369 # Docstring inherited from GenericBaseDatastore
371 # Look for the dataset_id -- there might be multiple matches
372 # if we have disassembled the dataset.
373 records = list(self._table.fetch(dataset_id=ref.id))
375 results = []
376 for record in records:
377 # Convert name of StorageClass to instance
378 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
379 component = record["component"] if (record["component"]
380 and record["component"] != NULLSTR) else None
382 info = StoredFileInfo(formatter=record["formatter"],
383 path=record["path"],
384 storageClass=storageClass,
385 component=component,
386 checksum=record["checksum"],
387 file_size=record["file_size"])
388 results.append(info)
390 return results
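# Round-trip of the component sentinel, in outline: addStoredItemInfo() stores
# NULLSTR when a ref has no component, and getStoredItemsInfo() maps NULLSTR
# (or an empty value) back to `None`, so callers only ever see real component
# names or `None`.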
392 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]:
393 """Return all dataset refs associated with the supplied path.
395 Parameters
396 ----------
397 pathInStore : `ButlerURI`
398 Path of interest in the data store.
400 Returns
401 -------
402 ids : `set` of `int`
403 All `DatasetRef` IDs associated with this path.
404 """
405 records = list(self._table.fetch(path=str(pathInStore)))
406 ids = {r["dataset_id"] for r in records}
407 return ids
409 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
410 # Docstring inherited from GenericBaseDatastore
411 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
413 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
414 r"""Find all the `Location`\ s of the requested dataset in the
415 `Datastore` and the associated stored file information.
417 Parameters
418 ----------
419 ref : `DatasetRef`
420 Reference to the required `Dataset`.
422 Returns
423 -------
424 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
425 Location of the dataset within the datastore and
426 stored information about each file and its formatter.
427 """
428 # Get the file information (this will fail if no file)
429 records = self.getStoredItemsInfo(ref)
431 # Use the path to determine the location -- we need to take
432 # into account absolute URIs in the datastore record
433 locations: List[Tuple[Location, StoredFileInfo]] = []
434 for r in records:
435 uriInStore = ButlerURI(r.path, forceAbsolute=False)
436 if uriInStore.isabs():  # 436 ↛ 437: condition was never true
437 location = Location(None, uriInStore)
438 else:
439 location = self.locationFactory.fromPath(r.path)
440 locations.append((location, r))
441 return locations
443 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
444 """Check that there is only one dataset associated with the
445 specified artifact.
447 Parameters
448 ----------
449 ref : `DatasetRef` or `FakeDatasetRef`
450 Dataset to be removed.
451 location : `Location`
452 The location of the artifact to be removed.
454 Returns
455 -------
456 can_remove : `bool`
457 True if the artifact can be safely removed.
458 """
459 # Can't ever delete absolute URIs.
460 if location.pathInStore.isabs():  # 460 ↛ 461: condition was never true
461 return False
463 # Get all entries associated with this path
464 allRefs = self._registered_refs_per_artifact(location.pathInStore)
465 if not allRefs:  # 465 ↛ 466: condition was never true
466 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
468 # Remove these refs from all the refs and if there is nothing left
469 # then we can delete
470 remainingRefs = allRefs - {ref.id}
472 if remainingRefs:
473 return False
474 return True
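# Worked example of the check above (hypothetical dataset IDs): if the path is
# shared by datasets {12, 13} and ref.id == 12, remainingRefs == {13} and the
# artifact is kept; if the path is referenced only by {12}, remainingRefs is
# empty and the artifact can be removed.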
476 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
477 StoredFileInfo]]:
478 """Predict the location and related file information of the requested
479 dataset in this datastore.
481 Parameters
482 ----------
483 ref : `DatasetRef`
484 Reference to the required `Dataset`.
486 Returns
487 -------
488 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
489 Expected Location of the dataset within the datastore and
490 placeholder information about each file and its formatter.
492 Notes
493 -----
494 Uses the current configuration to determine how we would expect the
495 datastore files to have been written if we couldn't ask registry.
496 This is safe so long as there has been no change to datastore
497 configuration between writing the dataset and wanting to read it.
498 Will not work for files that have been ingested without using the
499 standard file template or default formatter.
500 """
502 # If we have a component ref we always need to ask the questions
503 # of the composite. If the composite is disassembled this routine
504 # should return all components. If the composite was not
505 # disassembled the composite is what is stored regardless of
506 # component request. Note that if the caller has disassembled
507 # a composite there is no way for this guess to know that
508 # without trying both the composite and component ref and seeing
509 # if there is something at the component Location even without
510 # disassembly being enabled.
511 if ref.datasetType.isComponent():
512 ref = ref.makeCompositeRef()
514 # See if the ref is a composite that should be disassembled
515 doDisassembly = self.composites.shouldBeDisassembled(ref)
517 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
519 if doDisassembly:
520 for component, componentStorage in ref.datasetType.storageClass.components.items():
521 compRef = ref.makeComponentRef(component)
522 location, formatter = self._determine_put_formatter_location(compRef)
523 all_info.append((location, formatter, componentStorage, component))
525 else:
526 # Always use the composite ref if no disassembly
527 location, formatter = self._determine_put_formatter_location(ref)
528 all_info.append((location, formatter, ref.datasetType.storageClass, None))
530 # Convert the list of tuples to have StoredFileInfo as second element
531 return [(location, StoredFileInfo(formatter=formatter,
532 path=location.pathInStore.path,
533 storageClass=storageClass,
534 component=component,
535 checksum=None,
536 file_size=-1))
537 for location, formatter, storageClass, component in all_info]
539 def _prepare_for_get(self, ref: DatasetRef,
540 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
541 """Check parameters for ``get`` and obtain formatter and
542 location.
544 Parameters
545 ----------
546 ref : `DatasetRef`
547 Reference to the required Dataset.
548 parameters : `dict`
549 `StorageClass`-specific parameters that specify, for example,
550 a slice of the dataset to be loaded.
552 Returns
553 -------
554 getInfo : `list` [`DatastoreFileGetInformation`]
555 Parameters needed to retrieve each file.
556 """
557 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
559 # Get file metadata and internal metadata
560 fileLocations = self._get_dataset_locations_info(ref)
561 if not fileLocations:
562 if not self.trustGetRequest:
563 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
564 # Assume the dataset is where we think it should be
565 fileLocations = self._get_expected_dataset_locations_info(ref)
567 # The storage class we want to use eventually
568 refStorageClass = ref.datasetType.storageClass
570 if len(fileLocations) > 1:
571 disassembled = True
572 else:
573 disassembled = False
575 # Is this a component request?
576 refComponent = ref.datasetType.component()
578 fileGetInfo = []
579 for location, storedFileInfo in fileLocations:
581 # The storage class used to write the file
582 writeStorageClass = storedFileInfo.storageClass
584 # If this has been disassembled we need read to match the write
585 if disassembled:
586 readStorageClass = writeStorageClass
587 else:
588 readStorageClass = refStorageClass
590 formatter = getInstanceOf(storedFileInfo.formatter,
591 FileDescriptor(location, readStorageClass=readStorageClass,
592 storageClass=writeStorageClass, parameters=parameters),
593 ref.dataId)
595 formatterParams, notFormatterParams = formatter.segregateParameters()
597 # Of the remaining parameters, extract the ones supported by
598 # this StorageClass (for components not all will be handled)
599 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
601 # The ref itself could be a component if the dataset was
602 # disassembled by butler, or we disassembled in datastore and
603 # components came from the datastore records
604 component = storedFileInfo.component if storedFileInfo.component else refComponent
606 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
607 assemblerParams, formatterParams,
608 component, readStorageClass))
610 return fileGetInfo
612 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
613 """Check the arguments for ``put`` and obtain formatter and
614 location.
616 Parameters
617 ----------
618 inMemoryDataset : `object`
619 The dataset to store.
620 ref : `DatasetRef`
621 Reference to the associated Dataset.
623 Returns
624 -------
625 location : `Location`
626 The location to write the dataset.
627 formatter : `Formatter`
628 The `Formatter` to use to write the dataset.
630 Raises
631 ------
632 TypeError
633 Supplied object and storage class are inconsistent.
634 DatasetTypeNotSupportedError
635 The associated `DatasetType` is not handled by this datastore.
636 """
637 self._validate_put_parameters(inMemoryDataset, ref)
638 return self._determine_put_formatter_location(ref)
640 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
641 """Calculate the formatter and output location to use for put.
643 Parameters
644 ----------
645 ref : `DatasetRef`
646 Reference to the associated Dataset.
648 Returns
649 -------
650 location : `Location`
651 The location to write the dataset.
652 formatter : `Formatter`
653 The `Formatter` to use to write the dataset.
654 """
655 # Work out output file name
656 try:
657 template = self.templates.getTemplate(ref)
658 except KeyError as e:
659 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
661 # Validate the template to protect against filenames from different
662 # dataIds returning the same and causing overwrite confusion.
663 template.validateTemplate(ref)
665 location = self.locationFactory.fromPath(template.format(ref))
667 # Get the formatter based on the storage class
668 storageClass = ref.datasetType.storageClass
669 try:
670 formatter = self.formatterFactory.getFormatter(ref,
671 FileDescriptor(location,
672 storageClass=storageClass),
673 ref.dataId)
674 except KeyError as e:
675 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
676 f"{self.name}") from e
678 # Now that we know the formatter, update the location
679 location = formatter.makeUpdatedLocation(location)
681 return location, formatter
683 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
684 # Docstring inherited from base class
685 if transfer != "auto":
686 return transfer
688 # See if the paths are within the datastore or not
689 inside = [self._pathInStore(d.path) is not None for d in datasets]
691 if all(inside):
692 transfer = None
693 elif not any(inside):  # 693 ↛ 697: condition was never false
694 # Allow ButlerURI to use its own knowledge
695 transfer = "auto"
696 else:
697 raise ValueError("Some datasets are inside the datastore and some are outside."
698 " Please use an explicit transfer mode and not 'auto'.")
700 return transfer
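# Rough sketch of how the "auto" override above resolves (paths hypothetical):
#   every dataset path inside the datastore root  -> transfer = None
#   every dataset path outside the root           -> transfer = "auto"
#   a mixture of inside and outside               -> ValueError; the caller
#                                                    must pick an explicit mode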
702 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
703 """Return path relative to datastore root
705 Parameters
706 ----------
707 path : `str` or `ButlerURI`
708 Path to dataset. Can be absolute URI. If relative assumed to
709 be relative to the datastore. Returns the path within the
710 datastore, or `None` if the path is outside it.
712 Returns
713 -------
714 inStore : `str` or `None`
715 Path relative to datastore root. Returns `None` if the file is
716 outside the root.
717 """
718 # Relative path will always be relative to datastore
719 pathUri = ButlerURI(path, forceAbsolute=False)
720 return pathUri.relative_to(self.root)
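# Expected behaviour in outline, assuming ButlerURI.relative_to() returns
# `None` when a path is not contained in the root (paths are hypothetical):
#   self._pathInStore("a/b/file.fits")                 -> "a/b/file.fits"
#   self._pathInStore(str(self.root.join("c.fits")))   -> "c.fits"
#   self._pathInStore("file:///unrelated/path/c.fits") -> None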
722 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
723 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
724 """Standardize the path of a to-be-ingested file.
726 Parameters
727 ----------
728 path : `str` or `ButlerURI`
729 Path of a file to be ingested.
730 transfer : `str`, optional
731 How (and whether) the dataset should be added to the datastore.
732 See `ingest` for details of transfer modes.
733 This implementation is provided only so
734 `NotImplementedError` can be raised if the mode is not supported;
735 actual transfers are deferred to `_extractIngestInfo`.
737 Returns
738 -------
739 path : `str` or `ButlerURI`
740 New path in what the datastore considers standard form. If an
741 absolute URI was given that will be returned unchanged.
743 Notes
744 -----
745 Subclasses of `FileDatastore` can implement this method instead
746 of `_prepIngest`. It should not modify the data repository or given
747 file in any way.
749 Raises
750 ------
751 NotImplementedError
752 Raised if the datastore does not support the given transfer mode
753 (including the case where ingest is not supported at all).
754 FileNotFoundError
755 Raised if one of the given files does not exist.
756 """
757 if transfer not in (None, "direct") + self.root.transferModes:  # 757 ↛ 758: condition was never true
758 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
760 # A relative URI indicates relative to datastore root
761 srcUri = ButlerURI(path, forceAbsolute=False)
762 if not srcUri.isabs():
763 srcUri = self.root.join(path)
765 if not srcUri.exists():
766 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
767 f"are assumed to be relative to {self.root} unless they are absolute.")
769 if transfer is None:
770 relpath = srcUri.relative_to(self.root)
771 if not relpath:
772 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
773 f"within datastore ({self.root})")
775 # Return the relative path within the datastore for internal
776 # transfer
777 path = relpath
779 return path
781 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
782 formatter: Union[Formatter, Type[Formatter]],
783 transfer: Optional[str] = None) -> StoredFileInfo:
784 """Relocate (if necessary) and extract `StoredFileInfo` from a
785 to-be-ingested file.
787 Parameters
788 ----------
789 path : `str` or `ButlerURI`
790 URI or path of a file to be ingested.
791 ref : `DatasetRef`
792 Reference for the dataset being ingested. Guaranteed to have
793 ``dataset_id not None``.
794 formatter : `type` or `Formatter`
795 `Formatter` subclass to use for this dataset or an instance.
796 transfer : `str`, optional
797 How (and whether) the dataset should be added to the datastore.
798 See `ingest` for details of transfer modes.
800 Returns
801 -------
802 info : `StoredFileInfo`
803 Internal datastore record for this file. This will be inserted by
804 the caller; `_extractIngestInfo` is only responsible for
805 creating and populating the struct.
807 Raises
808 ------
809 FileNotFoundError
810 Raised if one of the given files does not exist.
811 FileExistsError
812 Raised if transfer is not `None` but the (internal) location the
813 file would be moved to is already occupied.
814 """
815 if self._transaction is None:  # 815 ↛ 816: condition was never true
816 raise RuntimeError("Ingest called without transaction enabled")
818 # Create URI of the source path, do not need to force a relative
819 # path to absolute.
820 srcUri = ButlerURI(path, forceAbsolute=False)
822 # Track whether we have read the size of the source yet
823 have_sized = False
825 tgtLocation: Optional[Location]
826 if transfer is None:
827 # A relative path is assumed to be relative to the datastore
828 # in this context
829 if not srcUri.isabs():
830 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
831 else:
832 # Work out the path in the datastore from an absolute URI
833 # This is required to be within the datastore.
834 pathInStore = srcUri.relative_to(self.root)
835 if pathInStore is None:  # 835 ↛ 836: condition was never true
836 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
837 f"not within datastore {self.root}")
838 tgtLocation = self.locationFactory.fromPath(pathInStore)
839 elif transfer == "direct":  # 839 ↛ 844: condition was never true
840 # Want to store the full URI to the resource directly in
841 # datastore. This is useful for referring to permanent archive
842 # storage for raw data.
843 # Trust that people know what they are doing.
844 tgtLocation = None
845 else:
846 # Work out the name we want this ingested file to have
847 # inside the datastore
848 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
849 if not tgtLocation.uri.dirname().exists():
850 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
851 tgtLocation.uri.dirname().mkdir()
853 # if we are transferring from a local file to a remote location
854 # it may be more efficient to get the size and checksum of the
855 # local file rather than the transferred one
856 if not srcUri.scheme or srcUri.scheme == "file":  # 856 ↛ 862: condition was never false
857 size = srcUri.size()
858 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
859 have_sized = True
861 # transfer the resource to the destination
862 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
864 if tgtLocation is None:  # 864 ↛ 866: condition was never true
865 # This means we are using direct mode
866 targetUri = srcUri
867 targetPath = str(srcUri)
868 else:
869 targetUri = tgtLocation.uri
870 targetPath = tgtLocation.pathInStore.path
872 # the file should exist in the datastore now
873 if not have_sized:
874 size = targetUri.size()
875 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
877 return StoredFileInfo(formatter=formatter, path=targetPath,
878 storageClass=ref.datasetType.storageClass,
879 component=ref.datasetType.component(),
880 file_size=size, checksum=checksum)
882 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
883 # Docstring inherited from Datastore._prepIngest.
884 filtered = []
885 for dataset in datasets:
886 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
887 if not acceptable:
888 continue
889 else:
890 dataset.refs = acceptable
891 if dataset.formatter is None:
892 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
893 else:
894 assert isinstance(dataset.formatter, (type, str))
895 dataset.formatter = getClassOf(dataset.formatter)
896 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
897 filtered.append(dataset)
898 return _IngestPrepData(filtered)
900 @transactional
901 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
902 # Docstring inherited from Datastore._finishIngest.
903 refsAndInfos = []
904 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
905 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
906 # Do ingest as if the first dataset ref is associated with the file
907 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
908 transfer=transfer)
909 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
910 self._register_datasets(refsAndInfos)
912 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
913 formatter: Union[Formatter, Type[Formatter]]) -> Location:
914 """Given a source URI and a DatasetRef, determine the name the
915 dataset will have inside datastore.
917 Parameters
918 ----------
919 srcUri : `ButlerURI`
920 URI to the source dataset file.
921 ref : `DatasetRef`
922 Ref associated with the newly-ingested dataset artifact. This
923 is used to determine the name within the datastore.
924 formatter : `Formatter` or Formatter class.
925 Formatter to use for validation. Can be a class or an instance.
927 Returns
928 -------
929 location : `Location`
930 Target location for the newly-ingested dataset.
931 """
932 # Ingesting a file from outside the datastore.
933 # This involves a new name.
934 template = self.templates.getTemplate(ref)
935 location = self.locationFactory.fromPath(template.format(ref))
937 # Get the extension
938 ext = srcUri.getExtension()
940 # Update the destination to include that extension
941 location.updateExtension(ext)
943 # Ask the formatter to validate this extension
944 formatter.validateExtension(location)
946 return location
948 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
949 """Write out in memory dataset to datastore.
951 Parameters
952 ----------
953 inMemoryDataset : `object`
954 Dataset to write to datastore.
955 ref : `DatasetRef`
956 Registry information associated with this dataset.
958 Returns
959 -------
960 info : `StoredFileInfo`
961 Information describing the artifact written to the datastore.
962 """
963 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
964 uri = location.uri
966 if not uri.dirname().exists():
967 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
968 uri.dirname().mkdir()
970 if self._transaction is None:  # 970 ↛ 971: condition was never true
971 raise RuntimeError("Attempting to write artifact without transaction enabled")
973 def _removeFileExists(uri: ButlerURI) -> None:
974 """Remove a file and do not complain if it is not there.
976 This is important since a formatter might fail before the file
977 is written and we should not confuse people by writing spurious
978 error messages to the log.
979 """
980 try:
981 uri.remove()
982 except FileNotFoundError:
983 pass
985 # Register a callback to try to delete the uploaded data if
986 # something fails below
987 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
989 # For a local file, simply use the formatter directly
990 if uri.isLocal:
991 formatter.write(inMemoryDataset)
992 log.debug("Successfully wrote python object to local file at %s", uri)
993 else:
994 # This is a remote URI, so first try bytes and write directly else
995 # fallback to a temporary file
996 try:
997 serializedDataset = formatter.toBytes(inMemoryDataset)
998 log.debug("Writing bytes directly to %s", uri)
999 uri.write(serializedDataset, overwrite=True)
1000 log.debug("Successfully wrote bytes directly to %s", uri)
1001 except NotImplementedError:
1002 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
1003 # Need to configure the formatter to write to a different
1004 # location and that needs us to overwrite internals
1005 tmpLocation = Location(*os.path.split(tmpFile.name))
1006 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
1007 with formatter._updateLocation(tmpLocation):
1008 formatter.write(inMemoryDataset)
1009 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
1011 # Cache if required
1012 self.cacheManager.move_to_cache(tmpLocation.uri, ref)
1014 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1016 # URI is needed to resolve what ingest case are we dealing with
1017 return self._extractIngestInfo(uri, ref, formatter=formatter)
1019 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1020 ref: DatasetRef, isComponent: bool = False) -> Any:
1021 """Read the artifact from datastore into in memory object.
1023 Parameters
1024 ----------
1025 getInfo : `DatastoreFileGetInformation`
1026 Information about the artifact within the datastore.
1027 ref : `DatasetRef`
1028 The registry information associated with this artifact.
1029 isComponent : `bool`
1030 Flag to indicate if a component is being read from this artifact.
1032 Returns
1033 -------
1034 inMemoryDataset : `object`
1035 The artifact as a python object.
1036 """
1037 location = getInfo.location
1038 uri = location.uri
1039 log.debug("Accessing data from %s", uri)
1041 # Cannot recalculate checksum but can compare size as a quick check
1042 # Do not do this if the size is negative since that indicates
1043 # we do not know.
1044 recorded_size = getInfo.info.file_size
1045 resource_size = uri.size()
1046 if recorded_size >= 0 and resource_size != recorded_size:  # 1046 ↛ 1047: condition was never true
1047 raise RuntimeError("Integrity failure in Datastore. "
1048 f"Size of file {uri} ({resource_size}) "
1049 f"does not match size recorded in registry of {recorded_size}")
1051 # For the general case we have choices for how to proceed.
1052 # 1. Always use a local file (downloading the remote resource to a
1053 # temporary file if needed).
1054 # 2. Use a threshold size and read into memory and use bytes.
1055 # Use both for now with an arbitrary hand off size.
1056 # This allows small datasets to be downloaded from remote object
1057 # stores without requiring a temporary file.
1059 formatter = getInfo.formatter
1060 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1061 if resource_size <= nbytes_max and formatter.can_read_bytes():
1062 serializedDataset = uri.read()
1063 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1064 f"component {getInfo.component}" if isComponent else "",
1065 len(serializedDataset), uri, formatter.name())
1066 try:
1067 result = formatter.fromBytes(serializedDataset,
1068 component=getInfo.component if isComponent else None)
1069 except Exception as e:
1070 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1071 f" ({ref.datasetType.name} from {uri}): {e}") from e
1072 else:
1073 # Read from file.
1075 # Have to update the Location associated with the formatter
1076 # because formatter.read does not allow an override.
1077 # This could be improved.
1078 location_updated = False
1079 msg = ""
1081 # First check in cache for local version.
1082 # The cache will only be relevant for remote resources.
1083 if not uri.isLocal:
1084 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension())
1085 if cached_file is not None:  # 1085 ↛ 1086: condition was never true
1086 msg = f"(via cache read of remote file {uri})"
1087 uri = cached_file
1088 location_updated = True
1090 with uri.as_local() as local_uri:
1092 # URI was remote and file was downloaded
1093 if uri != local_uri:
1094 cache_msg = ""
1095 location_updated = True
1097 # Cache the downloaded file if needed.
1098 cached_uri = self.cacheManager.move_to_cache(local_uri, ref)
1099 if cached_uri is not None:  # 1099 ↛ 1100: condition was never true
1100 local_uri = cached_uri
1101 cache_msg = " and cached"
1103 msg = f"(via download to local file{cache_msg})"
1105 # Calculate the (possibly) new location for the formatter
1106 # to use.
1107 newLocation = Location(*local_uri.split()) if location_updated else None
1109 log.debug("Reading%s from location %s %s with formatter %s",
1110 f" component {getInfo.component}" if isComponent else "",
1111 uri, msg, formatter.name())
1112 try:
1113 with formatter._updateLocation(newLocation):
1114 result = formatter.read(component=getInfo.component if isComponent else None)
1115 except Exception as e:
1116 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1117 f" ({ref.datasetType.name} from {uri}): {e}") from e
1119 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1120 isComponent=isComponent)
1122 def exists(self, ref: DatasetRef) -> bool:
1123 """Check if the dataset exists in the datastore.
1125 Parameters
1126 ----------
1127 ref : `DatasetRef`
1128 Reference to the required dataset.
1130 Returns
1131 -------
1132 exists : `bool`
1133 `True` if the entity exists in the `Datastore`.
1134 """
1135 fileLocations = self._get_dataset_locations_info(ref)
1137 # if we are being asked to trust that registry might not be correct
1138 # we ask for the expected locations and check them explicitly
1139 if not fileLocations:
1140 if not self.trustGetRequest:
1141 return False
1142 fileLocations = self._get_expected_dataset_locations_info(ref)
1143 for location, _ in fileLocations:
1144 if not self._artifact_exists(location):
1145 return False
1147 return True
1149 def getURIs(self, ref: DatasetRef,
1150 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1151 """Return URIs associated with dataset.
1153 Parameters
1154 ----------
1155 ref : `DatasetRef`
1156 Reference to the required dataset.
1157 predict : `bool`, optional
1158 If the datastore does not know about the dataset, should it
1159 return a predicted URI or not?
1161 Returns
1162 -------
1163 primary : `ButlerURI`
1164 The URI to the primary artifact associated with this dataset.
1165 If the dataset was disassembled within the datastore this
1166 may be `None`.
1167 components : `dict`
1168 URIs to any components associated with the dataset artifact.
1169 Can be empty if there are no components.
1170 """
1172 primary: Optional[ButlerURI] = None
1173 components: Dict[str, ButlerURI] = {}
1175 # if this has never been written then we have to guess
1176 if not self.exists(ref):
1177 if not predict:
1178 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1180 doDisassembly = self.composites.shouldBeDisassembled(ref)
1182 if doDisassembly:
1184 for component, componentStorage in ref.datasetType.storageClass.components.items():
1185 compRef = ref.makeComponentRef(component)
1186 compLocation, _ = self._determine_put_formatter_location(compRef)
1188 # Add a URI fragment to indicate this is a guess
1189 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1191 else:
1193 location, _ = self._determine_put_formatter_location(ref)
1195 # Add a URI fragment to indicate this is a guess
1196 primary = ButlerURI(location.uri.geturl() + "#predicted")
1198 return primary, components
1200 # If this is a ref that we have written we can get the path.
1201 # Get file metadata and internal metadata
1202 fileLocations = self._get_dataset_locations_info(ref)
1204 guessing = False
1205 if not fileLocations:
1206 if not self.trustGetRequest:  # 1206 ↛ 1207: condition was never true
1207 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1208 fileLocations = self._get_expected_dataset_locations_info(ref)
1209 guessing = True
1211 if len(fileLocations) == 1:
1212 # No disassembly so this is the primary URI
1213 uri = fileLocations[0][0].uri
1214 if guessing and not uri.exists():  # 1214 ↛ 1215: condition was never true
1215 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1216 primary = uri
1218 else:
1219 for location, storedFileInfo in fileLocations:
1220 if storedFileInfo.component is None:  # 1220 ↛ 1221: condition was never true
1221 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1222 uri = location.uri
1223 if guessing and not uri.exists():  # 1223 ↛ 1224: condition was never true
1224 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1225 components[storedFileInfo.component] = uri
1227 return primary, components
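# Hedged usage sketch (variable and component names are hypothetical):
#   primary, components = datastore.getURIs(ref, predict=True)
#   if primary is None:
#       uri = components["wcs"]   # dataset was disassembled in the datastore
#   # predicted URIs carry a "#predicted" fragment, as constructed above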
1229 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1230 """URI to the Dataset.
1232 Parameters
1233 ----------
1234 ref : `DatasetRef`
1235 Reference to the required Dataset.
1236 predict : `bool`
1237 If `True`, allow URIs to be returned of datasets that have not
1238 been written.
1240 Returns
1241 -------
1242 uri : `ButlerURI`
1243 URI pointing to the dataset within the datastore. If the
1244 dataset does not exist in the datastore, and if ``predict`` is
1245 `True`, the URI will be a prediction and will include a URI
1246 fragment "#predicted".
1247 If the datastore does not have entities that relate well
1248 to the concept of a URI the returned URI will be
1249 descriptive. The returned URI is not guaranteed to be obtainable.
1251 Raises
1252 ------
1253 FileNotFoundError
1254 Raised if a URI has been requested for a dataset that does not
1255 exist and guessing is not allowed.
1256 RuntimeError
1257 Raised if a request is made for a single URI but multiple URIs
1258 are associated with this dataset.
1260 Notes
1261 -----
1262 When a predicted URI is requested an attempt will be made to form
1263 a reasonable URI based on file templates and the expected formatter.
1264 """
1265 primary, components = self.getURIs(ref, predict)
1266 if primary is None or components:  # 1266 ↛ 1267: condition was never true
1267 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1268 "Use Datastore.getURIs() instead.")
1269 return primary
1271 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1272 """Load an InMemoryDataset from the store.
1274 Parameters
1275 ----------
1276 ref : `DatasetRef`
1277 Reference to the required Dataset.
1278 parameters : `dict`
1279 `StorageClass`-specific parameters that specify, for example,
1280 a slice of the dataset to be loaded.
1282 Returns
1283 -------
1284 inMemoryDataset : `object`
1285 Requested dataset or slice thereof as an InMemoryDataset.
1287 Raises
1288 ------
1289 FileNotFoundError
1290 Requested dataset can not be retrieved.
1291 TypeError
1292 Return value from formatter has unexpected type.
1293 ValueError
1294 Formatter failed to process the dataset.
1295 """
1296 allGetInfo = self._prepare_for_get(ref, parameters)
1297 refComponent = ref.datasetType.component()
1299 # Supplied storage class for the component being read
1300 refStorageClass = ref.datasetType.storageClass
1302 # Create mapping from component name to related info
1303 allComponents = {i.component: i for i in allGetInfo}
1305 # By definition the dataset is disassembled if we have more
1306 # than one record for it.
1307 isDisassembled = len(allGetInfo) > 1
1309 # Look for the special case where we are disassembled but the
1310 # component is a derived component that was not written during
1311 # disassembly. For this scenario we need to check that the
1312 # component requested is listed as a derived component for the
1313 # composite storage class
1314 isDisassembledReadOnlyComponent = False
1315 if isDisassembled and refComponent:
1316 # The composite storage class should be accessible through
1317 # the component dataset type
1318 compositeStorageClass = ref.datasetType.parentStorageClass
1320 # In the unlikely scenario where the composite storage
1321 # class is not known, we can only assume that this is a
1322 # normal component. If that assumption is wrong then the
1323 # branch below that reads a persisted component will fail
1324 # so there is no need to complain here.
1325 if compositeStorageClass is not None:  # 1325 ↛ 1328: condition was never false
1326 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1328 if isDisassembled and not refComponent:
1329 # This was a disassembled dataset spread over multiple files
1330 # and we need to put them all back together again.
1331 # Read into memory and then assemble
1333 # Check that the supplied parameters are suitable for the type read
1334 refStorageClass.validateParameters(parameters)
1336 # We want to keep track of all the parameters that were not used
1337 # by formatters. We assume that if any of the component formatters
1338 # use a parameter that we do not need to apply it again in the
1339 # assembler.
1340 usedParams = set()
1342 components: Dict[str, Any] = {}
1343 for getInfo in allGetInfo:
1344 # assemblerParams are parameters not understood by the
1345 # associated formatter.
1346 usedParams.update(set(getInfo.formatterParams))
1348 component = getInfo.component
1350 if component is None:  # 1350 ↛ 1351: condition was never true
1351 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1353 # We do not want the formatter to think it's reading
1354 # a component though because it is really reading a
1355 # standalone dataset -- always tell reader it is not a
1356 # component.
1357 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1359 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1361 # Any unused parameters will have to be passed to the assembler
1362 if parameters:
1363 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1364 else:
1365 unusedParams = {}
1367 # Process parameters
1368 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1369 parameters=unusedParams)
1371 elif isDisassembledReadOnlyComponent:
1373 compositeStorageClass = ref.datasetType.parentStorageClass
1374 if compositeStorageClass is None:  # 1374 ↛ 1375: condition was never true
1375 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1376 " no composite storage class is available.")
1378 if refComponent is None:  # 1378 ↛ 1380: condition was never true
1379 # Mainly for mypy
1380 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1382 # Assume that every derived component can be calculated by
1383 # forwarding the request to a single read/write component.
1384 # Rather than guessing which rw component is the right one by
1385 # scanning each for a derived component of the same name,
1386 # we ask the storage class delegate directly which one is best to
1387 # use.
1388 compositeDelegate = compositeStorageClass.delegate()
1389 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1390 set(allComponents))
1392 # Select the relevant component
1393 rwInfo = allComponents[forwardedComponent]
1395 # For now assume that read parameters are validated against
1396 # the real component and not the requested component
1397 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1398 forwardedStorageClass.validateParameters(parameters)
1400 # Unfortunately the FileDescriptor inside the formatter will have
1401 # the wrong write storage class so we need to create a new one
1402 # given the immutability constraint.
1403 writeStorageClass = rwInfo.info.storageClass
1405 # We may need to put some thought into parameters for read
1406 # components but for now forward them on as is
1407 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1408 readStorageClass=refStorageClass,
1409 storageClass=writeStorageClass,
1410 parameters=parameters),
1411 ref.dataId)
1413 # The assembler can not receive any parameter requests for a
1414 # derived component at this time since the assembler will
1415 # see the storage class of the derived component and those
1416 # parameters will have to be handled by the formatter on the
1417 # forwarded storage class.
1418 assemblerParams: Dict[str, Any] = {}
1420 # Need to create a new info that specifies the derived
1421 # component and associated storage class
1422 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1423 rwInfo.info, assemblerParams, {},
1424 refComponent, refStorageClass)
1426 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1428 else:
1429 # Single file request or component from that composite file
1430 for lookup in (refComponent, None):  # 1430 ↛ 1435: loop never completed
1431 if lookup in allComponents:  # 1431 ↛ 1430: condition was never false
1432 getInfo = allComponents[lookup]
1433 break
1434 else:
1435 raise FileNotFoundError(f"Component {refComponent} not found "
1436 f"for ref {ref} in datastore {self.name}")
1438 # Do not need the component itself if already disassembled
1439 if isDisassembled:
1440 isComponent = False
1441 else:
1442 isComponent = getInfo.component is not None
1444 # For a disassembled component we can validate parameters against
1445 # the component storage class directly
1446 if isDisassembled:
1447 refStorageClass.validateParameters(parameters)
1448 else:
1449 # For an assembled composite this could be a derived
1450 # component derived from a real component. The validity
1451 # of the parameters is not clear. For now validate against
1452 # the composite storage class
1453 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1455 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
1457 @transactional
1458 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1459 """Write an InMemoryDataset with a given `DatasetRef` to the store.
1461 Parameters
1462 ----------
1463 inMemoryDataset : `object`
1464 The dataset to store.
1465 ref : `DatasetRef`
1466 Reference to the associated Dataset.
1468 Raises
1469 ------
1470 TypeError
1471 Supplied object and storage class are inconsistent.
1472 DatasetTypeNotSupportedError
1473 The associated `DatasetType` is not handled by this datastore.
1475 Notes
1476 -----
1477 If the datastore is configured to reject certain dataset types it
1478 is possible that the put will fail and raise a
1479 `DatasetTypeNotSupportedError`. The main use case for this is to
1480 allow `ChainedDatastore` to put to multiple datastores without
1481 requiring that every datastore accepts the dataset.
1482 """
1484 doDisassembly = self.composites.shouldBeDisassembled(ref)
1485 # doDisassembly = True
1487 artifacts = []
1488 if doDisassembly:
1489 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1490 for component, componentInfo in components.items():
1491 # Don't recurse because we want to take advantage of
1492 # bulk insert -- need a new DatasetRef that refers to the
1493 # same dataset_id but has the component DatasetType
1494 # DatasetType does not refer to the types of components
1495 # So we construct one ourselves.
1496 compRef = ref.makeComponentRef(component)
1497 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1498 artifacts.append((compRef, storedInfo))
1499 else:
1500 # Write the entire thing out
1501 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1502 artifacts.append((ref, storedInfo))
1504 self._register_datasets(artifacts)
1506 @transactional
1507 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
1508 """Indicate to the datastore that a dataset can be removed.
1510 Parameters
1511 ----------
1512 ref : `DatasetRef`
1513 Reference to the required Dataset.
1514 ignore_errors : `bool`
1515 If `True`, return without error even if something went wrong.
1516 Problems could occur if another process is simultaneously trying
1517 to delete.
1519 Raises
1520 ------
1521 FileNotFoundError
1522 Attempt to remove a dataset that does not exist.
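Examples
--------
An illustrative sketch with hypothetical names ``datastore`` and
``ref``; trashing only marks the dataset, and the file artifact is
removed later by `emptyTrash`:
>>> datastore.trash(ref, ignore_errors=False)  # doctest: +SKIP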
1523 """
1524 # Get file metadata and internal metadata
1525 log.debug("Trashing %s in datastore %s", ref, self.name)
1527 fileLocations = self._get_dataset_locations_info(ref)
1529 if not fileLocations:
1530 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1531 if ignore_errors:
1532 log.warning(err_msg)
1533 return
1534 else:
1535 raise FileNotFoundError(err_msg)
1537 for location, storedFileInfo in fileLocations:
1538 if not self._artifact_exists(location): 1538 ↛ 1539line 1538 didn't jump to line 1539, because the condition on line 1538 was never true
1539 err_msg = f"Dataset is known to datastore {self.name} but " \
1540 f"associated artifact ({location.uri}) is missing"
1541 if ignore_errors:
1542 log.warning(err_msg)
1543 return
1544 else:
1545 raise FileNotFoundError(err_msg)
1547 # Mark dataset as trashed
1548 try:
1549 self._move_to_trash_in_registry(ref)
1550 except Exception as e:
1551 if ignore_errors:
1552 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1553 f"but encountered an error: {e}")
1554 pass
1555 else:
1556 raise
1558 @transactional
1559 def emptyTrash(self, ignore_errors: bool = True) -> None:
1560 """Remove all datasets from the trash.
1562 Parameters
1563 ----------
1564 ignore_errors : `bool`
1565 If `True`, return without error even if something went wrong.
1566 Problems could occur if another process is simultaneously trying
1567 to delete.
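Examples
--------
A sketch assuming ``datastore`` is a `FileDatastore` holding
previously trashed datasets (hypothetical setup):
>>> datastore.emptyTrash(ignore_errors=False)  # doctest: +SKIP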
1568 """
1569 log.debug("Emptying trash in datastore %s", self.name)
1570 # Context manager will empty trash iff we finish it without raising.
1571 with self.bridge.emptyTrash() as trashed:
1572 for ref in trashed:
1573 fileLocations = self._get_dataset_locations_info(ref)
1575 if not fileLocations: 1575 ↛ 1576line 1575 didn't jump to line 1576, because the condition on line 1575 was never true
1576 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
1577 if ignore_errors:
1578 log.warning(err_msg)
1579 continue
1580 else:
1581 raise FileNotFoundError(err_msg)
1583 for location, _ in fileLocations:
1585 if not self._artifact_exists(location): 1585 ↛ 1586line 1585 didn't jump to line 1586, because the condition on line 1585 was never true
1586 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
1587 if ignore_errors:
1588 log.warning(err_msg)
1589 continue
1590 else:
1591 raise FileNotFoundError(err_msg)
1593 # Can only delete the artifact if there are no references
1594 # to the file from untrashed dataset refs.
1595 if self._can_remove_dataset_artifact(ref, location):
1596 # Point of no return for this artifact
1597 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1598 try:
1599 self._delete_artifact(location)
1600 except Exception as e:
1601 if ignore_errors:
1602 log.critical("Encountered error removing artifact %s from datastore %s: %s",
1603 location.uri, self.name, e)
1604 else:
1605 raise
1607 # Now must remove the entry from the internal registry even if
1608 # the artifact removal failed and was ignored; otherwise the
1609 # removal check above will never be true.
1610 try:
1611 # There may be multiple rows associated with this ref
1612 # depending on disassembly
1613 self.removeStoredItemInfo(ref)
1614 except Exception as e:
1615 if ignore_errors:
1616 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
1617 ref.id, location.uri, self.name, e)
1618 continue
1619 else:
1620 raise FileNotFoundError(
1621 f"Error removing dataset {ref.id} ({location.uri}) from internal registry "
1622 f"of {self.name}"
1623 ) from e
1625 @transactional
1626 def forget(self, refs: Iterable[DatasetRef]) -> None:
1627 # Docstring inherited.
1628 refs = list(refs)
1629 self.bridge.forget(refs)
1630 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
1632 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1633 logFailures: bool = False) -> None:
1634 """Validate some of the configuration for this datastore.
1636 Parameters
1637 ----------
1638 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1639 Entities to test against this configuration. Can be differing
1640 types.
1641 logFailures : `bool`, optional
1642 If `True`, output a log message for every validation error
1643 detected.
1645 Raises
1646 ------
1647 DatastoreValidationError
1648 Raised if there is a validation problem with a configuration.
1649 All the problems are reported in a single exception.
1651 Notes
1652 -----
1653 This method checks that all the supplied entities have valid file
1654 templates and also have formatters defined.
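Examples
--------
A sketch of checking a single dataset type before ingest, with
hypothetical names ``datastore`` and ``datasetType``:
>>> try:  # doctest: +SKIP
...     datastore.validateConfiguration([datasetType], logFailures=True)
... except DatastoreValidationError as err:
...     print(f"Configuration problem: {err}")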
1655 """
1657 templateFailed = None
1658 try:
1659 self.templates.validateTemplates(entities, logFailures=logFailures)
1660 except FileTemplateValidationError as e:
1661 templateFailed = str(e)
1663 formatterFailed = []
1664 for entity in entities:
1665 try:
1666 self.formatterFactory.getFormatterClass(entity)
1667 except KeyError as e:
1668 formatterFailed.append(str(e))
1669 if logFailures: 1669 ↛ 1664line 1669 didn't jump to line 1664, because the condition on line 1669 was never false
1670 log.critical("Formatter failure: %s", e)
1672 if templateFailed or formatterFailed:
1673 messages = []
1674 if templateFailed: 1674 ↛ 1675line 1674 didn't jump to line 1675, because the condition on line 1674 was never true
1675 messages.append(templateFailed)
1676 if formatterFailed: 1676 ↛ 1678line 1676 didn't jump to line 1678, because the condition on line 1676 was never false
1677 messages.append(",".join(formatterFailed))
1678 msg = ";\n".join(messages)
1679 raise DatastoreValidationError(msg)
1681 def getLookupKeys(self) -> Set[LookupKey]:
1682 # Docstring is inherited from base class
1683 return (self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys()
1684 | self.constraints.getLookupKeys())
1686 def validateKey(self, lookupKey: LookupKey,
1687 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
1688 # Docstring is inherited from base class
1689 # The key can be valid in either formatters or templates so we can
1690 # only check the template if it exists
1691 if lookupKey in self.templates:
1692 try:
1693 self.templates[lookupKey].validateTemplate(entity)
1694 except FileTemplateValidationError as e:
1695 raise DatastoreValidationError(e) from e
1697 def export(self, refs: Iterable[DatasetRef], *,
1698 directory: Optional[Union[ButlerURI, str]] = None,
1699 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
1700 # Docstring inherited from Datastore.export.
1701 if transfer is not None and directory is None: 1701 ↛ 1702line 1701 didn't jump to line 1702, because the condition on line 1701 was never true
1702 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
1703 "export directory given")
1705 # Force the directory to be a URI object
1706 directoryUri: Optional[ButlerURI] = None
1707 if directory is not None: 1707 ↛ 1710line 1707 didn't jump to line 1710, because the condition on line 1707 was never false
1708 directoryUri = ButlerURI(directory, forceDirectory=True)
1710 if transfer is not None and directoryUri is not None: 1710 ↛ 1715line 1710 didn't jump to line 1715, because the condition on line 1710 was never false
1711 # mypy needs the second test
1712 if not directoryUri.exists(): 1712 ↛ 1713line 1712 didn't jump to line 1713, because the condition on line 1712 was never true
1713 raise FileNotFoundError(f"Export location {directory} does not exist")
1715 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
1716 for ref in progress.wrap(refs, "Exporting dataset files"):
1717 fileLocations = self._get_dataset_locations_info(ref)
1718 if not fileLocations: 1718 ↛ 1719line 1718 didn't jump to line 1719, because the condition on line 1718 was never true
1719 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
1720 # For now we cannot export disassembled datasets.
1721 if len(fileLocations) > 1:
1722 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
1723 location, storedFileInfo = fileLocations[0]
1725 pathInStore = location.pathInStore.path
1726 if transfer is None: 1726 ↛ 1729line 1726 didn't jump to line 1729, because the condition on line 1726 was never true
1727 # TODO: do we also need to return the readStorageClass somehow?
1728 # We will use the path in store directly
1729 pass
1730 elif transfer == "direct": 1730 ↛ 1732line 1730 didn't jump to line 1732, because the condition on line 1730 was never true
1731 # Use full URIs to the remote store in the export
1732 pathInStore = str(location.uri)
1733 else:
1734 # mypy needs help
1735 assert directoryUri is not None, "directoryUri must be defined to get here"
1736 storeUri = ButlerURI(location.uri)
1738 # if the datastore has an absolute URI to a resource, we
1739 # have two options:
1740 # 1. Keep the absolute URI in the exported YAML
1741 # 2. Allocate a new name in the local datastore and transfer
1742 # it.
1743 # For now go with option 2
1744 if location.pathInStore.isabs(): 1744 ↛ 1745line 1744 didn't jump to line 1745, because the condition on line 1744 was never true
1745 template = self.templates.getTemplate(ref)
1746 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
1747 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
1749 exportUri = directoryUri.join(pathInStore)
1750 exportUri.transfer_from(storeUri, transfer=transfer)
1752 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
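# Illustrative usage sketch (hypothetical ``datastore`` and ``refs``):
# exporting dataset files into a local directory by copying them.
#
#     for dataset in datastore.export(refs, directory="/tmp/export",
#                                     transfer="copy"):
#         print(dataset.path, dataset.formatter)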
1754 @staticmethod
1755 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
1756 """Compute the checksum of the supplied file.
1758 Parameters
1759 ----------
1760 uri : `ButlerURI`
1761 Name of resource to calculate checksum from.
1762 algorithm : `str`, optional
1763 Name of algorithm to use. Must be one of the algorithms supported
1764 by :py:mod:`hashlib`.
1765 block_size : `int`, optional
1766 Number of bytes to read from file at one time.
1768 Returns
1769 -------
1770 hexdigest : `str`
1771 Hex digest of the file.
1773 Notes
1774 -----
1775 Currently returns None if the URI is for a remote resource.
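Examples
--------
A minimal sketch for a hypothetical local file; remote URIs
currently yield `None`:
>>> uri = ButlerURI("/tmp/example.fits")  # doctest: +SKIP
>>> FileDatastore.computeChecksum(uri, algorithm="md5", block_size=65536)  # doctest: +SKIP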
1776 """
1777 if algorithm not in hashlib.algorithms_guaranteed: 1777 ↛ 1778line 1777 didn't jump to line 1778, because the condition on line 1777 was never true
1778 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
1780 if not uri.isLocal: 1780 ↛ 1781line 1780 didn't jump to line 1781, because the condition on line 1780 was never true
1781 return None
1783 hasher = hashlib.new(algorithm)
1785 with uri.as_local() as local_uri:
1786 with open(local_uri.ospath, "rb") as f:
1787 for chunk in iter(lambda: f.read(block_size), b""):
1788 hasher.update(chunk)
1790 return hasher.hexdigest()