Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 79%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
29import os
30import tempfile
32from sqlalchemy import BigInteger, String
34from collections import defaultdict
35from dataclasses import dataclass
36from typing import (
37 TYPE_CHECKING,
38 Any,
39 ClassVar,
40 Dict,
41 Iterable,
42 List,
43 Mapping,
44 Optional,
45 Set,
46 Tuple,
47 Type,
48 Union,
49)
51from lsst.daf.butler import (
52 ButlerURI,
53 CompositesMap,
54 Config,
55 FileDataset,
56 DatasetId,
57 DatasetRef,
58 DatasetType,
59 DatasetTypeNotSupportedError,
60 Datastore,
61 DatastoreCacheManager,
62 DatastoreDisabledCacheManager,
63 DatastoreConfig,
64 DatastoreValidationError,
65 FileDescriptor,
66 FileTemplates,
67 FileTemplateValidationError,
68 Formatter,
69 FormatterFactory,
70 Location,
71 LocationFactory,
72 Progress,
73 StorageClass,
74 StoredFileInfo,
75)
77from lsst.daf.butler import ddl
78from lsst.daf.butler.registry.interfaces import (
79 ReadOnlyDatabaseError,
80 DatastoreRegistryBridge,
81)
83from lsst.daf.butler.core.repoRelocation import replaceRoot
84from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional
85from .genericDatastore import GenericBaseDatastore
87if TYPE_CHECKING: 87 ↛ 88 (condition on line 87 was never true)
88 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager
89 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
91log = logging.getLogger(__name__)
94class _IngestPrepData(Datastore.IngestPrepData):
95 """Helper class for FileDatastore ingest implementation.
97 Parameters
98 ----------
99 datasets : `list` of `FileDataset`
100 Files to be ingested by this datastore.
101 """
102 def __init__(self, datasets: List[FileDataset]):
103 super().__init__(ref for dataset in datasets for ref in dataset.refs)
104 self.datasets = datasets
107@dataclass(frozen=True)
108class DatastoreFileGetInformation:
109 """Collection of useful parameters needed to retrieve a file from
110 a Datastore.
111 """
113 location: Location
114 """The location from which to read the dataset."""
116 formatter: Formatter
117 """The `Formatter` to use to deserialize the dataset."""
119 info: StoredFileInfo
120 """Stored information about this file and its formatter."""
122 assemblerParams: Dict[str, Any]
123 """Parameters to use for post-processing the retrieved dataset."""
125 formatterParams: Dict[str, Any]
126 """Parameters that were understood by the associated formatter."""
128 component: Optional[str]
129 """The component to be retrieved (can be `None`)."""
131 readStorageClass: StorageClass
132 """The `StorageClass` of the dataset being read."""
135class FileDatastore(GenericBaseDatastore):
136 """Generic Datastore for file-based implementations.
138 Should always be sub-classed since key abstract methods are missing.
140 Parameters
141 ----------
142 config : `DatastoreConfig` or `str`
143 Configuration as either a `Config` object or URI to file.
144 bridgeManager : `DatastoreRegistryBridgeManager`
145 Object that manages the interface between `Registry` and datastores.
146 butlerRoot : `str`, optional
147 New datastore root to use to override the configuration value.
149 Raises
150 ------
151 ValueError
152 If root location does not exist and ``create`` is `False` in the
153 configuration.
154 """
156 defaultConfigFile: ClassVar[Optional[str]] = None
157 """Path to configuration defaults. Accessed within the ``config`` resource
158 or relative to a search path. Can be None if no defaults specified.
159 """
161 root: ButlerURI
162 """Root directory URI of this `Datastore`."""
164 locationFactory: LocationFactory
165 """Factory for creating locations relative to the datastore root."""
167 formatterFactory: FormatterFactory
168 """Factory for creating instances of formatters."""
170 templates: FileTemplates
171 """File templates that can be used by this `Datastore`."""
173 composites: CompositesMap
174 """Determines whether a dataset should be disassembled on put."""
176 defaultConfigFile = "datastores/fileDatastore.yaml"
177 """Path to configuration defaults. Accessed within the ``config`` resource
178 or relative to a search path. Can be None if no defaults specified.
179 """
181 @classmethod
182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
183 """Set any filesystem-dependent config options for this Datastore to
184 be appropriate for a new empty repository with the given root.
186 Parameters
187 ----------
188 root : `str`
189 URI to the root of the data repository.
190 config : `Config`
191 A `Config` to update. Only the subset understood by
192 this component will be updated. Will not expand
193 defaults.
194 full : `Config`
195 A complete config with all defaults expanded that can be
196 converted to a `DatastoreConfig`. Read-only and will not be
197 modified by this method.
198 Repository-specific options that should not be obtained
199 from defaults when Butler instances are constructed
200 should be copied from ``full`` to ``config``.
201 overwrite : `bool`, optional
202 If `False`, do not modify a value in ``config`` if the value
203 already exists. Default is always to overwrite with the provided
204 ``root``.
206 Notes
207 -----
208 If a keyword is explicitly defined in the supplied ``config`` it
209 will not be overridden by this method if ``overwrite`` is `False`.
210 This allows explicit values set in external configs to be retained.
211 """
212 Config.updateParameters(DatastoreConfig, config, full,
213 toUpdate={"root": root},
214 toCopy=("cls", ("records", "table")), overwrite=overwrite)
216 @classmethod
217 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
218 return ddl.TableSpec(
219 fields=[
220 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
221 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
222 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
223 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
224 # Use empty string to indicate no component
225 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
226 # TODO: should checksum be Base64Bytes instead?
227 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
228 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
229 ],
230 unique=frozenset(),
231 indexes=[tuple(["path"])],
232 )
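# Illustrative sketch (hypothetical values, not from this module): the spec above
# yields one record per (dataset_id, component) pair, with "" standing in for
# "no component":
#   {"dataset_id": <id>, "path": "run/datasetType/file.ext", "formatter": "full.FormatterName",
#    "storage_class": "StorageClassName", "component": "", "checksum": None, "file_size": 1234}
# The index on "path" supports the reverse lookups in _refs_associated_with_artifacts()
# and _registered_refs_per_artifact() below.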
234 def __init__(self, config: Union[DatastoreConfig, str],
235 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
236 super().__init__(config, bridgeManager)
237 if "root" not in self.config: 237 ↛ 238 (condition on line 237 was never true)
238 raise ValueError("No root directory specified in configuration")
240 # Name ourselves either using an explicit name or a name
241 # derived from the (unexpanded) root
242 if "name" in self.config:
243 self.name = self.config["name"]
244 else:
245 # We use the unexpanded root in the name to indicate that this
246 # datastore can be moved without having to update registry.
247 self.name = "{}@{}".format(type(self).__name__,
248 self.config["root"])
250 # Support repository relocation in config
251 # Existence of self.root is checked in subclass
252 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
253 forceDirectory=True, forceAbsolute=True)
255 self.locationFactory = LocationFactory(self.root)
256 self.formatterFactory = FormatterFactory()
258 # Now associate formatters with storage classes
259 self.formatterFactory.registerFormatters(self.config["formatters"],
260 universe=bridgeManager.universe)
262 # Read the file naming templates
263 self.templates = FileTemplates(self.config["templates"],
264 universe=bridgeManager.universe)
266 # See if composites should be disassembled
267 self.composites = CompositesMap(self.config["composites"],
268 universe=bridgeManager.universe)
270 tableName = self.config["records", "table"]
271 try:
272 # Storage of paths and formatters, keyed by dataset_id
273 self._table = bridgeManager.opaque.register(
274 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType))
275 # Interface to Registry.
276 self._bridge = bridgeManager.register(self.name)
277 except ReadOnlyDatabaseError:
278 # If the database is read only and we just tried and failed to
279 # create a table, it means someone is trying to create a read-only
280 # butler client for an empty repo. That should be okay, as long
281 # as they then don't try to get any datasets before some other client
282 # creates the table. Chances are they're just validating
283 # configuration.
284 pass
286 # Determine whether checksums should be used - default to False
287 self.useChecksum = self.config.get("checksum", False)
289 # Determine whether we can fall back to configuration if a
290 # requested dataset is not known to registry
291 self.trustGetRequest = self.config.get("trust_get_request", False)
293 # Create a cache manager
294 self.cacheManager: AbstractDatastoreCacheManager
295 if "cached" in self.config: 295 ↛ 299 (condition on line 295 was never false)
296 self.cacheManager = DatastoreCacheManager(self.config["cached"],
297 universe=bridgeManager.universe)
298 else:
299 self.cacheManager = DatastoreDisabledCacheManager("",
300 universe=bridgeManager.universe)
302 # Check existence and create directory structure if necessary
303 if not self.root.exists():
304 if "create" not in self.config or not self.config["create"]: 304 ↛ 305 (condition on line 304 was never true)
305 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
306 try:
307 self.root.mkdir()
308 except Exception as e:
309 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
310 f" Got error: {e}") from e
312 def __str__(self) -> str:
313 return str(self.root)
315 @property
316 def bridge(self) -> DatastoreRegistryBridge:
317 return self._bridge
319 def _artifact_exists(self, location: Location) -> bool:
320 """Check that an artifact exists in this datastore at the specified
321 location.
323 Parameters
324 ----------
325 location : `Location`
326 Expected location of the artifact associated with this datastore.
328 Returns
329 -------
330 exists : `bool`
331 `True` if the location can be found, `False` otherwise.
332 """
333 log.debug("Checking if resource exists: %s", location.uri)
334 return location.uri.exists()
336 def _delete_artifact(self, location: Location) -> None:
337 """Delete the artifact from the datastore.
339 Parameters
340 ----------
341 location : `Location`
342 Location of the artifact associated with this datastore.
343 """
344 if location.pathInStore.isabs(): 344 ↛ 345 (condition on line 344 was never true)
345 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
347 try:
348 location.uri.remove()
349 except FileNotFoundError:
350 log.debug("File %s did not exist and so could not be deleted.", location.uri)
351 raise
352 except Exception as e:
353 log.critical("Failed to delete file: %s (%s)", location.uri, e)
354 raise
355 log.debug("Successfully deleted file: %s", location.uri)
357 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
358 # Docstring inherited from GenericBaseDatastore
359 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
360 self._table.insert(*records)
362 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
363 # Docstring inherited from GenericBaseDatastore
365 # Look for the dataset_id -- there might be multiple matches
366 # if we have disassembled the dataset.
367 records = self._table.fetch(dataset_id=ref.id)
368 return [StoredFileInfo.from_record(record) for record in records]
370 def _get_stored_records_associated_with_refs(self,
371 refs: Iterable[DatasetIdRef]
372 ) -> Dict[DatasetId, List[StoredFileInfo]]:
373 """Retrieve all records associated with the provided refs.
375 Parameters
376 ----------
377 refs : iterable of `DatasetIdRef`
378 The refs for which records are to be retrieved.
380 Returns
381 -------
382 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
383 The matching records indexed by the ref ID. The number of entries
384 in the dict can be smaller than the number of requested refs.
385 """
386 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
388 # Uniqueness is dataset_id + component so can have multiple records
389 # per ref.
390 records_by_ref = defaultdict(list)
391 for record in records:
392 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
393 return records_by_ref
395 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str,
396 Set[DatasetId]]:
397 """Return paths and associated dataset refs.
399 Parameters
400 ----------
401 paths : `list` of `str` or `ButlerURI`
402 All the paths to include in search.
404 Returns
405 -------
406 mapping : `dict` of [`str`, `set` [`DatasetId`]]
407 Mapping of each path to a set of associated database IDs.
408 """
409 records = self._table.fetch(path=[str(path) for path in paths])
410 result = defaultdict(set)
411 for row in records:
412 result[row["path"]].add(row["dataset_id"])
413 return result
415 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]:
416 """Return all dataset refs associated with the supplied path.
418 Parameters
419 ----------
420 pathInStore : `ButlerURI`
421 Path of interest in the data store.
423 Returns
424 -------
425 ids : `set` of `int`
426 All `DatasetRef` IDs associated with this path.
427 """
428 records = list(self._table.fetch(path=str(pathInStore)))
429 ids = {r["dataset_id"] for r in records}
430 return ids
432 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
433 # Docstring inherited from GenericBaseDatastore
434 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
436 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
437 r"""Find all the `Location`\ s of the requested dataset in the
438 `Datastore` and the associated stored file information.
440 Parameters
441 ----------
442 ref : `DatasetRef`
443 Reference to the required `Dataset`.
445 Returns
446 -------
447 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
448 Location of the dataset within the datastore and
449 stored information about each file and its formatter.
450 """
451 # Get the file information (this will fail if no file)
452 records = self.getStoredItemsInfo(ref)
454 # Use the path to determine the location -- we need to take
455 # into account absolute URIs in the datastore record
456 return [(r.file_location(self.locationFactory), r) for r in records]
458 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
459 """Check that there is only one dataset associated with the
460 specified artifact.
462 Parameters
463 ----------
464 ref : `DatasetRef` or `FakeDatasetRef`
465 Dataset to be removed.
466 location : `Location`
467 The location of the artifact to be removed.
469 Returns
470 -------
471 can_remove : `bool`
472 True if the artifact can be safely removed.
473 """
474 # Can't ever delete absolute URIs.
475 if location.pathInStore.isabs():
476 return False
478 # Get all entries associated with this path
479 allRefs = self._registered_refs_per_artifact(location.pathInStore)
480 if not allRefs:
481 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
483 # Remove these refs from all the refs and if there is nothing left
484 # then we can delete
485 remainingRefs = allRefs - {ref.id}
487 if remainingRefs:
488 return False
489 return True
491 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
492 StoredFileInfo]]:
493 """Predict the location and related file information of the requested
494 dataset in this datastore.
496 Parameters
497 ----------
498 ref : `DatasetRef`
499 Reference to the required `Dataset`.
501 Returns
502 -------
503 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
504 Expected Location of the dataset within the datastore and
505 placeholder information about each file and its formatter.
507 Notes
508 -----
509 Uses the current configuration to determine how we would expect the
510 datastore files to have been written if we couldn't ask registry.
511 This is safe so long as there has been no change to datastore
512 configuration between writing the dataset and wanting to read it.
513 Will not work for files that have been ingested without using the
514 standard file template or default formatter.
515 """
517 # If we have a component ref we always need to ask the questions
518 # of the composite. If the composite is disassembled this routine
519 # should return all components. If the composite was not
520 # disassembled the composite is what is stored regardless of
521 # component request. Note that if the caller has disassembled
522 # a composite there is no way for this guess to know that
523 # without trying both the composite and component ref and seeing
524 # if there is something at the component Location even without
525 # disassembly being enabled.
526 if ref.datasetType.isComponent():
527 ref = ref.makeCompositeRef()
529 # See if the ref is a composite that should be disassembled
530 doDisassembly = self.composites.shouldBeDisassembled(ref)
532 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
534 if doDisassembly:
535 for component, componentStorage in ref.datasetType.storageClass.components.items():
536 compRef = ref.makeComponentRef(component)
537 location, formatter = self._determine_put_formatter_location(compRef)
538 all_info.append((location, formatter, componentStorage, component))
540 else:
541 # Always use the composite ref if no disassembly
542 location, formatter = self._determine_put_formatter_location(ref)
543 all_info.append((location, formatter, ref.datasetType.storageClass, None))
545 # Convert the list of tuples to have StoredFileInfo as second element
546 return [(location, StoredFileInfo(formatter=formatter,
547 path=location.pathInStore.path,
548 storageClass=storageClass,
549 component=component,
550 checksum=None,
551 file_size=-1))
552 for location, formatter, storageClass, component in all_info]
554 def _prepare_for_get(self, ref: DatasetRef,
555 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
556 """Check parameters for ``get`` and obtain formatter and
557 location.
559 Parameters
560 ----------
561 ref : `DatasetRef`
562 Reference to the required Dataset.
563 parameters : `dict`
564 `StorageClass`-specific parameters that specify, for example,
565 a slice of the dataset to be loaded.
567 Returns
568 -------
569 getInfo : `list` [`DatastoreFileGetInformation`]
570 Parameters needed to retrieve each file.
571 """
572 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
574 # Get file metadata and internal metadata
575 fileLocations = self._get_dataset_locations_info(ref)
576 if not fileLocations:
577 if not self.trustGetRequest:
578 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
579 # Assume the dataset is where we think it should be
580 fileLocations = self._get_expected_dataset_locations_info(ref)
582 # The storage class we want to use eventually
583 refStorageClass = ref.datasetType.storageClass
585 if len(fileLocations) > 1:
586 disassembled = True
588 # If trust is involved it is possible that there will be
589 # components listed here that do not exist in the datastore.
590 # Explicitly check for file artifact existence and filter out any
591 # that are missing.
592 if self.trustGetRequest:
593 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
595 # For now complain only if we have no components at all. One
596 # component is probably a problem but we can punt that to the
597 # assembler.
598 if not fileLocations: 598 ↛ 599 (condition on line 598 was never true)
599 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
601 else:
602 disassembled = False
604 # Is this a component request?
605 refComponent = ref.datasetType.component()
607 fileGetInfo = []
608 for location, storedFileInfo in fileLocations:
610 # The storage class used to write the file
611 writeStorageClass = storedFileInfo.storageClass
613 # If this has been disassembled we need read to match the write
614 if disassembled:
615 readStorageClass = writeStorageClass
616 else:
617 readStorageClass = refStorageClass
619 formatter = getInstanceOf(storedFileInfo.formatter,
620 FileDescriptor(location, readStorageClass=readStorageClass,
621 storageClass=writeStorageClass, parameters=parameters),
622 ref.dataId)
624 formatterParams, notFormatterParams = formatter.segregateParameters()
626 # Of the remaining parameters, extract the ones supported by
627 # this StorageClass (for components not all will be handled)
628 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
630 # The ref itself could be a component if the dataset was
631 # disassembled by butler, or we disassembled in datastore and
632 # components came from the datastore records
633 component = storedFileInfo.component if storedFileInfo.component else refComponent
635 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
636 assemblerParams, formatterParams,
637 component, readStorageClass))
639 return fileGetInfo
641 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
642 """Check the arguments for ``put`` and obtain formatter and
643 location.
645 Parameters
646 ----------
647 inMemoryDataset : `object`
648 The dataset to store.
649 ref : `DatasetRef`
650 Reference to the associated Dataset.
652 Returns
653 -------
654 location : `Location`
655 The location to write the dataset.
656 formatter : `Formatter`
657 The `Formatter` to use to write the dataset.
659 Raises
660 ------
661 TypeError
662 Supplied object and storage class are inconsistent.
663 DatasetTypeNotSupportedError
664 The associated `DatasetType` is not handled by this datastore.
665 """
666 self._validate_put_parameters(inMemoryDataset, ref)
667 return self._determine_put_formatter_location(ref)
669 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
670 """Calculate the formatter and output location to use for put.
672 Parameters
673 ----------
674 ref : `DatasetRef`
675 Reference to the associated Dataset.
677 Returns
678 -------
679 location : `Location`
680 The location to write the dataset.
681 formatter : `Formatter`
682 The `Formatter` to use to write the dataset.
683 """
684 # Work out output file name
685 try:
686 template = self.templates.getTemplate(ref)
687 except KeyError as e:
688 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
690 # Validate the template to protect against filenames from different
691 # dataIds returning the same and causing overwrite confusion.
692 template.validateTemplate(ref)
694 location = self.locationFactory.fromPath(template.format(ref))
696 # Get the formatter based on the storage class
697 storageClass = ref.datasetType.storageClass
698 try:
699 formatter = self.formatterFactory.getFormatter(ref,
700 FileDescriptor(location,
701 storageClass=storageClass),
702 ref.dataId)
703 except KeyError as e:
704 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
705 f"{self.name}") from e
707 # Now that we know the formatter, update the location
708 location = formatter.makeUpdatedLocation(location)
710 return location, formatter
712 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
713 # Docstring inherited from base class
714 if transfer != "auto":
715 return transfer
717 # See if the paths are within the datastore or not
718 inside = [self._pathInStore(d.path) is not None for d in datasets]
720 if all(inside):
721 transfer = None
722 elif not any(inside): 722 ↛ 731 (condition on line 722 was never false)
723 # Allow ButlerURI to use its own knowledge
724 transfer = "auto"
725 else:
726 # This can happen when importing from a datastore that
727 # has had some datasets ingested using "direct" mode.
728 # Also allow ButlerURI to sort it out but warn about it.
729 # This can happen if you are importing from a datastore
730 # that had some direct transfer datasets.
731 log.warning("Some datasets are inside the datastore and some are outside. Using 'split' "
732 "transfer mode. This assumes that the files outside the datastore are "
733 "still accessible to the new butler since they will not be copied into "
734 "the target datastore.")
735 transfer = "split"
737 return transfer
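# In summary: with transfer="auto", paths all inside the datastore resolve to None
# (use the files in place), none inside keeps "auto" so ButlerURI can choose, and a
# mixture falls back to "split" with the warning above.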
739 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
740 """Return path relative to datastore root
742 Parameters
743 ----------
744 path : `str` or `ButlerURI`
745 Path to dataset. Can be absolute URI. If relative assumed to
746 be relative to the datastore. Returns path in datastore
747 or `None` if the path is outside.
749 Returns
750 -------
751 inStore : `str`
752 Path relative to datastore root. Returns `None` if the file is
753 outside the root.
754 """
755 # Relative path will always be relative to datastore
756 pathUri = ButlerURI(path, forceAbsolute=False)
757 return pathUri.relative_to(self.root)
759 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
760 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
761 """Standardize the path of a to-be-ingested file.
763 Parameters
764 ----------
765 path : `str` or `ButlerURI`
766 Path of a file to be ingested.
767 transfer : `str`, optional
768 How (and whether) the dataset should be added to the datastore.
769 See `ingest` for details of transfer modes.
770 This implementation is provided only so
771 `NotImplementedError` can be raised if the mode is not supported;
772 actual transfers are deferred to `_extractIngestInfo`.
774 Returns
775 -------
776 path : `str` or `ButlerURI`
777 New path in what the datastore considers standard form. If an
778 absolute URI was given that will be returned unchanged.
780 Notes
781 -----
782 Subclasses of `FileDatastore` can implement this method instead
783 of `_prepIngest`. It should not modify the data repository or given
784 file in any way.
786 Raises
787 ------
788 NotImplementedError
789 Raised if the datastore does not support the given transfer mode
790 (including the case where ingest is not supported at all).
791 FileNotFoundError
792 Raised if one of the given files does not exist.
793 """
794 if transfer not in (None, "direct", "split") + self.root.transferModes: 794 ↛ 795 (condition on line 794 was never true)
795 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
797 # A relative URI indicates relative to datastore root
798 srcUri = ButlerURI(path, forceAbsolute=False)
799 if not srcUri.isabs():
800 srcUri = self.root.join(path)
802 if not srcUri.exists():
803 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
804 f"are assumed to be relative to {self.root} unless they are absolute.")
806 if transfer is None:
807 relpath = srcUri.relative_to(self.root)
808 if not relpath:
809 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
810 f"within datastore ({self.root})")
812 # Return the relative path within the datastore for internal
813 # transfer
814 path = relpath
816 return path
818 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
819 formatter: Union[Formatter, Type[Formatter]],
820 transfer: Optional[str] = None) -> StoredFileInfo:
821 """Relocate (if necessary) and extract `StoredFileInfo` from a
822 to-be-ingested file.
824 Parameters
825 ----------
826 path : `str` or `ButlerURI`
827 URI or path of a file to be ingested.
828 ref : `DatasetRef`
829 Reference for the dataset being ingested. Guaranteed to have
830 ``dataset_id not None``.
831 formatter : `type` or `Formatter`
832 `Formatter` subclass to use for this dataset or an instance.
833 transfer : `str`, optional
834 How (and whether) the dataset should be added to the datastore.
835 See `ingest` for details of transfer modes.
837 Returns
838 -------
839 info : `StoredFileInfo`
840 Internal datastore record for this file. This will be inserted by
841 the caller; `_extractIngestInfo` is only responsible for
842 creating and populating the struct.
844 Raises
845 ------
846 FileNotFoundError
847 Raised if one of the given files does not exist.
848 FileExistsError
849 Raised if transfer is not `None` but the (internal) location the
850 file would be moved to is already occupied.
851 """
852 if self._transaction is None: 852 ↛ 853 (condition on line 852 was never true)
853 raise RuntimeError("Ingest called without transaction enabled")
855 # Create URI of the source path, do not need to force a relative
856 # path to absolute.
857 srcUri = ButlerURI(path, forceAbsolute=False)
859 # Track whether we have read the size of the source yet
860 have_sized = False
862 tgtLocation: Optional[Location]
863 if transfer is None or transfer == "split":
864 # A relative path is assumed to be relative to the datastore
865 # in this context
866 if not srcUri.isabs():
867 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
868 else:
869 # Work out the path in the datastore from an absolute URI
870 # This is required to be within the datastore.
871 pathInStore = srcUri.relative_to(self.root)
872 if pathInStore is None and transfer is None: 872 ↛ 873 (condition on line 872 was never true)
873 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
874 f"not within datastore {self.root}")
875 if pathInStore: 875 ↛ 877 (condition on line 875 was never false)
876 tgtLocation = self.locationFactory.fromPath(pathInStore)
877 elif transfer == "split":
878 # Outside the datastore but treat that as a direct ingest
879 # instead.
880 tgtLocation = None
881 else:
882 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for"
883 f" URI {srcUri}")
884 elif transfer == "direct": 884 ↛ 889 (condition on line 884 was never true)
885 # Want to store the full URI to the resource directly in
886 # datastore. This is useful for referring to permanent archive
887 # storage for raw data.
888 # Trust that people know what they are doing.
889 tgtLocation = None
890 else:
891 # Work out the name we want this ingested file to have
892 # inside the datastore
893 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
894 if not tgtLocation.uri.dirname().exists():
895 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
896 tgtLocation.uri.dirname().mkdir()
898 # if we are transferring from a local file to a remote location
899 # it may be more efficient to get the size and checksum of the
900 # local file rather than the transferred one
901 if not srcUri.scheme or srcUri.scheme == "file": 901 ↛ 907 (condition on line 901 was never false)
902 size = srcUri.size()
903 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
904 have_sized = True
906 # transfer the resource to the destination
907 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
909 if tgtLocation is None: 909 ↛ 911 (condition on line 909 was never true)
910 # This means we are using direct mode
911 targetUri = srcUri
912 targetPath = str(srcUri)
913 else:
914 targetUri = tgtLocation.uri
915 targetPath = tgtLocation.pathInStore.path
917 # the file should exist in the datastore now
918 if not have_sized:
919 size = targetUri.size()
920 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
922 return StoredFileInfo(formatter=formatter, path=targetPath,
923 storageClass=ref.datasetType.storageClass,
924 component=ref.datasetType.component(),
925 file_size=size, checksum=checksum)
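# In summary: with transfer None or "split" the file stays where it is and the path is
# stored relative to the datastore root; "split" files outside the datastore and
# "direct" ingests store the full source URI instead; any other mode transfers the file
# to a template-derived name inside the datastore. Size and checksum come from the
# local source when available, otherwise from the transferred target.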
927 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
928 # Docstring inherited from Datastore._prepIngest.
929 filtered = []
930 for dataset in datasets:
931 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
932 if not acceptable:
933 continue
934 else:
935 dataset.refs = acceptable
936 if dataset.formatter is None:
937 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
938 else:
939 assert isinstance(dataset.formatter, (type, str))
940 dataset.formatter = getClassOf(dataset.formatter)
941 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
942 filtered.append(dataset)
943 return _IngestPrepData(filtered)
945 @transactional
946 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
947 # Docstring inherited from Datastore._finishIngest.
948 refsAndInfos = []
949 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
950 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
951 # Do ingest as if the first dataset ref is associated with the file
952 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
953 transfer=transfer)
954 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
955 self._register_datasets(refsAndInfos)
957 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
958 formatter: Union[Formatter, Type[Formatter]]) -> Location:
959 """Given a source URI and a DatasetRef, determine the name the
960 dataset will have inside datastore.
962 Parameters
963 ----------
964 srcUri : `ButlerURI`
965 URI to the source dataset file.
966 ref : `DatasetRef`
967 Ref associated with the newly-ingested dataset artifact. This
968 is used to determine the name within the datastore.
969 formatter : `Formatter` or `Formatter` class.
970 Formatter to use for validation. Can be a class or an instance.
972 Returns
973 -------
974 location : `Location`
975 Target location for the newly-ingested dataset.
976 """
977 # Ingesting a file from outside the datastore.
978 # This involves a new name.
979 template = self.templates.getTemplate(ref)
980 location = self.locationFactory.fromPath(template.format(ref))
982 # Get the extension
983 ext = srcUri.getExtension()
985 # Update the destination to include that extension
986 location.updateExtension(ext)
988 # Ask the formatter to validate this extension
989 formatter.validateExtension(location)
991 return location
993 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
994 """Write out in memory dataset to datastore.
996 Parameters
997 ----------
998 inMemoryDataset : `object`
999 Dataset to write to datastore.
1000 ref : `DatasetRef`
1001 Registry information associated with this dataset.
1003 Returns
1004 -------
1005 info : `StoredFileInfo`
1006 Information describing the artifact written to the datastore.
1007 """
1008 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1009 uri = location.uri
1011 if not uri.dirname().exists():
1012 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1013 uri.dirname().mkdir()
1015 if self._transaction is None: 1015 ↛ 1016 (condition on line 1015 was never true)
1016 raise RuntimeError("Attempting to write artifact without transaction enabled")
1018 def _removeFileExists(uri: ButlerURI) -> None:
1019 """Remove a file and do not complain if it is not there.
1021 This is important since a formatter might fail before the file
1022 is written and we should not confuse people by writing spurious
1023 error messages to the log.
1024 """
1025 try:
1026 uri.remove()
1027 except FileNotFoundError:
1028 pass
1030 # Register a callback to try to delete the uploaded data if
1031 # something fails below
1032 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1034 # For a local file, simply use the formatter directly
1035 if uri.isLocal:
1036 try:
1037 formatter.write(inMemoryDataset)
1038 except Exception as e:
1039 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} "
1040 f"to location {uri}") from e
1041 log.debug("Successfully wrote python object to local file at %s", uri)
1042 else:
1043 # This is a remote URI, so first try bytes and write directly else
1044 # fallback to a temporary file
1045 try:
1046 serializedDataset = formatter.toBytes(inMemoryDataset)
1047 except NotImplementedError: 1047 ↛ 1066 (line 1047 didn't jump to line 1066)
1048 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile:
1049 # Need to configure the formatter to write to a different
1050 # location and that needs us to overwrite internals
1051 tmpLocation = Location(*os.path.split(tmpFile.name))
1052 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri)
1053 with formatter._updateLocation(tmpLocation):
1054 try:
1055 formatter.write(inMemoryDataset)
1056 except Exception as e:
1057 raise RuntimeError(f"Failed to serialize dataset {ref} of type"
1058 f" {type(inMemoryDataset)} to "
1059 f"temporary location {tmpLocation.uri}") from e
1060 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True)
1062 # Cache if required
1063 self.cacheManager.move_to_cache(tmpLocation.uri, ref)
1065 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1066 except Exception as e:
1067 raise RuntimeError(f"Failed to serialize dataset {ref} to bytes.") from e
1068 else:
1069 log.debug("Writing bytes directly to %s", uri)
1070 uri.write(serializedDataset, overwrite=True)
1071 log.debug("Successfully wrote bytes directly to %s", uri)
1073 # URI is needed to resolve what ingest case we are dealing with
1074 return self._extractIngestInfo(uri, ref, formatter=formatter)
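# In summary: local URIs are written directly by the formatter; for remote URIs the
# dataset is first serialized with formatter.toBytes() and written in one call, falling
# back to a temporary local file that is transferred (and optionally cached) when the
# formatter cannot produce bytes.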
1076 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1077 ref: DatasetRef, isComponent: bool = False) -> Any:
1078 """Read the artifact from the datastore into an in-memory object.
1080 Parameters
1081 ----------
1082 getInfo : `DatastoreFileGetInformation`
1083 Information about the artifact within the datastore.
1084 ref : `DatasetRef`
1085 The registry information associated with this artifact.
1086 isComponent : `bool`
1087 Flag to indicate if a component is being read from this artifact.
1089 Returns
1090 -------
1091 inMemoryDataset : `object`
1092 The artifact as a python object.
1093 """
1094 location = getInfo.location
1095 uri = location.uri
1096 log.debug("Accessing data from %s", uri)
1098 # Cannot recalculate checksum but can compare size as a quick check
1099 # Do not do this if the size is negative since that indicates
1100 # we do not know.
1101 recorded_size = getInfo.info.file_size
1102 resource_size = uri.size()
1103 if recorded_size >= 0 and resource_size != recorded_size: 1103 ↛ 1104 (condition on line 1103 was never true)
1104 raise RuntimeError("Integrity failure in Datastore. "
1105 f"Size of file {uri} ({resource_size}) "
1106 f"does not match size recorded in registry of {recorded_size}")
1108 # For the general case we have choices for how to proceed.
1109 # 1. Always use a local file (downloading the remote resource to a
1110 # temporary file if needed).
1111 # 2. Use a threshold size and read into memory and use bytes.
1112 # Use both for now with an arbitrary hand off size.
1113 # This allows small datasets to be downloaded from remote object
1114 # stores without requiring a temporary file.
1116 formatter = getInfo.formatter
1117 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1118 if resource_size <= nbytes_max and formatter.can_read_bytes():
1119 serializedDataset = uri.read()
1120 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1121 f"component {getInfo.component}" if isComponent else "",
1122 len(serializedDataset), uri, formatter.name())
1123 try:
1124 result = formatter.fromBytes(serializedDataset,
1125 component=getInfo.component if isComponent else None)
1126 except Exception as e:
1127 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1128 f" ({ref.datasetType.name} from {uri}): {e}") from e
1129 else:
1130 # Read from file.
1132 # Have to update the Location associated with the formatter
1133 # because formatter.read does not allow an override.
1134 # This could be improved.
1135 location_updated = False
1136 msg = ""
1138 # First check in cache for local version.
1139 # The cache will only be relevant for remote resources.
1140 if not uri.isLocal:
1141 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension())
1142 if cached_file is not None: 1142 ↛ 1143 (condition on line 1142 was never true)
1143 msg = f"(via cache read of remote file {uri})"
1144 uri = cached_file
1145 location_updated = True
1147 with uri.as_local() as local_uri:
1149 # URI was remote and file was downloaded
1150 if uri != local_uri:
1151 cache_msg = ""
1152 location_updated = True
1154 # Cache the downloaded file if needed.
1155 cached_uri = self.cacheManager.move_to_cache(local_uri, ref)
1156 if cached_uri is not None: 1156 ↛ 1157 (condition on line 1156 was never true)
1157 local_uri = cached_uri
1158 cache_msg = " and cached"
1160 msg = f"(via download to local file{cache_msg})"
1162 # Calculate the (possibly) new location for the formatter
1163 # to use.
1164 newLocation = Location(*local_uri.split()) if location_updated else None
1166 log.debug("Reading%s from location %s %s with formatter %s",
1167 f" component {getInfo.component}" if isComponent else "",
1168 uri, msg, formatter.name())
1169 try:
1170 with formatter._updateLocation(newLocation):
1171 result = formatter.read(component=getInfo.component if isComponent else None)
1172 except Exception as e:
1173 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1174 f" ({ref.datasetType.name} from {uri}): {e}") from e
1176 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1177 isComponent=isComponent)
1179 def knows(self, ref: DatasetRef) -> bool:
1180 """Check if the dataset is known to the datastore.
1182 Does not check for existence of any artifact.
1184 Parameters
1185 ----------
1186 ref : `DatasetRef`
1187 Reference to the required dataset.
1189 Returns
1190 -------
1191 exists : `bool`
1192 `True` if the dataset is known to the datastore.
1193 """
1194 fileLocations = self._get_dataset_locations_info(ref)
1195 if fileLocations:
1196 return True
1197 return False
1199 def exists(self, ref: DatasetRef) -> bool:
1200 """Check if the dataset exists in the datastore.
1202 Parameters
1203 ----------
1204 ref : `DatasetRef`
1205 Reference to the required dataset.
1207 Returns
1208 -------
1209 exists : `bool`
1210 `True` if the entity exists in the `Datastore`.
1211 """
1212 fileLocations = self._get_dataset_locations_info(ref)
1214 # if we are being asked to trust that registry might not be correct
1215 # we ask for the expected locations and check them explicitly
1216 if not fileLocations:
1217 if not self.trustGetRequest:
1218 return False
1220 # When we are guessing a dataset location we can not check
1221 # for the existence of every component since we can not
1222 # know if every component was written. Instead we check
1223 # for the existence of any of the expected locations.
1224 for location, _ in self._get_expected_dataset_locations_info(ref): 1224 ↛ 1227 (loop on line 1224 didn't complete)
1225 if self._artifact_exists(location): 1225 ↛ 1224 (condition on line 1225 was never false)
1226 return True
1227 return False
1229 # All listed artifacts must exist.
1230 for location, _ in fileLocations:
1231 if not self._artifact_exists(location):
1232 return False
1234 return True
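# Note the contrast with knows() above: knows() only consults the datastore records,
# while exists() also checks the file artifacts. When trustGetRequest is set and no
# records are found, exists() accepts the presence of any one expected artifact, since
# it cannot tell which components were actually written.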
1236 def getURIs(self, ref: DatasetRef,
1237 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1238 """Return URIs associated with dataset.
1240 Parameters
1241 ----------
1242 ref : `DatasetRef`
1243 Reference to the required dataset.
1244 predict : `bool`, optional
1245 If the datastore does not know about the dataset, should it
1246 return a predicted URI or not?
1248 Returns
1249 -------
1250 primary : `ButlerURI`
1251 The URI to the primary artifact associated with this dataset.
1252 If the dataset was disassembled within the datastore this
1253 may be `None`.
1254 components : `dict`
1255 URIs to any components associated with the dataset artifact.
1256 Can be empty if there are no components.
1257 """
1259 primary: Optional[ButlerURI] = None
1260 components: Dict[str, ButlerURI] = {}
1262 # if this has never been written then we have to guess
1263 if not self.exists(ref):
1264 if not predict:
1265 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1267 doDisassembly = self.composites.shouldBeDisassembled(ref)
1269 if doDisassembly:
1271 for component, componentStorage in ref.datasetType.storageClass.components.items():
1272 compRef = ref.makeComponentRef(component)
1273 compLocation, _ = self._determine_put_formatter_location(compRef)
1275 # Add a URI fragment to indicate this is a guess
1276 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1278 else:
1280 location, _ = self._determine_put_formatter_location(ref)
1282 # Add a URI fragment to indicate this is a guess
1283 primary = ButlerURI(location.uri.geturl() + "#predicted")
1285 return primary, components
1287 # If this is a ref that we have written we can get the path.
1288 # Get file metadata and internal metadata
1289 fileLocations = self._get_dataset_locations_info(ref)
1291 guessing = False
1292 if not fileLocations:
1293 if not self.trustGetRequest: 1293 ↛ 1294 (condition on line 1293 was never true)
1294 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1295 fileLocations = self._get_expected_dataset_locations_info(ref)
1296 guessing = True
1298 if len(fileLocations) == 1:
1299 # No disassembly so this is the primary URI
1300 uri = fileLocations[0][0].uri
1301 if guessing and not uri.exists(): 1301 ↛ 1302 (condition on line 1301 was never true)
1302 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1303 primary = uri
1305 else:
1306 for location, storedFileInfo in fileLocations:
1307 if storedFileInfo.component is None: 1307 ↛ 1308 (condition on line 1307 was never true)
1308 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1309 uri = location.uri
1310 if guessing and not uri.exists(): 1310 ↛ 1314 (condition on line 1310 was never true)
1311 # If we are trusting then it is entirely possible for
1312 # some components to be missing. In that case we skip
1313 # to the next component.
1314 if self.trustGetRequest:
1315 continue
1316 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1317 components[storedFileInfo.component] = uri
1319 return primary, components
1321 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1322 """URI to the Dataset.
1324 Parameters
1325 ----------
1326 ref : `DatasetRef`
1327 Reference to the required Dataset.
1328 predict : `bool`
1329 If `True`, allow URIs to be returned of datasets that have not
1330 been written.
1332 Returns
1333 -------
1334 uri : `ButlerURI`
1335 URI pointing to the dataset within the datastore. If the
1336 dataset does not exist in the datastore, and if ``predict`` is
1337 `True`, the URI will be a prediction and will include a URI
1338 fragment "#predicted".
1339 If the datastore does not have entities that relate well
1340 to the concept of a URI the returned URI will be
1341 descriptive. The returned URI is not guaranteed to be obtainable.
1343 Raises
1344 ------
1345 FileNotFoundError
1346 Raised if a URI has been requested for a dataset that does not
1347 exist and guessing is not allowed.
1348 RuntimeError
1349 Raised if a request is made for a single URI but multiple URIs
1350 are associated with this dataset.
1352 Notes
1353 -----
1354 When a predicted URI is requested an attempt will be made to form
1355 a reasonable URI based on file templates and the expected formatter.
1356 """
1357 primary, components = self.getURIs(ref, predict)
1358 if primary is None or components: 1358 ↛ 1359 (condition on line 1358 was never true)
1359 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1360 "Use Datastore.getURIs() instead.")
1361 return primary
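# A hypothetical usage sketch (the "datastore" and "ref" names are assumed, not taken
# from this module):
#   uri = datastore.getURI(ref, predict=True)
# An unwritten dataset yields a predicted ButlerURI carrying the "#predicted" fragment
# added in getURIs(); a disassembled composite raises RuntimeError and needs getURIs().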
1363 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1364 destination: ButlerURI, transfer: str = "auto",
1365 preserve_path: bool = True,
1366 overwrite: bool = False) -> List[ButlerURI]:
1367 """Retrieve the file artifacts associated with the supplied refs.
1369 Parameters
1370 ----------
1371 refs : iterable of `DatasetRef`
1372 The datasets for which file artifacts are to be retrieved.
1373 A single ref can result in multiple files. The refs must
1374 be resolved.
1375 destination : `ButlerURI`
1376 Location to write the file artifacts.
1377 transfer : `str`, optional
1378 Method to use to transfer the artifacts. Must be one of the options
1379 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1380 preserve_path : `bool`, optional
1381 If `True` the full path of the file artifact within the datastore
1382 is preserved. If `False` the final file component of the path
1383 is used.
1384 overwrite : `bool`, optional
1385 If `True` allow transfers to overwrite existing files at the
1386 destination.
1388 Returns
1389 -------
1390 targets : `list` of `ButlerURI`
1391 URIs of file artifacts in destination location. Order is not
1392 preserved.
1393 """
1394 if not destination.isdir(): 1394 ↛ 1395 (condition on line 1394 was never true)
1395 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1397 if transfer == "move":
1398 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1400 # Source -> Destination
1401 # This also helps filter out duplicate DatasetRef in the request
1402 # that will map to the same underlying file transfer.
1403 to_transfer: Dict[ButlerURI, ButlerURI] = {}
1405 for ref in refs:
1406 locations = self._get_dataset_locations_info(ref)
1407 for location, _ in locations:
1408 source_uri = location.uri
1409 target_path: Union[str, ButlerURI]
1410 if preserve_path:
1411 target_path = location.pathInStore
1412 if target_path.isabs(): 1412 ↛ 1415 (condition on line 1412 was never true)
1413 # This is an absolute path to an external file.
1414 # Use the full path.
1415 target_path = target_path.relativeToPathRoot
1416 else:
1417 target_path = source_uri.basename()
1418 target_uri = destination.join(target_path)
1419 to_transfer[source_uri] = target_uri
1421 # In theory can now parallelize the transfer
1422 log.debug("Number of artifacts to transfer to %s: %d",
1423 str(destination), len(to_transfer))
1424 for source_uri, target_uri in to_transfer.items():
1425 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1427 return list(to_transfer.values())
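# A hypothetical call pattern (the refs and destination are assumed, not taken from
# this module):
#   targets = datastore.retrieveArtifacts(refs,
#                                         ButlerURI("export_dir/", forceDirectory=True),
#                                         transfer="copy", preserve_path=True)
# "move" is rejected above, and duplicate refs resolving to the same file collapse into
# a single transfer via the to_transfer mapping.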
1429 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1430 """Load an InMemoryDataset from the store.
1432 Parameters
1433 ----------
1434 ref : `DatasetRef`
1435 Reference to the required Dataset.
1436 parameters : `dict`
1437 `StorageClass`-specific parameters that specify, for example,
1438 a slice of the dataset to be loaded.
1440 Returns
1441 -------
1442 inMemoryDataset : `object`
1443 Requested dataset or slice thereof as an InMemoryDataset.
1445 Raises
1446 ------
1447 FileNotFoundError
1448 Requested dataset can not be retrieved.
1449 TypeError
1450 Return value from formatter has unexpected type.
1451 ValueError
1452 Formatter failed to process the dataset.
1453 """
1454 allGetInfo = self._prepare_for_get(ref, parameters)
1455 refComponent = ref.datasetType.component()
1457 # Supplied storage class for the component being read
1458 refStorageClass = ref.datasetType.storageClass
1460 # Create mapping from component name to related info
1461 allComponents = {i.component: i for i in allGetInfo}
1463 # By definition the dataset is disassembled if we have more
1464 # than one record for it.
1465 isDisassembled = len(allGetInfo) > 1
1467 # Look for the special case where we are disassembled but the
1468 # component is a derived component that was not written during
1469 # disassembly. For this scenario we need to check that the
1470 # component requested is listed as a derived component for the
1471 # composite storage class
1472 isDisassembledReadOnlyComponent = False
1473 if isDisassembled and refComponent:
1474 # The composite storage class should be accessible through
1475 # the component dataset type
1476 compositeStorageClass = ref.datasetType.parentStorageClass
1478 # In the unlikely scenario where the composite storage
1479 # class is not known, we can only assume that this is a
1480 # normal component. If that assumption is wrong then the
1481 # branch below that reads a persisted component will fail
1482 # so there is no need to complain here.
1483 if compositeStorageClass is not None: 1483 ↛ 1486 (condition on line 1483 was never false)
1484 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1486 if isDisassembled and not refComponent:
1487 # This was a disassembled dataset spread over multiple files
1488 # and we need to put them all back together again.
1489 # Read into memory and then assemble
1491 # Check that the supplied parameters are suitable for the type read
1492 refStorageClass.validateParameters(parameters)
1494 # We want to keep track of all the parameters that were not used
1495 # by formatters. We assume that if any of the component formatters
1496 # use a parameter that we do not need to apply it again in the
1497 # assembler.
1498 usedParams = set()
1500 components: Dict[str, Any] = {}
1501 for getInfo in allGetInfo:
1502 # assemblerParams are parameters not understood by the
1503 # associated formatter.
1504 usedParams.update(set(getInfo.formatterParams))
1506 component = getInfo.component
1508 if component is None: 1508 ↛ 1509 (condition on line 1508 was never true)
1509 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1511 # We do not want the formatter to think it's reading
1512 # a component though because it is really reading a
1513 # standalone dataset -- always tell reader it is not a
1514 # component.
1515 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)
1517 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1519 # Any unused parameters will have to be passed to the assembler
1520 if parameters:
1521 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1522 else:
1523 unusedParams = {}
1525 # Process parameters
1526 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1527 parameters=unusedParams)
1529 elif isDisassembledReadOnlyComponent:
1531 compositeStorageClass = ref.datasetType.parentStorageClass
1532 if compositeStorageClass is None: 1532 ↛ 1533line 1532 didn't jump to line 1533, because the condition on line 1532 was never true
1533 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1534 "no composite storage class is available.")
1536 if refComponent is None: 1536 ↛ 1538line 1536 didn't jump to line 1538, because the condition on line 1536 was never true
1537 # Mainly for mypy
1538 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1540 # Assume that every derived component can be calculated by
1541 # forwarding the request to a single read/write component.
1542 # Rather than guessing which rw component is the right one by
1543 # scanning each for a derived component of the same name,
1544 # we ask the storage class delegate directly which one is best to
1545 # use.
1546 compositeDelegate = compositeStorageClass.delegate()
1547 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1548 set(allComponents))
1550 # Select the relevant component
1551 rwInfo = allComponents[forwardedComponent]
1553 # For now assume that read parameters are validated against
1554 # the real component and not the requested component
1555 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1556 forwardedStorageClass.validateParameters(parameters)
1558 # Unfortunately the FileDescriptor inside the formatter will have
1559 # the wrong write storage class so we need to create a new one
1560 # given the immutability constraint.
1561 writeStorageClass = rwInfo.info.storageClass
1563 # We may need to put some thought into parameters for read
1564 # components but for now forward them on as is
1565 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1566 readStorageClass=refStorageClass,
1567 storageClass=writeStorageClass,
1568 parameters=parameters),
1569 ref.dataId)
1571 # The assembler can not receive any parameter requests for a
1572 # derived component at this time since the assembler will
1573 # see the storage class of the derived component and those
1574 # parameters will have to be handled by the formatter on the
1575 # forwarded storage class.
1576 assemblerParams: Dict[str, Any] = {}
1578 # Need to create a new info that specifies the derived
1579 # component and associated storage class
1580 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1581 rwInfo.info, assemblerParams, {},
1582 refComponent, refStorageClass)
1584 return self._read_artifact_into_memory(readInfo, ref, isComponent=True)
1586 else:
1587 # Single file request or component from that composite file
1588 for lookup in (refComponent, None): 1588 ↛ 1593line 1588 didn't jump to line 1593, because the loop on line 1588 didn't complete
1589 if lookup in allComponents: 1589 ↛ 1588line 1589 didn't jump to line 1588, because the condition on line 1589 was never false
1590 getInfo = allComponents[lookup]
1591 break
1592 else:
1593 raise FileNotFoundError(f"Component {refComponent} not found "
1594 f"for ref {ref} in datastore {self.name}")
1596 # A disassembled artifact is a complete file, so do not read it as a component
1597 if isDisassembled:
1598 isComponent = False
1599 else:
1600 isComponent = getInfo.component is not None
1602 # For a disassembled component we can validate parameters against
1603 # the component storage class directly
1604 if isDisassembled:
1605 refStorageClass.validateParameters(parameters)
1606 else:
1607 # For an assembled composite this could be a derived
1608 # component calculated from a real component. The validity
1609 # of the parameters is not clear. For now validate against
1610 # the composite storage class
1611 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1613 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent)
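# A minimal usage sketch for ``get`` (hypothetical caller code): it assumes
# a configured FileDatastore ``datastore`` and a resolved DatasetRef ``ref``
# obtained from a registry. The "bbox" parameter and the "wcs" component
# name are placeholders whose validity depends on the storage class in use.
#
#     # Full dataset.
#     exposure = datastore.get(ref)
#     # StorageClass-specific parameters request a slice of the dataset.
#     cutout = datastore.get(ref, parameters={"bbox": some_bbox})
#     # A single component is read via a component ref; the datastore
#     # works out whether it was stored disassembled or must be derived
#     # from a read/write component.
#     wcs = datastore.get(ref.makeComponentRef("wcs"))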
1615 @transactional
1616 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1617 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1619 Parameters
1620 ----------
1621 inMemoryDataset : `object`
1622 The dataset to store.
1623 ref : `DatasetRef`
1624 Reference to the associated Dataset.
1626 Raises
1627 ------
1628 TypeError
1629 Supplied object and storage class are inconsistent.
1630 DatasetTypeNotSupportedError
1631 The associated `DatasetType` is not handled by this datastore.
1633 Notes
1634 -----
1635 If the datastore is configured to reject certain dataset types it
1636 is possible that the put will fail and raise a
1637 `DatasetTypeNotSupportedError`. The main use case for this is to
1638 allow `ChainedDatastore` to put to multiple datastores without
1639 requiring that every datastore accepts the dataset.
1640 """
1642 doDisassembly = self.composites.shouldBeDisassembled(ref)
1643 # doDisassembly = True
1645 artifacts = []
1646 if doDisassembly:
1647 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1648 for component, componentInfo in components.items():
1649 # Don't recurse because we want to take advantage of
1650 # bulk insert -- need a new DatasetRef that refers to the
1651 # same dataset_id but has the component DatasetType.
1652 # DatasetType does not refer to the types of components,
1653 # so we construct one ourselves.
1654 compRef = ref.makeComponentRef(component)
1655 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1656 artifacts.append((compRef, storedInfo))
1657 else:
1658 # Write the entire thing out
1659 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1660 artifacts.append((ref, storedInfo))
1662 self._register_datasets(artifacts)
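# A minimal usage sketch for ``put`` (hypothetical caller code), assuming
# ``datastore``, an in-memory object ``exposure`` and a resolved ``ref``
# whose DatasetType matches the object's StorageClass. Whether the object
# is written as a single file or disassembled into per-component artifacts
# is decided by the configured composites map, not by the caller.
#
#     datastore.put(exposure, ref)
#     # A dataset type rejected by this datastore's configuration raises
#     # DatasetTypeNotSupportedError; ChainedDatastore relies on this to
#     # try the next datastore in its chain.
#     try:
#         datastore.put(exposure, unsupported_ref)
#     except DatasetTypeNotSupportedError:
#         pass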
1664 @transactional
1665 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
1666 # Get file metadata and internal metadata
1667 if not isinstance(ref, DatasetRef):
1668 log.debug("Doing multi-dataset trash in datastore %s", self.name)
1669 # Assumed to be an iterable of refs so bulk mode enabled.
1670 try:
1671 self.bridge.moveToTrash(ref)
1672 except Exception as e:
1673 if ignore_errors:
1674 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
1675 else:
1676 raise
1677 return
1679 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
1681 fileLocations = self._get_dataset_locations_info(ref)
1683 if not fileLocations:
1684 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1685 if ignore_errors: 1685 ↛ 1686line 1685 didn't jump to line 1686, because the condition on line 1685 was never true
1686 log.warning(err_msg)
1687 return
1688 else:
1689 raise FileNotFoundError(err_msg)
1691 for location, storedFileInfo in fileLocations:
1692 if not self._artifact_exists(location): 1692 ↛ 1693line 1692 didn't jump to line 1693, because the condition on line 1692 was never true
1693 err_msg = f"Dataset is known to datastore {self.name} but " \
1694 f"associated artifact ({location.uri}) is missing"
1695 if ignore_errors:
1696 log.warning(err_msg)
1697 return
1698 else:
1699 raise FileNotFoundError(err_msg)
1701 # Mark dataset as trashed
1702 try:
1703 self.bridge.moveToTrash([ref])
1704 except Exception as e:
1705 if ignore_errors:
1706 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
1707 f"but encountered an error: {e}")
1708 pass
1709 else:
1710 raise
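# A minimal usage sketch for ``trash`` (hypothetical caller code), assuming
# ``datastore`` and resolved refs. A single DatasetRef is trashed on its
# own; any other iterable of refs is passed to the bridge in one bulk call.
#
#     datastore.trash(ref)                 # single dataset
#     datastore.trash(refs_to_remove)      # bulk mode
#     # With ignore_errors=False an unknown dataset raises
#     # FileNotFoundError instead of only being logged.
#     datastore.trash(ref, ignore_errors=False)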
1712 @transactional
1713 def emptyTrash(self, ignore_errors: bool = True) -> None:
1714 """Remove all datasets from the trash.
1716 Parameters
1717 ----------
1718 ignore_errors : `bool`
1719 If `True` return without error even if something went wrong.
1720 Problems could occur if another process is simultaneously trying
1721 to delete.
1722 """
1723 log.debug("Emptying trash in datastore %s", self.name)
1725 # Context manager will empty trash iff we finish it without raising.
1726 # It will also automatically delete the relevant rows from the
1727 # trash table and the records table.
1728 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo,
1729 record_column="path") as trash_data:
1730 # Removing the artifacts themselves requires that the files are
1731 # not also associated with refs that are not to be trashed.
1732 # Therefore need to do a query with the file paths themselves
1733 # and return all the refs associated with them. Can only delete
1734 # a file if the refs to be trashed are the only refs associated
1735 # with the file.
1736 # This requires multiple copies of the trashed items
1737 trashed, artifacts_to_keep = trash_data
1739 if artifacts_to_keep is None:
1740 # The bridge is not helping us so have to work it out
1741 # ourselves. This is not going to be as efficient.
1742 trashed = list(trashed)
1744 # The instance check is for mypy since up to this point it
1745 # does not know the type of info.
1746 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed
1747 if isinstance(info, StoredFileInfo)])
1749 for ref, info in trashed:
1751 # Mypy needs to know this is not the base class
1752 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
1754 # Check for mypy
1755 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
1757 path_map[info.path].remove(ref.id)
1758 if not path_map[info.path]: 1758 ↛ 1749line 1758 didn't jump to line 1749, because the condition on line 1758 was never false
1759 del path_map[info.path]
1761 artifacts_to_keep = set(path_map)
1763 for ref, info in trashed:
1765 # Should not happen for this implementation but need
1766 # to keep mypy happy.
1767 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
1769 # Mypy needs to know this is not the base class
1770 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
1772 # Check for mypy
1773 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
1775 if info.path in artifacts_to_keep:
1776 # This is a multi-dataset artifact and we are not
1777 # removing all associated refs.
1778 continue
1780 # Only trashed refs still known to datastore will be returned.
1781 location = info.file_location(self.locationFactory)
1783 # Point of no return for this artifact
1784 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1785 try:
1786 self._delete_artifact(location)
1787 except FileNotFoundError:
1788 # If the file itself has been deleted there is nothing
1789 # we can do about it. It is possible that trash has
1790 # been run in parallel in another process or someone
1791 # decided to delete the file. It is unlikely to come
1792 # back and so we should still continue with the removal
1793 # of the entry from the trash table. It is also possible
1794 # we removed it in a previous iteration if it was
1795 # a multi-dataset artifact. The delete artifact method
1796 # will log a debug message in this scenario.
1797 # Distinguishing file missing before trash started and
1798 # file already removed previously as part of this trash
1799 # is not worth the distinction with regards to potential
1800 # memory cost.
1801 pass
1802 except Exception as e:
1803 if ignore_errors:
1804 # Use a debug message here even though it's not
1805 # a good situation. In some cases this can be
1806 # caused by a race between user A and user B
1807 # and neither of them has permissions for the
1808 # other's files. Butler does not know about users
1809 # and trash has no idea what collections these
1810 # files were in (without guessing from a path).
1811 log.debug("Encountered error removing artifact %s from datastore %s: %s",
1812 location.uri, self.name, e)
1813 else:
1814 raise
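# A minimal sketch of the two-phase removal flow (hypothetical caller
# code), assuming ``datastore`` and resolved ``refs``. ``trash`` only marks
# datasets as trashed; ``emptyTrash`` later deletes the artifacts, and a
# file shared by several refs is only removed once no surviving ref still
# uses it.
#
#     datastore.trash(refs)
#     # ...possibly much later, or from another process...
#     datastore.emptyTrash()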
1816 @transactional
1817 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef],
1818 local_refs: Optional[Iterable[DatasetRef]] = None,
1819 transfer: str = "auto") -> None:
1820 # Docstring inherited
1821 if type(self) is not type(source_datastore): 1821 ↛ 1822line 1821 didn't jump to line 1822, because the condition on line 1821 was never true
1822 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the "
1823 f"source datastore ({type(source_datastore)}).")
1825 # Be explicit for mypy
1826 if not isinstance(source_datastore, FileDatastore): 1826 ↛ 1827line 1826 didn't jump to line 1827, because the condition on line 1826 was never true
1827 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not"
1828 f" {type(source_datastore)}")
1830 # Stop early if "direct" transfer mode is requested. That would
1831 # require that the URI inside the source datastore should be stored
1832 # directly in the target datastore, which seems unlikely to be useful
1833 # since at any moment the source datastore could delete the file.
1834 if transfer in ("direct", "split"): 1834 ↛ 1835line 1834 didn't jump to line 1835, because the condition on line 1834 was never true
1835 raise ValueError("Can not transfer from a source datastore using direct mode since"
1836 " those files are controlled by the other datastore.")
1838 # We will go through the list multiple times so must convert
1839 # generators to lists.
1840 refs = list(refs)
1842 if local_refs is None: 1842 ↛ 1843line 1842 didn't jump to line 1843, because the condition on line 1842 was never true
1843 local_refs = refs
1844 else:
1845 local_refs = list(local_refs)
1847 # In order to handle disassembled composites the code works
1848 # at the records level since it can assume that internal APIs
1849 # can be used.
1850 # - If the record already exists in the destination this is assumed
1851 # to be okay.
1852 # - If there is no record but the source and destination URIs are
1853 # identical no transfer is done but the record is added.
1854 # - If the source record refers to an absolute URI currently assume
1855 # that that URI should remain absolute and will be visible to the
1856 # destination butler. May need to have a flag to indicate whether
1857 # the dataset should be transferred. This will only happen if
1858 # the detached Butler has had a local ingest.
1860 # What we really want is all the records in the source datastore
1861 # associated with these refs. Or derived ones if they don't exist
1862 # in the source.
1863 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
1865 # The source dataset_ids are the keys in these records
1866 source_ids = set(source_records)
1867 log.debug("Number of datastore records found in source: %d", len(source_ids))
1869 # The not None check is to appease mypy
1870 requested_ids = set(ref.id for ref in refs if ref.id is not None)
1871 missing_ids = requested_ids - source_ids
1873 # Missing IDs can be okay if that datastore has allowed
1874 # gets based on file existence. Should we transfer what we can
1875 # or complain about it and warn?
1876 if missing_ids and not source_datastore.trustGetRequest: 1876 ↛ 1877line 1876 didn't jump to line 1877, because the condition on line 1876 was never true
1877 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:"
1878 f" {missing_ids}")
1880 # Need to map these missing IDs to a DatasetRef so we can guess
1881 # the details.
1882 if missing_ids: 1882 ↛ 1883line 1882 didn't jump to line 1883, because the condition on line 1882 was never true
1883 log.info("Number of expected datasets missing from source datastore records: %d out of %d",
1884 len(missing_ids), len(requested_ids))
1885 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
1887 for missing in missing_ids:
1888 # Ask the source datastore where the missing artifacts
1889 # should be. An execution butler might not know about the
1890 # artifacts even if they are there.
1891 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
1893 # Not all components can be guaranteed to exist so this
1894 # list has to filter those by checking to see if the
1895 # artifact is really there.
1896 records = [info for location, info in expected if location.uri.exists()]
1897 if records:
1898 source_records[missing].extend(records)
1899 else:
1900 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.",
1901 id_to_ref[missing])
1903 # See if we already have these records
1904 target_records = self._get_stored_records_associated_with_refs(local_refs)
1906 # The artifacts to register
1907 artifacts = []
1909 # Refs that already exist
1910 already_present = []
1912 # Now can transfer the artifacts
1913 for source_ref, target_ref in zip(refs, local_refs):
1914 if target_ref.id in target_records: 1914 ↛ 1916line 1914 didn't jump to line 1916, because the condition on line 1914 was never true
1915 # Already have an artifact for this.
1916 already_present.append(target_ref)
1917 continue
1919 # mypy needs to know these are always resolved refs
1920 for info in source_records[source_ref.getCheckedId()]:
1921 source_location = info.file_location(source_datastore.locationFactory)
1922 target_location = info.file_location(self.locationFactory)
1923 if source_location == target_location: 1923 ↛ 1927line 1923 didn't jump to line 1927, because the condition on line 1923 was never true
1924 # Either the dataset is already in the target datastore
1925 # (which is how execution butler currently runs) or
1926 # it is an absolute URI.
1927 if source_location.pathInStore.isabs():
1928 # Just because we can see the artifact when running
1929 # the transfer doesn't mean it will be generally
1930 # accessible to a user of this butler. For now warn
1931 # but assume it will be accessible.
1932 log.warning("Transfer request for an outside-datastore artifact has been found at %s",
1933 source_location)
1934 else:
1935 # Need to transfer it to the new location.
1936 # Assume we should always overwrite. If the artifact
1937 # is there this might indicate that a previous transfer
1938 # was interrupted but was not able to be rolled back
1939 # completely (e.g. pre-emption) so follow the Datastore default
1940 # and overwrite.
1941 target_location.uri.transfer_from(source_location.uri, transfer=transfer,
1942 overwrite=True, transaction=self._transaction)
1944 artifacts.append((target_ref, info))
1946 self._register_datasets(artifacts)
1948 if already_present: 1948 ↛ 1949line 1948 didn't jump to line 1949, because the condition on line 1948 was never true
1949 n_skipped = len(already_present)
1950 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped,
1951 "" if n_skipped == 1 else "s")
1953 @transactional
1954 def forget(self, refs: Iterable[DatasetRef]) -> None:
1955 # Docstring inherited.
1956 refs = list(refs)
1957 self.bridge.forget(refs)
1958 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
1960 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
1961 logFailures: bool = False) -> None:
1962 """Validate some of the configuration for this datastore.
1964 Parameters
1965 ----------
1966 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
1967 Entities to test against this configuration. Can be differing
1968 types.
1969 logFailures : `bool`, optional
1970 If `True`, output a log message for every validation error
1971 detected.
1973 Raises
1974 ------
1975 DatastoreValidationError
1976 Raised if there is a validation problem with a configuration.
1977 All the problems are reported in a single exception.
1979 Notes
1980 -----
1981 This method checks that all the supplied entities have valid file
1982 templates and also have formatters defined.
1983 """
1985 templateFailed = None
1986 try:
1987 self.templates.validateTemplates(entities, logFailures=logFailures)
1988 except FileTemplateValidationError as e:
1989 templateFailed = str(e)
1991 formatterFailed = []
1992 for entity in entities:
1993 try:
1994 self.formatterFactory.getFormatterClass(entity)
1995 except KeyError as e:
1996 formatterFailed.append(str(e))
1997 if logFailures: 1997 ↛ 1992line 1997 didn't jump to line 1992, because the condition on line 1997 was never false
1998 log.critical("Formatter failure: %s", e)
2000 if templateFailed or formatterFailed:
2001 messages = []
2002 if templateFailed: 2002 ↛ 2003line 2002 didn't jump to line 2003, because the condition on line 2002 was never true
2003 messages.append(templateFailed)
2004 if formatterFailed: 2004 ↛ 2006line 2004 didn't jump to line 2006, because the condition on line 2004 was never false
2005 messages.append(",".join(formatterFailed))
2006 msg = ";\n".join(messages)
2007 raise DatastoreValidationError(msg)
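# A minimal usage sketch for ``validateConfiguration`` (hypothetical caller
# code), assuming ``datastore`` and an iterable of DatasetType, DatasetRef
# or StorageClass entities to check against the configured templates and
# formatters.
#
#     try:
#         datastore.validateConfiguration(dataset_types, logFailures=True)
#     except DatastoreValidationError as e:
#         # All template and formatter problems arrive in one exception.
#         print(e)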
2009 def getLookupKeys(self) -> Set[LookupKey]:
2010 # Docstring is inherited from base class
2011 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
2012 self.constraints.getLookupKeys()
2014 def validateKey(self, lookupKey: LookupKey,
2015 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2016 # Docstring is inherited from base class
2017 # The key can be valid in either formatters or templates so we can
2018 # only check the template if it exists
2019 if lookupKey in self.templates:
2020 try:
2021 self.templates[lookupKey].validateTemplate(entity)
2022 except FileTemplateValidationError as e:
2023 raise DatastoreValidationError(e) from e
2025 def export(self, refs: Iterable[DatasetRef], *,
2026 directory: Optional[Union[ButlerURI, str]] = None,
2027 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
2028 # Docstring inherited from Datastore.export.
2029 if transfer is not None and directory is None: 2029 ↛ 2030line 2029 didn't jump to line 2030, because the condition on line 2029 was never true
2030 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
2031 "export directory given")
2033 # Force the directory to be a URI object
2034 directoryUri: Optional[ButlerURI] = None
2035 if directory is not None: 2035 ↛ 2038line 2035 didn't jump to line 2038, because the condition on line 2035 was never false
2036 directoryUri = ButlerURI(directory, forceDirectory=True)
2038 if transfer is not None and directoryUri is not None: 2038 ↛ 2043line 2038 didn't jump to line 2043, because the condition on line 2038 was never false
2039 # mypy needs the second test
2040 if not directoryUri.exists(): 2040 ↛ 2041line 2040 didn't jump to line 2041, because the condition on line 2040 was never true
2041 raise FileNotFoundError(f"Export location {directory} does not exist")
2043 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2044 for ref in progress.wrap(refs, "Exporting dataset files"):
2045 fileLocations = self._get_dataset_locations_info(ref)
2046 if not fileLocations: 2046 ↛ 2047line 2046 didn't jump to line 2047, because the condition on line 2046 was never true
2047 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2048 # For now we can not export disassembled datasets
2049 if len(fileLocations) > 1: 2049 ↛ 2050line 2049 didn't jump to line 2050, because the condition on line 2049 was never true
2050 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2051 location, storedFileInfo = fileLocations[0]
2053 pathInStore = location.pathInStore.path
2054 if transfer is None: 2054 ↛ 2057line 2054 didn't jump to line 2057, because the condition on line 2054 was never true
2055 # TODO: do we also need to return the readStorageClass somehow?
2056 # We will use the path in store directly
2057 pass
2058 elif transfer == "direct": 2058 ↛ 2060line 2058 didn't jump to line 2060, because the condition on line 2058 was never true
2059 # Use full URIs to the remote store in the export
2060 pathInStore = str(location.uri)
2061 else:
2062 # mypy needs help
2063 assert directoryUri is not None, "directoryUri must be defined to get here"
2064 storeUri = ButlerURI(location.uri)
2066 # if the datastore has an absolute URI to a resource, we
2067 # have two options:
2068 # 1. Keep the absolute URI in the exported YAML
2069 # 2. Allocate a new name in the local datastore and transfer
2070 # it.
2071 # For now go with option 2
2072 if location.pathInStore.isabs(): 2072 ↛ 2073line 2072 didn't jump to line 2073, because the condition on line 2072 was never true
2073 template = self.templates.getTemplate(ref)
2074 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
2075 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2077 exportUri = directoryUri.join(pathInStore)
2078 exportUri.transfer_from(storeUri, transfer=transfer)
2080 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
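# A minimal usage sketch for ``export`` (hypothetical caller code),
# assuming ``datastore``, resolved ``refs`` and an existing export
# directory. Each yielded FileDataset records where the artifact was
# written and which formatter produced it.
#
#     export_dir = ButlerURI("export_dir/", forceDirectory=True)
#     for dataset in datastore.export(refs, directory=export_dir,
#                                     transfer="auto"):
#         print(dataset.path, dataset.formatter)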
2082 @staticmethod
2083 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
2084 """Compute the checksum of the supplied file.
2086 Parameters
2087 ----------
2088 uri : `ButlerURI`
2089 Name of resource to calculate checksum from.
2090 algorithm : `str`, optional
2091 Name of algorithm to use. Must be one of the algorithms supported
2092 by :py:class:`hashlib`.
2093 block_size : `int`, optional
2094 Number of bytes to read from file at one time.
2096 Returns
2097 -------
2098 hexdigest : `str`
2099 Hex digest of the file.
2101 Notes
2102 -----
2103 Currently returns None if the URI is for a remote resource.
2104 """
2105 if algorithm not in hashlib.algorithms_guaranteed: 2105 ↛ 2106line 2105 didn't jump to line 2106, because the condition on line 2105 was never true
2106 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2108 if not uri.isLocal: 2108 ↛ 2109line 2108 didn't jump to line 2109, because the condition on line 2108 was never true
2109 return None
2111 hasher = hashlib.new(algorithm)
2113 with uri.as_local() as local_uri:
2114 with open(local_uri.ospath, "rb") as f:
2115 for chunk in iter(lambda: f.read(block_size), b""):
2116 hasher.update(chunk)
2118 return hasher.hexdigest()
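# A minimal usage sketch for ``computeChecksum`` (hypothetical caller code;
# the file name is a placeholder). It is a static method, so no datastore
# instance is needed, and non-local URIs currently return None.
#
#     digest = FileDatastore.computeChecksum(ButlerURI("data/raw.fits"),
#                                            algorithm="sha256",
#                                            block_size=1 << 20)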
2120 def needs_expanded_data_ids(
2121 self,
2122 transfer: Optional[str],
2123 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2124 ) -> bool:
2125 # Docstring inherited.
2126 # This _could_ also use entity to inspect whether the filename template
2127 # involves placeholders other than the required dimensions for its
2128 # dataset type, but that's not necessary for correctness; it just
2129 # enables more optimizations (perhaps only in theory).
2130 return transfer not in ("direct", None)
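# A minimal sketch of how a caller might use this (hypothetical code):
# ingest-like code can skip the potentially expensive data-ID expansion
# when the datastore will not need expanded IDs to fill in file templates,
# i.e. for transfer=None or "direct".
#
#     if datastore.needs_expanded_data_ids(transfer):
#         refs = expand_data_ids(refs)  # hypothetical expansion step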