Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 78%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
30from sqlalchemy import BigInteger, String
32from collections import defaultdict
33from dataclasses import dataclass
34from typing import (
35 TYPE_CHECKING,
36 Any,
37 ClassVar,
38 Dict,
39 Iterable,
40 List,
41 Mapping,
42 Optional,
43 Set,
44 Tuple,
45 Type,
46 Union,
47)
49from lsst.daf.butler import (
50 ButlerURI,
51 CompositesMap,
52 Config,
53 FileDataset,
54 DatasetId,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreCacheManager,
60 DatastoreDisabledCacheManager,
61 DatastoreConfig,
62 DatastoreValidationError,
63 FileDescriptor,
64 FileTemplates,
65 FileTemplateValidationError,
66 Formatter,
67 FormatterFactory,
68 Location,
69 LocationFactory,
70 Progress,
71 StorageClass,
72 StoredFileInfo,
73)
75from lsst.daf.butler import ddl
76from lsst.daf.butler.registry.interfaces import (
77 ReadOnlyDatabaseError,
78 DatastoreRegistryBridge,
79)
81from lsst.daf.butler.core.repoRelocation import replaceRoot
82from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional, time_this
83from .genericDatastore import GenericBaseDatastore
85if TYPE_CHECKING:  # 85 ↛ 86: condition on line 85 was never true
86 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager
87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
89log = logging.getLogger(__name__)
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
100 def __init__(self, datasets: List[FileDataset]):
101 super().__init__(ref for dataset in datasets for ref in dataset.refs)
102 self.datasets = datasets
105@dataclass(frozen=True)
106class DatastoreFileGetInformation:
107 """Collection of useful parameters needed to retrieve a file from
108 a Datastore.
109 """
111 location: Location
112 """The location from which to read the dataset."""
114 formatter: Formatter
115 """The `Formatter` to use to deserialize the dataset."""
117 info: StoredFileInfo
118 """Stored information about this file and its formatter."""
120 assemblerParams: Dict[str, Any]
121 """Parameters to use for post-processing the retrieved dataset."""
123 formatterParams: Dict[str, Any]
124 """Parameters that were understood by the associated formatter."""
126 component: Optional[str]
127 """The component to be retrieved (can be `None`)."""
129 readStorageClass: StorageClass
130 """The `StorageClass` of the dataset being read."""
133class FileDatastore(GenericBaseDatastore):
134 """Generic Datastore for file-based implementations.
136 Should always be sub-classed since key abstract methods are missing.
138 Parameters
139 ----------
140 config : `DatastoreConfig` or `str`
141 Configuration as either a `Config` object or URI to file.
142 bridgeManager : `DatastoreRegistryBridgeManager`
143 Object that manages the interface between `Registry` and datastores.
144 butlerRoot : `str`, optional
145 New datastore root to use to override the configuration value.
147 Raises
148 ------
149 ValueError
150 If root location does not exist and ``create`` is `False` in the
151 configuration.
152 """
154 defaultConfigFile: ClassVar[Optional[str]] = None
155 """Path to configuration defaults. Accessed within the ``config`` resource
156 or relative to a search path. Can be None if no defaults specified.
157 """
159 root: ButlerURI
160 """Root directory URI of this `Datastore`."""
162 locationFactory: LocationFactory
163 """Factory for creating locations relative to the datastore root."""
165 formatterFactory: FormatterFactory
166 """Factory for creating instances of formatters."""
168 templates: FileTemplates
169 """File templates that can be used by this `Datastore`."""
171 composites: CompositesMap
172 """Determines whether a dataset should be disassembled on put."""
174 defaultConfigFile = "datastores/fileDatastore.yaml"
175 """Path to configuration defaults. Accessed within the ``config`` resource
176 or relative to a search path. Can be None if no defaults specified.
177 """
179 @classmethod
180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
181 """Set any filesystem-dependent config options for this Datastore to
182 be appropriate for a new empty repository with the given root.
184 Parameters
185 ----------
186 root : `str`
187 URI to the root of the data repository.
188 config : `Config`
189 A `Config` to update. Only the subset understood by
190 this component will be updated. Will not expand
191 defaults.
192 full : `Config`
193 A complete config with all defaults expanded that can be
194 converted to a `DatastoreConfig`. Read-only and will not be
195 modified by this method.
196 Repository-specific options that should not be obtained
197 from defaults when Butler instances are constructed
198 should be copied from ``full`` to ``config``.
199 overwrite : `bool`, optional
200 If `False`, do not modify a value in ``config`` if the value
201 already exists. Default is always to overwrite with the provided
202 ``root``.
204 Notes
205 -----
206 If a keyword is explicitly defined in the supplied ``config`` it
207 will not be overridden by this method if ``overwrite`` is `False`.
208 This allows explicit values set in external configs to be retained.
209 """
210 Config.updateParameters(DatastoreConfig, config, full,
211 toUpdate={"root": root},
212 toCopy=("cls", ("records", "table")), overwrite=overwrite)
214 @classmethod
215 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
216 return ddl.TableSpec(
217 fields=[
218 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
219 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
220 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
221 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
222 # Use empty string to indicate no component
223 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
224 # TODO: should checksum be Base64Bytes instead?
225 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
226 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
227 ],
228 unique=frozenset(),
229 indexes=[tuple(["path"])],
230 )
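# A rough sketch of the table this spec produces, assuming an integer
# dataset_id column type (the real type is supplied by the registry bridge
# manager); column names and lengths mirror the FieldSpecs above:
#
#     dataset_id (PK) | component (String 32, PK) | path (String 256)
#     | formatter (String 128) | storage_class (String 64)
#     | checksum (String 128, nullable) | file_size (BigInteger, nullable)
#
# The composite primary key (dataset_id, component) allows one row per
# component when a dataset is disassembled on put, and the index on "path"
# supports the reverse lookup used by _refs_associated_with_artifacts.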
232 def __init__(self, config: Union[DatastoreConfig, str],
233 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
234 super().__init__(config, bridgeManager)
235 if "root" not in self.config: 235 ↛ 236line 235 didn't jump to line 236, because the condition on line 235 was never true
236 raise ValueError("No root directory specified in configuration")
238 # Name ourselves either using an explicit name or a name
239 # derived from the (unexpanded) root
240 if "name" in self.config:
241 self.name = self.config["name"]
242 else:
243 # We use the unexpanded root in the name to indicate that this
244 # datastore can be moved without having to update registry.
245 self.name = "{}@{}".format(type(self).__name__,
246 self.config["root"])
248 # Support repository relocation in config
249 # Existence of self.root is checked in subclass
250 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
251 forceDirectory=True, forceAbsolute=True)
253 self.locationFactory = LocationFactory(self.root)
254 self.formatterFactory = FormatterFactory()
256 # Now associate formatters with storage classes
257 self.formatterFactory.registerFormatters(self.config["formatters"],
258 universe=bridgeManager.universe)
260 # Read the file naming templates
261 self.templates = FileTemplates(self.config["templates"],
262 universe=bridgeManager.universe)
264 # See if composites should be disassembled
265 self.composites = CompositesMap(self.config["composites"],
266 universe=bridgeManager.universe)
268 tableName = self.config["records", "table"]
269 try:
270 # Storage of paths and formatters, keyed by dataset_id
271 self._table = bridgeManager.opaque.register(
272 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType))
273 # Interface to Registry.
274 self._bridge = bridgeManager.register(self.name)
275 except ReadOnlyDatabaseError:
276 # If the database is read only and we just tried and failed to
277 # create a table, it means someone is trying to create a read-only
278 # butler client for an empty repo. That should be okay, as long
279 # as they then try to get any datasets before some other client
280 creates the table. Chances are they're just validating
281 # configuration.
282 pass
284 # Determine whether checksums should be used - default to False
285 self.useChecksum = self.config.get("checksum", False)
287 # Determine whether we can fall back to configuration if a
288 # requested dataset is not known to registry
289 self.trustGetRequest = self.config.get("trust_get_request", False)
291 # Create a cache manager
292 self.cacheManager: AbstractDatastoreCacheManager
293 if "cached" in self.config: 293 ↛ 297line 293 didn't jump to line 297, because the condition on line 293 was never false
294 self.cacheManager = DatastoreCacheManager(self.config["cached"],
295 universe=bridgeManager.universe)
296 else:
297 self.cacheManager = DatastoreDisabledCacheManager("",
298 universe=bridgeManager.universe)
300 # Check existence and create directory structure if necessary
301 if not self.root.exists():
302 if "create" not in self.config or not self.config["create"]: 302 ↛ 303line 302 didn't jump to line 303, because the condition on line 302 was never true
303 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
304 try:
305 self.root.mkdir()
306 except Exception as e:
307 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
308 f" Got error: {e}") from e
310 def __str__(self) -> str:
311 return str(self.root)
313 @property
314 def bridge(self) -> DatastoreRegistryBridge:
315 return self._bridge
317 def _artifact_exists(self, location: Location) -> bool:
318 """Check that an artifact exists in this datastore at the specified
319 location.
321 Parameters
322 ----------
323 location : `Location`
324 Expected location of the artifact associated with this datastore.
326 Returns
327 -------
328 exists : `bool`
329 `True` if the location can be found, `False` otherwise.
330 """
331 log.debug("Checking if resource exists: %s", location.uri)
332 return location.uri.exists()
334 def _delete_artifact(self, location: Location) -> None:
335 """Delete the artifact from the datastore.
337 Parameters
338 ----------
339 location : `Location`
340 Location of the artifact associated with this datastore.
341 """
342 if location.pathInStore.isabs():  # 342 ↛ 343: condition on line 342 was never true
343 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
345 try:
346 location.uri.remove()
347 except FileNotFoundError:
348 log.debug("File %s did not exist and so could not be deleted.", location.uri)
349 raise
350 except Exception as e:
351 log.critical("Failed to delete file: %s (%s)", location.uri, e)
352 raise
353 log.debug("Successfully deleted file: %s", location.uri)
355 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
356 # Docstring inherited from GenericBaseDatastore
357 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
358 self._table.insert(*records)
360 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
361 # Docstring inherited from GenericBaseDatastore
363 # Look for the dataset_id -- there might be multiple matches
364 # if we have disassembled the dataset.
365 records = self._table.fetch(dataset_id=ref.id)
366 return [StoredFileInfo.from_record(record) for record in records]
368 def _get_stored_records_associated_with_refs(self,
369 refs: Iterable[DatasetIdRef]
370 ) -> Dict[DatasetId, List[StoredFileInfo]]:
371 """Retrieve all records associated with the provided refs.
373 Parameters
374 ----------
375 refs : iterable of `DatasetIdRef`
376 The refs for which records are to be retrieved.
378 Returns
379 -------
380 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
381 The matching records indexed by the ref ID. The number of entries
382 in the dict can be smaller than the number of requested refs.
383 """
384 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
386 # Uniqueness is dataset_id + component so can have multiple records
387 # per ref.
388 records_by_ref = defaultdict(list)
389 for record in records:
390 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
391 return records_by_ref
393 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str,
394 Set[DatasetId]]:
395 """Return paths and associated dataset refs.
397 Parameters
398 ----------
399 paths : `list` of `str` or `ButlerURI`
400 All the paths to include in search.
402 Returns
403 -------
404 mapping : `dict` of [`str`, `set` [`DatasetId`]]
405 Mapping of each path to a set of associated database IDs.
406 """
407 records = self._table.fetch(path=[str(path) for path in paths])
408 result = defaultdict(set)
409 for row in records:
410 result[row["path"]].add(row["dataset_id"])
411 return result
413 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]:
414 """Return all dataset refs associated with the supplied path.
416 Parameters
417 ----------
418 pathInStore : `ButlerURI`
419 Path of interest in the data store.
421 Returns
422 -------
423 ids : `set` of `DatasetId`
424 All `DatasetRef` IDs associated with this path.
425 """
426 records = list(self._table.fetch(path=str(pathInStore)))
427 ids = {r["dataset_id"] for r in records}
428 return ids
430 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
431 # Docstring inherited from GenericBaseDatastore
432 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
434 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
435 r"""Find all the `Location`\ s of the requested dataset in the
436 `Datastore` and the associated stored file information.
438 Parameters
439 ----------
440 ref : `DatasetRef`
441 Reference to the required `Dataset`.
443 Returns
444 -------
445 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
446 Location of the dataset within the datastore and
447 stored information about each file and its formatter.
448 """
449 # Get the file information (this will fail if no file)
450 records = self.getStoredItemsInfo(ref)
452 # Use the path to determine the location -- we need to take
453 # into account absolute URIs in the datastore record
454 return [(r.file_location(self.locationFactory), r) for r in records]
456 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
457 """Check that there is only one dataset associated with the
458 specified artifact.
460 Parameters
461 ----------
462 ref : `DatasetRef` or `FakeDatasetRef`
463 Dataset to be removed.
464 location : `Location`
465 The location of the artifact to be removed.
467 Returns
468 -------
469 can_remove : `bool`
470 True if the artifact can be safely removed.
471 """
472 # Can't ever delete absolute URIs.
473 if location.pathInStore.isabs():
474 return False
476 # Get all entries associated with this path
477 allRefs = self._registered_refs_per_artifact(location.pathInStore)
478 if not allRefs:
479 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
481 # Remove these refs from all the refs and if there is nothing left
482 # then we can delete
483 remainingRefs = allRefs - {ref.id}
485 if remainingRefs:
486 return False
487 return True
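# Worked example of the check above (hypothetical IDs): if the artifact at
# this path is shared by dataset IDs {1, 2} and we are removing ref.id == 1,
# remainingRefs == {2}, so the file must be kept; only when the removed ref
# was the sole remaining reference can the artifact itself be deleted.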
489 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
490 StoredFileInfo]]:
491 """Predict the location and related file information of the requested
492 dataset in this datastore.
494 Parameters
495 ----------
496 ref : `DatasetRef`
497 Reference to the required `Dataset`.
499 Returns
500 -------
501 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
502 Expected Location of the dataset within the datastore and
503 placeholder information about each file and its formatter.
505 Notes
506 -----
507 Uses the current configuration to determine how we would expect the
508 datastore files to have been written if we couldn't ask registry.
509 This is safe so long as there has been no change to datastore
510 configuration between writing the dataset and wanting to read it.
511 Will not work for files that have been ingested without using the
512 standard file template or default formatter.
513 """
515 # If we have a component ref we always need to ask the questions
516 # of the composite. If the composite is disassembled this routine
517 # should return all components. If the composite was not
518 # disassembled the composite is what is stored regardless of
519 # component request. Note that if the caller has disassembled
520 # a composite there is no way for this guess to know that
521 # without trying both the composite and component ref and seeing
522 # if there is something at the component Location even without
523 # disassembly being enabled.
524 if ref.datasetType.isComponent():
525 ref = ref.makeCompositeRef()
527 # See if the ref is a composite that should be disassembled
528 doDisassembly = self.composites.shouldBeDisassembled(ref)
530 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
532 if doDisassembly:
533 for component, componentStorage in ref.datasetType.storageClass.components.items():
534 compRef = ref.makeComponentRef(component)
535 location, formatter = self._determine_put_formatter_location(compRef)
536 all_info.append((location, formatter, componentStorage, component))
538 else:
539 # Always use the composite ref if no disassembly
540 location, formatter = self._determine_put_formatter_location(ref)
541 all_info.append((location, formatter, ref.datasetType.storageClass, None))
543 # Convert the list of tuples to have StoredFileInfo as second element
544 return [(location, StoredFileInfo(formatter=formatter,
545 path=location.pathInStore.path,
546 storageClass=storageClass,
547 component=component,
548 checksum=None,
549 file_size=-1))
550 for location, formatter, storageClass, component in all_info]
552 def _prepare_for_get(self, ref: DatasetRef,
553 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
554 """Check parameters for ``get`` and obtain formatter and
555 location.
557 Parameters
558 ----------
559 ref : `DatasetRef`
560 Reference to the required Dataset.
561 parameters : `dict`
562 `StorageClass`-specific parameters that specify, for example,
563 a slice of the dataset to be loaded.
565 Returns
566 -------
567 getInfo : `list` [`DatastoreFileGetInformation`]
568 Parameters needed to retrieve each file.
569 """
570 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
572 # Get file metadata and internal metadata
573 fileLocations = self._get_dataset_locations_info(ref)
574 if not fileLocations:
575 if not self.trustGetRequest:
576 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
577 # Assume the dataset is where we think it should be
578 fileLocations = self._get_expected_dataset_locations_info(ref)
580 # The storage class we want to use eventually
581 refStorageClass = ref.datasetType.storageClass
583 if len(fileLocations) > 1:
584 disassembled = True
586 # If trust is involved it is possible that there will be
587 # components listed here that do not exist in the datastore.
588 # Explicitly check for file artifact existence and filter out any
589 # that are missing.
590 if self.trustGetRequest:
591 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
593 # For now complain only if we have no components at all. One
594 # component is probably a problem but we can punt that to the
595 # assembler.
596 if not fileLocations:  # 596 ↛ 597: condition on line 596 was never true
597 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
599 else:
600 disassembled = False
602 # Is this a component request?
603 refComponent = ref.datasetType.component()
605 fileGetInfo = []
606 for location, storedFileInfo in fileLocations:
608 # The storage class used to write the file
609 writeStorageClass = storedFileInfo.storageClass
611 # If this has been disassembled we need read to match the write
612 if disassembled:
613 readStorageClass = writeStorageClass
614 else:
615 readStorageClass = refStorageClass
617 formatter = getInstanceOf(storedFileInfo.formatter,
618 FileDescriptor(location, readStorageClass=readStorageClass,
619 storageClass=writeStorageClass, parameters=parameters),
620 ref.dataId)
622 formatterParams, notFormatterParams = formatter.segregateParameters()
624 # Of the remaining parameters, extract the ones supported by
625 # this StorageClass (for components not all will be handled)
626 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
628 # The ref itself could be a component if the dataset was
629 # disassembled by butler, or we disassembled in datastore and
630 # components came from the datastore records
631 component = storedFileInfo.component if storedFileInfo.component else refComponent
633 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
634 assemblerParams, formatterParams,
635 component, readStorageClass))
637 return fileGetInfo
639 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
640 """Check the arguments for ``put`` and obtain formatter and
641 location.
643 Parameters
644 ----------
645 inMemoryDataset : `object`
646 The dataset to store.
647 ref : `DatasetRef`
648 Reference to the associated Dataset.
650 Returns
651 -------
652 location : `Location`
653 The location to write the dataset.
654 formatter : `Formatter`
655 The `Formatter` to use to write the dataset.
657 Raises
658 ------
659 TypeError
660 Supplied object and storage class are inconsistent.
661 DatasetTypeNotSupportedError
662 The associated `DatasetType` is not handled by this datastore.
663 """
664 self._validate_put_parameters(inMemoryDataset, ref)
665 return self._determine_put_formatter_location(ref)
667 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
668 """Calculate the formatter and output location to use for put.
670 Parameters
671 ----------
672 ref : `DatasetRef`
673 Reference to the associated Dataset.
675 Returns
676 -------
677 location : `Location`
678 The location to write the dataset.
679 formatter : `Formatter`
680 The `Formatter` to use to write the dataset.
681 """
682 # Work out output file name
683 try:
684 template = self.templates.getTemplate(ref)
685 except KeyError as e:
686 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
688 # Validate the template to protect against filenames from different
689 # dataIds returning the same and causing overwrite confusion.
690 template.validateTemplate(ref)
692 location = self.locationFactory.fromPath(template.format(ref))
694 # Get the formatter based on the storage class
695 storageClass = ref.datasetType.storageClass
696 try:
697 formatter = self.formatterFactory.getFormatter(ref,
698 FileDescriptor(location,
699 storageClass=storageClass),
700 ref.dataId)
701 except KeyError as e:
702 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
703 f"{self.name}") from e
705 # Now that we know the formatter, update the location
706 location = formatter.makeUpdatedLocation(location)
708 return location, formatter
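# Illustrative example with a hypothetical template and dataId: a template
# such as "{run}/{datasetType}/{exposure}" formatted for a "raw" dataset in
# run "u/someone/run1" with exposure 1234 gives "u/someone/run1/raw/1234";
# formatter.makeUpdatedLocation() then appends the formatter's preferred
# file extension (e.g. ".fits") to produce the final Location.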
710 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
711 # Docstring inherited from base class
712 if transfer != "auto":
713 return transfer
715 # See if the paths are within the datastore or not
716 inside = [self._pathInStore(d.path) is not None for d in datasets]
718 if all(inside):
719 transfer = None
720 elif not any(inside):  # 720 ↛ 729: condition on line 720 was never false
721 # Allow ButlerURI to use its own knowledge
722 transfer = "auto"
723 else:
724 # This can happen when importing from a datastore that
725 # has had some datasets ingested using "direct" mode.
726 # Also allow ButlerURI to sort it out but warn about it.
729 log.warning("Some datasets are inside the datastore and some are outside. Using 'split' "
730 "transfer mode. This assumes that the files outside the datastore are "
731 "still accessible to the new butler since they will not be copied into "
732 "the target datastore.")
733 transfer = "split"
735 return transfer
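# Summary of the "auto" resolution above: every path already inside the
# datastore -> transfer=None (register in place); every path outside ->
# keep "auto" and let ButlerURI choose a concrete mode; a mixture -> use
# "split", which registers internal files in place and records external
# files by their absolute URI without copying them.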
737 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
738 """Return path relative to datastore root
740 Parameters
741 ----------
742 path : `str` or `ButlerURI`
743 Path to dataset. Can be an absolute URI. If relative, it is
744 assumed to be relative to the datastore root; `None` is returned
745 if the path is outside the datastore.
747 Returns
748 -------
749 inStore : `str`
750 Path relative to datastore root. Returns `None` if the file is
751 outside the root.
752 """
753 # Relative path will always be relative to datastore
754 pathUri = ButlerURI(path, forceAbsolute=False)
755 return pathUri.relative_to(self.root)
757 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
758 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
759 """Standardize the path of a to-be-ingested file.
761 Parameters
762 ----------
763 path : `str` or `ButlerURI`
764 Path of a file to be ingested.
765 transfer : `str`, optional
766 How (and whether) the dataset should be added to the datastore.
767 See `ingest` for details of transfer modes.
768 This implementation is provided only so
769 `NotImplementedError` can be raised if the mode is not supported;
770 actual transfers are deferred to `_extractIngestInfo`.
772 Returns
773 -------
774 path : `str` or `ButlerURI`
775 New path in what the datastore considers standard form. If an
776 absolute URI was given that will be returned unchanged.
778 Notes
779 -----
780 Subclasses of `FileDatastore` can implement this method instead
781 of `_prepIngest`. It should not modify the data repository or given
782 file in any way.
784 Raises
785 ------
786 NotImplementedError
787 Raised if the datastore does not support the given transfer mode
788 (including the case where ingest is not supported at all).
789 FileNotFoundError
790 Raised if one of the given files does not exist.
791 """
792 if transfer not in (None, "direct", "split") + self.root.transferModes:  # 792 ↛ 793: condition on line 792 was never true
793 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
795 # A relative URI indicates relative to datastore root
796 srcUri = ButlerURI(path, forceAbsolute=False)
797 if not srcUri.isabs():
798 srcUri = self.root.join(path)
800 if not srcUri.exists():
801 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
802 f"are assumed to be relative to {self.root} unless they are absolute.")
804 if transfer is None:
805 relpath = srcUri.relative_to(self.root)
806 if not relpath:
807 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
808 f"within datastore ({self.root})")
810 # Return the relative path within the datastore for internal
811 # transfer
812 path = relpath
814 return path
816 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
817 formatter: Union[Formatter, Type[Formatter]],
818 transfer: Optional[str] = None) -> StoredFileInfo:
819 """Relocate (if necessary) and extract `StoredFileInfo` from a
820 to-be-ingested file.
822 Parameters
823 ----------
824 path : `str` or `ButlerURI`
825 URI or path of a file to be ingested.
826 ref : `DatasetRef`
827 Reference for the dataset being ingested. Guaranteed to have
828 ``dataset_id is not None``.
829 formatter : `type` or `Formatter`
830 `Formatter` subclass to use for this dataset or an instance.
831 transfer : `str`, optional
832 How (and whether) the dataset should be added to the datastore.
833 See `ingest` for details of transfer modes.
835 Returns
836 -------
837 info : `StoredFileInfo`
838 Internal datastore record for this file. This will be inserted by
839 the caller; `_extractIngestInfo` is only responsible for
840 creating and populating the struct.
842 Raises
843 ------
844 FileNotFoundError
845 Raised if one of the given files does not exist.
846 FileExistsError
847 Raised if transfer is not `None` but the (internal) location the
848 file would be moved to is already occupied.
849 """
850 if self._transaction is None:  # 850 ↛ 851: condition on line 850 was never true
851 raise RuntimeError("Ingest called without transaction enabled")
853 # Create URI of the source path, do not need to force a relative
854 # path to absolute.
855 srcUri = ButlerURI(path, forceAbsolute=False)
857 # Track whether we have read the size of the source yet
858 have_sized = False
860 tgtLocation: Optional[Location]
861 if transfer is None or transfer == "split":
862 # A relative path is assumed to be relative to the datastore
863 # in this context
864 if not srcUri.isabs():
865 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
866 else:
867 # Work out the path in the datastore from an absolute URI
868 # This is required to be within the datastore.
869 pathInStore = srcUri.relative_to(self.root)
870 if pathInStore is None and transfer is None:  # 870 ↛ 871: condition on line 870 was never true
871 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
872 f"not within datastore {self.root}")
873 if pathInStore:  # 873 ↛ 875: condition on line 873 was never false
874 tgtLocation = self.locationFactory.fromPath(pathInStore)
875 elif transfer == "split":
876 # Outside the datastore but treat that as a direct ingest
877 # instead.
878 tgtLocation = None
879 else:
880 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for"
881 f" URI {srcUri}")
882 elif transfer == "direct":  # 882 ↛ 887: condition on line 882 was never true
883 # Want to store the full URI to the resource directly in
884 # datastore. This is useful for referring to permanent archive
885 # storage for raw data.
886 # Trust that people know what they are doing.
887 tgtLocation = None
888 else:
889 # Work out the name we want this ingested file to have
890 # inside the datastore
891 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
892 if not tgtLocation.uri.dirname().exists():
893 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
894 tgtLocation.uri.dirname().mkdir()
896 # if we are transferring from a local file to a remote location
897 # it may be more efficient to get the size and checksum of the
898 # local file rather than the transferred one
899 if not srcUri.scheme or srcUri.scheme == "file":  # 899 ↛ 905: condition on line 899 was never false
900 size = srcUri.size()
901 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
902 have_sized = True
904 # transfer the resource to the destination
905 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
907 if tgtLocation is None:  # 907 ↛ 909: condition on line 907 was never true
908 # This means we are using direct mode
909 targetUri = srcUri
910 targetPath = str(srcUri)
911 else:
912 targetUri = tgtLocation.uri
913 targetPath = tgtLocation.pathInStore.path
915 # the file should exist in the datastore now
916 if not have_sized:
917 size = targetUri.size()
918 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
920 return StoredFileInfo(formatter=formatter, path=targetPath,
921 storageClass=ref.datasetType.storageClass,
922 component=ref.datasetType.component(),
923 file_size=size, checksum=checksum)
925 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
926 # Docstring inherited from Datastore._prepIngest.
927 filtered = []
928 for dataset in datasets:
929 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
930 if not acceptable:
931 continue
932 else:
933 dataset.refs = acceptable
934 if dataset.formatter is None:
935 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
936 else:
937 assert isinstance(dataset.formatter, (type, str))
938 dataset.formatter = getClassOf(dataset.formatter)
939 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
940 filtered.append(dataset)
941 return _IngestPrepData(filtered)
943 @transactional
944 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
945 # Docstring inherited from Datastore._finishIngest.
946 refsAndInfos = []
947 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
948 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
949 # Do ingest as if the first dataset ref is associated with the file
950 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
951 transfer=transfer)
952 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
953 self._register_datasets(refsAndInfos)
955 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
956 formatter: Union[Formatter, Type[Formatter]]) -> Location:
957 """Given a source URI and a DatasetRef, determine the name the
958 dataset will have inside datastore.
960 Parameters
961 ----------
962 srcUri : `ButlerURI`
963 URI to the source dataset file.
964 ref : `DatasetRef`
965 Ref associated with the newly-ingested dataset artifact. This
966 is used to determine the name within the datastore.
967 formatter : `Formatter` instance or class.
968 Formatter to use for validation. Can be a class or an instance.
970 Returns
971 -------
972 location : `Location`
973 Target location for the newly-ingested dataset.
974 """
975 # Ingesting a file from outside the datastore.
976 # This involves a new name.
977 template = self.templates.getTemplate(ref)
978 location = self.locationFactory.fromPath(template.format(ref))
980 # Get the extension
981 ext = srcUri.getExtension()
983 # Update the destination to include that extension
984 location.updateExtension(ext)
986 # Ask the formatter to validate this extension
987 formatter.validateExtension(location)
989 return location
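# Hypothetical example: ingesting "/data/raw/exp012345.fits" for a ref whose
# file template expands to "raw/exp012345" yields a datastore location of
# "raw/exp012345.fits" -- the extension comes from the source file and is
# then checked against the formatter via validateExtension().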
991 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
992 """Write out in memory dataset to datastore.
994 Parameters
995 ----------
996 inMemoryDataset : `object`
997 Dataset to write to datastore.
998 ref : `DatasetRef`
999 Registry information associated with this dataset.
1001 Returns
1002 -------
1003 info : `StoredFileInfo`
1004 Information describing the artifact written to the datastore.
1005 """
1006 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1007 uri = location.uri
1009 if not uri.dirname().exists():
1010 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1011 uri.dirname().mkdir()
1013 if self._transaction is None:  # 1013 ↛ 1014: condition on line 1013 was never true
1014 raise RuntimeError("Attempting to write artifact without transaction enabled")
1016 def _removeFileExists(uri: ButlerURI) -> None:
1017 """Remove a file and do not complain if it is not there.
1019 This is important since a formatter might fail before the file
1020 is written and we should not confuse people by writing spurious
1021 error messages to the log.
1022 """
1023 try:
1024 uri.remove()
1025 except FileNotFoundError:
1026 pass
1028 # Register a callback to try to delete the uploaded data if
1029 # something fails below
1030 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1032 # For a local file, simply use the formatter directly
1033 if uri.isLocal:
1034 try:
1035 formatter.write(inMemoryDataset)
1036 except Exception as e:
1037 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} "
1038 f"to location {uri}") from e
1039 log.debug("Successfully wrote python object to local file at %s", uri)
1040 else:
1041 # This is a remote URI. Some datasets can be serialized directly
1042 # to bytes and sent to the remote datastore without writing a
1043 # file. If the dataset is intended to be saved to the cache
1044 # a file is always written and direct write to the remote
1045 # datastore is bypassed.
1046 data_written = False
1047 if not self.cacheManager.should_be_cached(ref):
1048 try:
1049 serializedDataset = formatter.toBytes(inMemoryDataset)
1050 except NotImplementedError:
1051 # Fallback to the file writing option.
1052 pass
1053 except Exception as e:
1054 raise RuntimeError(f"Failed to serialize dataset {ref} "
1055 f"of type {type(inMemoryDataset)} to bytes.") from e
1056 else:
1057 log.debug("Writing bytes directly to %s", uri)
1058 uri.write(serializedDataset, overwrite=True)
1059 log.debug("Successfully wrote bytes directly to %s", uri)
1060 data_written = True
1062 if not data_written:
1063 # Did not write the bytes directly to object store so instead
1064 # write to temporary file.
1065 with ButlerURI.temporary_uri(suffix=uri.getExtension()) as temporary_uri:
1066 # Need to configure the formatter to write to a different
1067 # location and that needs us to overwrite internals
1068 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1069 with formatter._updateLocation(Location(None, temporary_uri)):
1070 try:
1071 formatter.write(inMemoryDataset)
1072 except Exception as e:
1073 raise RuntimeError(f"Failed to serialize dataset {ref} of type"
1074 f" {type(inMemoryDataset)} to "
1075 f"temporary location {temporary_uri}") from e
1076 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True)
1078 # Cache if required
1079 self.cacheManager.move_to_cache(temporary_uri, ref)
1081 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1083 # URI is needed to resolve what ingest case are we dealing with
1084 return self._extractIngestInfo(uri, ref, formatter=formatter)
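# The write strategy above, in brief:
#   local root               -> formatter.write() straight to the final URI
#   remote, not to be cached -> formatter.toBytes() + uri.write() if the
#                               formatter supports direct serialization
#   otherwise                -> write to a temporary file, transfer_from()
#                               that file, then offer it to the cache manager
# In every case an undo callback is registered first so that a failed
# transaction removes any partially written artifact.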
1086 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1087 ref: DatasetRef, isComponent: bool = False,
1088 cache_ref: Optional[DatasetRef] = None) -> Any:
1089 """Read the artifact from datastore into in memory object.
1091 Parameters
1092 ----------
1093 getInfo : `DatastoreFileGetInformation`
1094 Information about the artifact within the datastore.
1095 ref : `DatasetRef`
1096 The registry information associated with this artifact.
1097 isComponent : `bool`
1098 Flag to indicate if a component is being read from this artifact.
1099 cache_ref : `DatasetRef`, optional
1100 The DatasetRef to use when looking up the file in the cache.
1101 This ref must have the same ID as the supplied ref but can
1102 be a parent ref or component ref to indicate to the cache whether
1103 a composite file is being requested from the cache or a component
1104 file. Without this the cache will default to the supplied ref but
1105 it can get confused with read-only derived components for
1106 disassembled composites.
1108 Returns
1109 -------
1110 inMemoryDataset : `object`
1111 The artifact as a python object.
1112 """
1113 location = getInfo.location
1114 uri = location.uri
1115 log.debug("Accessing data from %s", uri)
1117 if cache_ref is None:
1118 cache_ref = ref
1119 if cache_ref.id != ref.id:  # 1119 ↛ 1120: condition on line 1119 was never true
1120 raise ValueError("The supplied cache dataset ref refers to a different dataset than expected:"
1121 f" {ref.id} != {cache_ref.id}")
1123 # Cannot recalculate checksum but can compare size as a quick check
1124 # Do not do this if the size is negative since that indicates
1125 # we do not know.
1126 recorded_size = getInfo.info.file_size
1127 resource_size = uri.size()
1128 if recorded_size >= 0 and resource_size != recorded_size:  # 1128 ↛ 1129: condition on line 1128 was never true
1129 raise RuntimeError("Integrity failure in Datastore. "
1130 f"Size of file {uri} ({resource_size}) "
1131 f"does not match size recorded in registry of {recorded_size}")
1133 # For the general case we have choices for how to proceed.
1134 # 1. Always use a local file (downloading the remote resource to a
1135 # temporary file if needed).
1136 # 2. Use a threshold size and read into memory and use bytes.
1137 # Use both for now with an arbitrary hand off size.
1138 # This allows small datasets to be downloaded from remote object
1139 # stores without requiring a temporary file.
1141 formatter = getInfo.formatter
1142 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1143 if resource_size <= nbytes_max and formatter.can_read_bytes():
1144 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1145 if cached_file is not None:
1146 desired_uri = cached_file
1147 msg = f" (cached version of {uri})"
1148 else:
1149 desired_uri = uri
1150 msg = ""
1151 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1152 serializedDataset = desired_uri.read()
1153 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1154 f"component {getInfo.component}" if isComponent else "",
1155 len(serializedDataset), uri, formatter.name())
1156 try:
1157 result = formatter.fromBytes(serializedDataset,
1158 component=getInfo.component if isComponent else None)
1159 except Exception as e:
1160 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1161 f" ({ref.datasetType.name} from {uri}): {e}") from e
1162 else:
1163 # Read from file.
1165 # Have to update the Location associated with the formatter
1166 # because formatter.read does not allow an override.
1167 # This could be improved.
1168 location_updated = False
1169 msg = ""
1171 # First check in cache for local version.
1172 # The cache will only be relevant for remote resources but
1173 # no harm in always asking. Context manager ensures that cache
1174 # file is not deleted during cache expiration.
1175 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1176 if cached_file is not None:
1177 msg = f"(via cache read of remote file {uri})"
1178 uri = cached_file
1179 location_updated = True
1181 with uri.as_local() as local_uri:
1183 can_be_cached = False
1184 if uri != local_uri:  # 1184 ↛ 1186: condition on line 1184 was never true
1185 # URI was remote and file was downloaded
1186 cache_msg = ""
1187 location_updated = True
1189 if self.cacheManager.should_be_cached(cache_ref):
1190 # In this scenario we want to ask if the downloaded
1191 # file should be cached but we should not cache
1192 # it until after we've used it (to ensure it can't
1193 # be expired whilst we are using it).
1194 can_be_cached = True
1196 # Say that it is "likely" to be cached because
1197 # if the formatter read fails we will not be
1198 # caching this file.
1199 cache_msg = " and likely cached"
1201 msg = f"(via download to local file{cache_msg})"
1203 # Calculate the (possibly) new location for the formatter
1204 # to use.
1205 newLocation = Location(*local_uri.split()) if location_updated else None
1207 log.debug("Reading%s from location %s %s with formatter %s",
1208 f" component {getInfo.component}" if isComponent else "",
1209 uri, msg, formatter.name())
1210 try:
1211 with formatter._updateLocation(newLocation):
1212 with time_this(log, msg="Reading%s from location %s %s with formatter %s",
1213 args=(f" component {getInfo.component}" if isComponent else "",
1214 uri, msg, formatter.name())):
1215 result = formatter.read(component=getInfo.component if isComponent else None)
1216 except Exception as e:
1217 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1218 f" ({ref.datasetType.name} from {uri}): {e}") from e
1220 # File was read successfully so can move to cache
1221 if can_be_cached:  # 1221 ↛ 1222: condition on line 1221 was never true
1222 self.cacheManager.move_to_cache(local_uri, cache_ref)
1224 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1225 isComponent=isComponent)
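# Reading strategy used above: artifacts no larger than nbytes_max (10 MB)
# whose formatter supports fromBytes() are read straight into memory (from
# the cache when a cached copy exists); anything else is materialised as a
# local file via uri.as_local() and handed to formatter.read(), with the
# downloaded file offered to the cache only after a successful read.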
1227 def knows(self, ref: DatasetRef) -> bool:
1228 """Check if the dataset is known to the datastore.
1230 Does not check for existence of any artifact.
1232 Parameters
1233 ----------
1234 ref : `DatasetRef`
1235 Reference to the required dataset.
1237 Returns
1238 -------
1239 exists : `bool`
1240 `True` if the dataset is known to the datastore.
1241 """
1242 fileLocations = self._get_dataset_locations_info(ref)
1243 if fileLocations:
1244 return True
1245 return False
1247 def exists(self, ref: DatasetRef) -> bool:
1248 """Check if the dataset exists in the datastore.
1250 Parameters
1251 ----------
1252 ref : `DatasetRef`
1253 Reference to the required dataset.
1255 Returns
1256 -------
1257 exists : `bool`
1258 `True` if the entity exists in the `Datastore`.
1259 """
1260 fileLocations = self._get_dataset_locations_info(ref)
1262 # if we are being asked to trust that registry might not be correct
1263 # we ask for the expected locations and check them explicitly
1264 if not fileLocations:
1265 if not self.trustGetRequest:
1266 return False
1268 # When we are guessing a dataset location we can not check
1269 # for the existence of every component since we can not
1270 # know if every component was written. Instead we check
1271 # for the existence of any of the expected locations.
1272 for location, _ in self._get_expected_dataset_locations_info(ref):  # 1272 ↛ 1275: loop on line 1272 didn't complete
1273 if self._artifact_exists(location):  # 1273 ↛ 1272: condition on line 1273 was never false
1274 return True
1275 return False
1277 # All listed artifacts must exist.
1278 for location, _ in fileLocations:
1279 if not self._artifact_exists(location):
1280 return False
1282 return True
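# Note on trust mode: when trustGetRequest is enabled and registry has no
# record of the dataset, existence is inferred from the predicted locations.
# Finding any one expected artifact counts as "exists", because a
# disassembled composite may legitimately be missing some components.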
1284 def getURIs(self, ref: DatasetRef,
1285 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1286 """Return URIs associated with dataset.
1288 Parameters
1289 ----------
1290 ref : `DatasetRef`
1291 Reference to the required dataset.
1292 predict : `bool`, optional
1293 If the datastore does not know about the dataset, should it
1294 return a predicted URI or not?
1296 Returns
1297 -------
1298 primary : `ButlerURI`
1299 The URI to the primary artifact associated with this dataset.
1300 If the dataset was disassembled within the datastore this
1301 may be `None`.
1302 components : `dict`
1303 URIs to any components associated with the dataset artifact.
1304 Can be empty if there are no components.
1305 """
1307 primary: Optional[ButlerURI] = None
1308 components: Dict[str, ButlerURI] = {}
1310 # if this has never been written then we have to guess
1311 if not self.exists(ref):
1312 if not predict:
1313 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1315 doDisassembly = self.composites.shouldBeDisassembled(ref)
1317 if doDisassembly:
1319 for component, componentStorage in ref.datasetType.storageClass.components.items():
1320 compRef = ref.makeComponentRef(component)
1321 compLocation, _ = self._determine_put_formatter_location(compRef)
1323 # Add a URI fragment to indicate this is a guess
1324 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1326 else:
1328 location, _ = self._determine_put_formatter_location(ref)
1330 # Add a URI fragment to indicate this is a guess
1331 primary = ButlerURI(location.uri.geturl() + "#predicted")
1333 return primary, components
1335 # If this is a ref that we have written we can get the path.
1336 # Get file metadata and internal metadata
1337 fileLocations = self._get_dataset_locations_info(ref)
1339 guessing = False
1340 if not fileLocations:
1341 if not self.trustGetRequest:  # 1341 ↛ 1342: condition on line 1341 was never true
1342 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1343 fileLocations = self._get_expected_dataset_locations_info(ref)
1344 guessing = True
1346 if len(fileLocations) == 1:
1347 # No disassembly so this is the primary URI
1348 uri = fileLocations[0][0].uri
1349 if guessing and not uri.exists():  # 1349 ↛ 1350: condition on line 1349 was never true
1350 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1351 primary = uri
1353 else:
1354 for location, storedFileInfo in fileLocations:
1355 if storedFileInfo.component is None:  # 1355 ↛ 1356: condition on line 1355 was never true
1356 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1357 uri = location.uri
1358 if guessing and not uri.exists():  # 1358 ↛ 1362: condition on line 1358 was never true
1359 # If we are trusting then it is entirely possible for
1360 # some components to be missing. In that case we skip
1361 # to the next component.
1362 if self.trustGetRequest:
1363 continue
1364 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1365 components[storedFileInfo.component] = uri
1367 return primary, components
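# Hypothetical usage sketch ("datastore" and "ref" are assumptions, not
# names defined in this module):
#
#     primary, components = datastore.getURIs(ref, predict=True)
#     if primary is None:
#         # disassembled composite: one URI per stored component
#         for name, uri in components.items():
#             print(name, uri)
#
# Predicted URIs carry a "#predicted" fragment so callers can tell they were
# guessed from templates rather than read from datastore records.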
1369 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1370 """URI to the Dataset.
1372 Parameters
1373 ----------
1374 ref : `DatasetRef`
1375 Reference to the required Dataset.
1376 predict : `bool`
1377 If `True`, allow URIs to be returned of datasets that have not
1378 been written.
1380 Returns
1381 -------
1382 uri : `ButlerURI`
1383 URI pointing to the dataset within the datastore. If the
1384 dataset does not exist in the datastore, and if ``predict`` is
1385 `True`, the URI will be a prediction and will include a URI
1386 fragment "#predicted".
1387 If the datastore does not have entities that relate well
1388 to the concept of a URI the returned URI will be
1389 descriptive. The returned URI is not guaranteed to be obtainable.
1391 Raises
1392 ------
1393 FileNotFoundError
1394 Raised if a URI has been requested for a dataset that does not
1395 exist and guessing is not allowed.
1396 RuntimeError
1397 Raised if a request is made for a single URI but multiple URIs
1398 are associated with this dataset.
1400 Notes
1401 -----
1402 When a predicted URI is requested an attempt will be made to form
1403 a reasonable URI based on file templates and the expected formatter.
1404 """
1405 primary, components = self.getURIs(ref, predict)
1406 if primary is None or components:  # 1406 ↛ 1407: condition on line 1406 was never true
1407 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1408 "Use Dataastore.getURIs() instead.")
1409 return primary
1411 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1412 destination: ButlerURI, transfer: str = "auto",
1413 preserve_path: bool = True,
1414 overwrite: bool = False) -> List[ButlerURI]:
1415 """Retrieve the file artifacts associated with the supplied refs.
1417 Parameters
1418 ----------
1419 refs : iterable of `DatasetRef`
1420 The datasets for which file artifacts are to be retrieved.
1421 A single ref can result in multiple files. The refs must
1422 be resolved.
1423 destination : `ButlerURI`
1424 Location to write the file artifacts.
1425 transfer : `str`, optional
1426 Method to use to transfer the artifacts. Must be one of the options
1427 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1428 preserve_path : `bool`, optional
1429 If `True` the full path of the file artifact within the datastore
1430 is preserved. If `False` the final file component of the path
1431 is used.
1432 overwrite : `bool`, optional
1433 If `True` allow transfers to overwrite existing files at the
1434 destination.
1436 Returns
1437 -------
1438 targets : `list` of `ButlerURI`
1439 URIs of file artifacts in destination location. Order is not
1440 preserved.
1441 """
1442 if not destination.isdir():  # 1442 ↛ 1443: condition on line 1442 was never true
1443 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1445 if transfer == "move":
1446 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1448 # Source -> Destination
1449 # This also helps filter out duplicate DatasetRef in the request
1450 # that will map to the same underlying file transfer.
1451 to_transfer: Dict[ButlerURI, ButlerURI] = {}
1453 for ref in refs:
1454 locations = self._get_dataset_locations_info(ref)
1455 for location, _ in locations:
1456 source_uri = location.uri
1457 target_path: Union[str, ButlerURI]
1458 if preserve_path:
1459 target_path = location.pathInStore
1460 if target_path.isabs():  # 1460 ↛ 1463: condition on line 1460 was never true
1461 # This is an absolute path to an external file.
1462 # Use the full path.
1463 target_path = target_path.relativeToPathRoot
1464 else:
1465 target_path = source_uri.basename()
1466 target_uri = destination.join(target_path)
1467 to_transfer[source_uri] = target_uri
1469 # In theory can now parallelize the transfer
1470 log.debug("Number of artifacts to transfer to %s: %d",
1471 str(destination), len(to_transfer))
1472 for source_uri, target_uri in to_transfer.items():
1473 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1475 return list(to_transfer.values())
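# Hypothetical usage sketch (the destination path is an assumption):
#
#     dest = ButlerURI("/tmp/export/", forceDirectory=True)
#     copied = datastore.retrieveArtifacts(refs, dest, transfer="copy",
#                                          preserve_path=True)
#
# With preserve_path=True each file keeps its datastore-relative path under
# the destination; "move" is rejected because it would delete the
# datastore's own artifacts.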
1477 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1478 """Load an InMemoryDataset from the store.
1480 Parameters
1481 ----------
1482 ref : `DatasetRef`
1483 Reference to the required Dataset.
1484 parameters : `dict`
1485 `StorageClass`-specific parameters that specify, for example,
1486 a slice of the dataset to be loaded.
1488 Returns
1489 -------
1490 inMemoryDataset : `object`
1491 Requested dataset or slice thereof as an InMemoryDataset.
1493 Raises
1494 ------
1495 FileNotFoundError
1496 Requested dataset can not be retrieved.
1497 TypeError
1498 Return value from formatter has unexpected type.
1499 ValueError
1500 Formatter failed to process the dataset.
1501 """
1502 allGetInfo = self._prepare_for_get(ref, parameters)
1503 refComponent = ref.datasetType.component()
1505 # Supplied storage class for the component being read
1506 refStorageClass = ref.datasetType.storageClass
1508 # Create mapping from component name to related info
1509 allComponents = {i.component: i for i in allGetInfo}
1511 # By definition the dataset is disassembled if we have more
1512 # than one record for it.
1513 isDisassembled = len(allGetInfo) > 1
1515 # Look for the special case where we are disassembled but the
1516 # component is a derived component that was not written during
1517 # disassembly. For this scenario we need to check that the
1518 # component requested is listed as a derived component for the
1519 # composite storage class
1520 isDisassembledReadOnlyComponent = False
1521 if isDisassembled and refComponent:
1522 # The composite storage class should be accessible through
1523 # the component dataset type
1524 compositeStorageClass = ref.datasetType.parentStorageClass
1526 # In the unlikely scenario where the composite storage
1527 # class is not known, we can only assume that this is a
1528 # normal component. If that assumption is wrong then the
1529 # branch below that reads a persisted component will fail
1530 # so there is no need to complain here.
1531 if compositeStorageClass is not None: 1531 ↛ 1534line 1531 didn't jump to line 1534, because the condition on line 1531 was never false
1532 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1534 if isDisassembled and not refComponent:
1535 # This was a disassembled dataset spread over multiple files
1536 # and we need to put them all back together again.
1537 # Read into memory and then assemble
1539 # Check that the supplied parameters are suitable for the type read
1540 refStorageClass.validateParameters(parameters)
1542 # We want to keep track of all the parameters that were not used
1543 # by formatters. We assume that if any of the component formatters
1544 # used a parameter then we do not need to apply it again in the
1545 # assembler.
1546 usedParams = set()
1548 components: Dict[str, Any] = {}
1549 for getInfo in allGetInfo:
1550 # assemblerParams are parameters not understood by the
1551 # associated formatter.
1552 usedParams.update(set(getInfo.formatterParams))
1554 component = getInfo.component
1556 if component is None: 1556 ↛ 1557line 1556 didn't jump to line 1557, because the condition on line 1556 was never true
1557 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1559 # We do not want the formatter to think it's reading
1560 # a component though because it is really reading a
1561 # standalone dataset -- always tell reader it is not a
1562 # component.
1563 components[component] = self._read_artifact_into_memory(getInfo,
1564 ref.makeComponentRef(component),
1565 isComponent=False)
1567 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1569 # Any unused parameters will have to be passed to the assembler
1570 if parameters:
1571 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1572 else:
1573 unusedParams = {}
1575 # Process parameters
1576 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1577 parameters=unusedParams)
1579 elif isDisassembledReadOnlyComponent:
1581 compositeStorageClass = ref.datasetType.parentStorageClass
1582 if compositeStorageClass is None: 1582 ↛ 1583line 1582 didn't jump to line 1583, because the condition on line 1582 was never true
1583 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since "
1584 "no composite storage class is available.")
1586 if refComponent is None: 1586 ↛ 1588line 1586 didn't jump to line 1588, because the condition on line 1586 was never true
1587 # Mainly for mypy
1588 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1590 # Assume that every derived component can be calculated by
1591 # forwarding the request to a single read/write component.
1592 # Rather than guessing which rw component is the right one by
1593 # scanning each for a derived component of the same name,
1594 # we ask the storage class delegate directly which one is best to
1595 # use.
1596 compositeDelegate = compositeStorageClass.delegate()
1597 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1598 set(allComponents))
1600 # Select the relevant component
1601 rwInfo = allComponents[forwardedComponent]
1603 # For now assume that read parameters are validated against
1604 # the real component and not the requested component
1605 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1606 forwardedStorageClass.validateParameters(parameters)
1608 # The reference to use for the caching must refer to the forwarded
1609 # component and not the derived component.
1610 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
1612 # Unfortunately the FileDescriptor inside the formatter will have
1613 # the wrong write storage class so we need to create a new one
1614 # given the immutability constraint.
1615 writeStorageClass = rwInfo.info.storageClass
1617 # We may need to put some thought into parameters for read
1618 # components but for now forward them on as is
1619 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1620 readStorageClass=refStorageClass,
1621 storageClass=writeStorageClass,
1622 parameters=parameters),
1623 ref.dataId)
1625 # The assembler can not receive any parameter requests for a
1626 # derived component at this time since the assembler will
1627 # see the storage class of the derived component and those
1628 # parameters will have to be handled by the formatter on the
1629 # forwarded storage class.
1630 assemblerParams: Dict[str, Any] = {}
1632 # Need to create a new info that specifies the derived
1633 # component and associated storage class
1634 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1635 rwInfo.info, assemblerParams, {},
1636 refComponent, refStorageClass)
1638 return self._read_artifact_into_memory(readInfo, ref, isComponent=True,
1639 cache_ref=cache_ref)
1641 else:
1642 # Single file request or component from that composite file
1643 for lookup in (refComponent, None): 1643 ↛ 1648line 1643 didn't jump to line 1648, because the loop on line 1643 didn't complete
1644 if lookup in allComponents: 1644 ↛ 1643line 1644 didn't jump to line 1643, because the condition on line 1644 was never false
1645 getInfo = allComponents[lookup]
1646 break
1647 else:
1648 raise FileNotFoundError(f"Component {refComponent} not found "
1649 f"for ref {ref} in datastore {self.name}")
1651 # Do not need the component itself if already disassembled
1652 if isDisassembled:
1653 isComponent = False
1654 else:
1655 isComponent = getInfo.component is not None
1657 # For a component read of a composite we want the cache to
1658 # be looking at the composite ref itself.
1659 cache_ref = ref.makeCompositeRef() if isComponent else ref
1661 # For a disassembled component we can validate parameters against
1662 # the component storage class directly
1663 if isDisassembled:
1664 refStorageClass.validateParameters(parameters)
1665 else:
1666 # For an assembled composite this could be a derived
1667 # component computed from a real component. The validity
1668 # of the parameters is not clear. For now validate against
1669 # the composite storage class
1670 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1672 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent,
1673 cache_ref=cache_ref)
1675 @transactional
1676 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1677 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1679 Parameters
1680 ----------
1681 inMemoryDataset : `object`
1682 The dataset to store.
1683 ref : `DatasetRef`
1684 Reference to the associated Dataset.
1686 Raises
1687 ------
1688 TypeError
1689 Supplied object and storage class are inconsistent.
1690 DatasetTypeNotSupportedError
1691 The associated `DatasetType` is not handled by this datastore.
1693 Notes
1694 -----
1695 If the datastore is configured to reject certain dataset types it
1696 is possible that the put will fail and raise a
1697 `DatasetTypeNotSupportedError`. The main use case for this is to
1698 allow `ChainedDatastore` to put to multiple datastores without
1699 requiring that every datastore accepts the dataset.
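Examples
--------
A minimal usage sketch, assuming ``datastore`` is an existing
`FileDatastore`, ``ref`` is a resolved `DatasetRef`, and
``inMemoryDataset`` matches the `StorageClass` of ``ref``:

>>> datastore.put(inMemoryDataset, ref)
>>> roundtripped = datastore.get(ref)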
1700 """
1702 doDisassembly = self.composites.shouldBeDisassembled(ref)
1703 # doDisassembly = True
1705 artifacts = []
1706 if doDisassembly:
1707 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
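# ``components`` maps component name to an object whose ``.component``
# attribute holds the in-memory value to write; for example a composite
# might disassemble into {"data": ..., "metadata": ...} (the component
# names here are purely illustrative).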
1708 for component, componentInfo in components.items():
1709 # Don't recurse because we want to take advantage of
1710 # bulk insert -- need a new DatasetRef that refers to the
1711 # same dataset_id but has the component DatasetType.
1712 # A parent DatasetType does not carry its component DatasetTypes,
1713 # so we construct the component ref ourselves.
1714 compRef = ref.makeComponentRef(component)
1715 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1716 artifacts.append((compRef, storedInfo))
1717 else:
1718 # Write the entire thing out
1719 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1720 artifacts.append((ref, storedInfo))
1722 self._register_datasets(artifacts)
1724 @transactional
1725 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
1726 # At this point can safely remove these datasets from the cache
1727 # to avoid confusion later on. If they are not trashed later
1728 # the cache will simply be refilled.
1729 self.cacheManager.remove_from_cache(ref)
1731 # Get file metadata and internal metadata
1732 if not isinstance(ref, DatasetRef):
1733 log.debug("Doing multi-dataset trash in datastore %s", self.name)
1734 # Assumed to be an iterable of refs so bulk mode enabled.
1735 try:
1736 self.bridge.moveToTrash(ref)
1737 except Exception as e:
1738 if ignore_errors:
1739 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
1740 else:
1741 raise
1742 return
1744 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
1746 fileLocations = self._get_dataset_locations_info(ref)
1748 if not fileLocations:
1749 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1750 if ignore_errors: 1750 ↛ 1751line 1750 didn't jump to line 1751, because the condition on line 1750 was never true
1751 log.warning(err_msg)
1752 return
1753 else:
1754 raise FileNotFoundError(err_msg)
1756 for location, storedFileInfo in fileLocations:
1757 if not self._artifact_exists(location): 1757 ↛ 1758line 1757 didn't jump to line 1758, because the condition on line 1757 was never true
1758 err_msg = f"Dataset is known to datastore {self.name} but " \
1759 f"associated artifact ({location.uri}) is missing"
1760 if ignore_errors:
1761 log.warning(err_msg)
1762 return
1763 else:
1764 raise FileNotFoundError(err_msg)
1766 # Mark dataset as trashed
1767 try:
1768 self.bridge.moveToTrash([ref])
1769 except Exception as e:
1770 if ignore_errors:
1771 log.warning("Attempted to mark dataset (%s) to be trashed in datastore %s "
1772 "but encountered an error: %s", ref, self.name, e)
1774 else:
1775 raise
1777 @transactional
1778 def emptyTrash(self, ignore_errors: bool = True) -> None:
1779 """Remove all datasets from the trash.
1781 Parameters
1782 ----------
1783 ignore_errors : `bool`
1784 If `True` return without error even if something went wrong.
1785 Problems could occur if another process is simultaneously trying
1786 to delete.
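Examples
--------
A minimal sketch of the two-step removal, assuming ``datastore`` is an
existing `FileDatastore` and ``ref`` is a resolved `DatasetRef` that it
manages:

>>> datastore.trash(ref)     # mark the dataset as trashed
>>> datastore.emptyTrash()   # delete the artifacts and their records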
1787 """
1788 log.debug("Emptying trash in datastore %s", self.name)
1790 # Context manager will empty trash iff we finish it without raising.
1791 # It will also automatically delete the relevant rows from the
1792 # trash table and the records table.
1793 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo,
1794 record_column="path") as trash_data:
1795 # Removing the artifacts themselves requires that the files are
1796 # not also associated with refs that are not to be trashed.
1797 # Therefore need to do a query with the file paths themselves
1798 # and return all the refs associated with them. Can only delete
1799 # a file if the refs to be trashed are the only refs associated
1800 # with the file.
1801 # This requires multiple copies of the trashed items
1802 trashed, artifacts_to_keep = trash_data
1804 if artifacts_to_keep is None:
1805 # The bridge is not helping us so have to work it out
1806 # ourselves. This is not going to be as efficient.
1807 trashed = list(trashed)
1809 # The instance check is for mypy since up to this point it
1810 # does not know the type of info.
1811 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed
1812 if isinstance(info, StoredFileInfo)])
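# path_map maps each artifact path to the set of dataset_ids still
# associated with it; any path whose set empties out in the loop
# below is referenced only by trashed datasets and can be deleted.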
1814 for ref, info in trashed:
1816 # Mypy needs to know this is not the base class
1817 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
1819 # Check for mypy
1820 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
1822 path_map[info.path].remove(ref.id)
1823 if not path_map[info.path]: 1823 ↛ 1814line 1823 didn't jump to line 1814, because the condition on line 1823 was never false
1824 del path_map[info.path]
1826 artifacts_to_keep = set(path_map)
1828 for ref, info in trashed:
1830 # Should not happen for this implementation but need
1831 # to keep mypy happy.
1832 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
1834 # Mypy needs to know this is not the base class
1835 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
1837 # Check for mypy
1838 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
1840 if info.path in artifacts_to_keep:
1841 # This is a multi-dataset artifact and we are not
1842 # removing all associated refs.
1843 continue
1845 # Only trashed refs still known to datastore will be returned.
1846 location = info.file_location(self.locationFactory)
1848 # Point of no return for this artifact
1849 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
1850 try:
1851 self._delete_artifact(location)
1852 except FileNotFoundError:
1853 # If the file itself has been deleted there is nothing
1854 # we can do about it. It is possible that trash has
1855 # been run in parallel in another process or someone
1856 # decided to delete the file. It is unlikely to come
1857 # back and so we should still continue with the removal
1858 # of the entry from the trash table. It is also possible
1859 # we removed it in a previous iteration if it was
1860 # a multi-dataset artifact. The delete artifact method
1861 # will log a debug message in this scenario.
1862 # Distinguishing a file that was missing before the trash
1863 # started from one already removed earlier in this trash
1864 # operation is not worth the potential memory cost of
1865 # tracking that state.
1866 pass
1867 except Exception as e:
1868 if ignore_errors:
1869 # Use a debug message here even though it's not
1870 # a good situation. In some cases this can be
1871 # caused by a race between user A and user B
1872 # and neither of them has permissions for the
1873 # other's files. Butler does not know about users
1874 # and trash has no idea what collections these
1875 # files were in (without guessing from a path).
1876 log.debug("Encountered error removing artifact %s from datastore %s: %s",
1877 location.uri, self.name, e)
1878 else:
1879 raise
1881 @transactional
1882 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef],
1883 local_refs: Optional[Iterable[DatasetRef]] = None,
1884 transfer: str = "auto") -> None:
1885 # Docstring inherited
1886 if type(self) is not type(source_datastore): 1886 ↛ 1887line 1886 didn't jump to line 1887, because the condition on line 1886 was never true
1887 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the "
1888 f"source datastore ({type(source_datastore)}).")
1890 # Be explicit for mypy
1891 if not isinstance(source_datastore, FileDatastore): 1891 ↛ 1892line 1891 didn't jump to line 1892, because the condition on line 1891 was never true
1892 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not"
1893 f" {type(source_datastore)}")
1895 # Stop early if "direct" transfer mode is requested. That would
1896 # require that the URI inside the source datastore be stored
1897 # directly in the target datastore, which seems unlikely to be useful
1898 # since at any moment the source datastore could delete the file.
1899 if transfer in ("direct", "split"): 1899 ↛ 1900line 1899 didn't jump to line 1900, because the condition on line 1899 was never true
1900 raise ValueError("Can not transfer from a source datastore using direct mode since"
1901 " those files are controlled by the other datastore.")
1903 # We will go through the list multiple times so must convert
1904 # generators to lists.
1905 refs = list(refs)
1907 if local_refs is None: 1907 ↛ 1908line 1907 didn't jump to line 1908, because the condition on line 1907 was never true
1908 local_refs = refs
1909 else:
1910 local_refs = list(local_refs)
1912 # In order to handle disassembled composites the code works
1913 # at the records level since it can assume that internal APIs
1914 # can be used.
1915 # - If the record already exists in the destination this is assumed
1916 # to be okay.
1917 # - If there is no record but the source and destination URIs are
1918 # identical no transfer is done but the record is added.
1919 # - If the source record refers to an absolute URI currently assume
1920 # that that URI should remain absolute and will be visible to the
1921 # destination butler. May need to have a flag to indicate whether
1922 # the dataset should be transferred. This will only happen if
1923 # the detached Butler has had a local ingest.
1925 # What we really want is all the records in the source datastore
1926 # associated with these refs. Or derived ones if they don't exist
1927 # in the source.
1928 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
1930 # The source dataset_ids are the keys in these records
1931 source_ids = set(source_records)
1932 log.debug("Number of datastore records found in source: %d", len(source_ids))
1934 # The not None check is to appease mypy
1935 requested_ids = set(ref.id for ref in refs if ref.id is not None)
1936 missing_ids = requested_ids - source_ids
1938 # Missing IDs can be okay if that datastore has allowed
1939 # gets based on file existence. Should we transfer what we can
1940 # or complain about it and warn?
1941 if missing_ids and not source_datastore.trustGetRequest: 1941 ↛ 1942line 1941 didn't jump to line 1942, because the condition on line 1941 was never true
1942 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:"
1943 f" {missing_ids}")
1945 # Need to map these missing IDs to a DatasetRef so we can guess
1946 # the details.
1947 if missing_ids: 1947 ↛ 1948line 1947 didn't jump to line 1948, because the condition on line 1947 was never true
1948 log.info("Number of expected datasets missing from source datastore records: %d out of %d",
1949 len(missing_ids), len(requested_ids))
1950 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
1952 for missing in missing_ids:
1953 # Ask the source datastore where the missing artifacts
1954 # should be. An execution butler might not know about the
1955 # artifacts even if they are there.
1956 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
1958 # Not all components can be guaranteed to exist so this
1959 # list has to filter those by checking to see if the
1960 # artifact is really there.
1961 records = [info for location, info in expected if location.uri.exists()]
1962 if records:
1963 source_records[missing].extend(records)
1964 else:
1965 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.",
1966 id_to_ref[missing])
1968 # See if we already have these records
1969 target_records = self._get_stored_records_associated_with_refs(local_refs)
1971 # The artifacts to register
1972 artifacts = []
1974 # Refs that already exist
1975 already_present = []
1977 # Now can transfer the artifacts
1978 for source_ref, target_ref in zip(refs, local_refs):
1979 if target_ref.id in target_records: 1979 ↛ 1981line 1979 didn't jump to line 1981, because the condition on line 1979 was never true
1980 # Already have an artifact for this.
1981 already_present.append(target_ref)
1982 continue
1984 # mypy needs to know these are always resolved refs
1985 for info in source_records[source_ref.getCheckedId()]:
1986 source_location = info.file_location(source_datastore.locationFactory)
1987 target_location = info.file_location(self.locationFactory)
1988 if source_location == target_location: 1988 ↛ 1992line 1988 didn't jump to line 1992, because the condition on line 1988 was never true
1989 # Either the dataset is already in the target datastore
1990 # (which is how execution butler currently runs) or
1991 # it is an absolute URI.
1992 if source_location.pathInStore.isabs():
1993 # Just because we can see the artifact when running
1994 # the transfer doesn't mean it will be generally
1995 # accessible to a user of this butler. For now warn
1996 # but assume it will be accessible.
1997 log.warning("Transfer request for an outside-datastore artifact has been found at %s",
1998 source_location)
1999 else:
2000 # Need to transfer it to the new location.
2001 # Assume we should always overwrite. If the artifact
2002 # is there this might indicate that a previous transfer
2003 # was interrupted but was not able to be rolled back
2004 # completely (e.g. pre-emption) so follow the Datastore default
2005 # and overwrite.
2006 target_location.uri.transfer_from(source_location.uri, transfer=transfer,
2007 overwrite=True, transaction=self._transaction)
2009 artifacts.append((target_ref, info))
2011 self._register_datasets(artifacts)
2013 if already_present: 2013 ↛ 2014line 2013 didn't jump to line 2014, because the condition on line 2013 was never true
2014 n_skipped = len(already_present)
2015 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped,
2016 "" if n_skipped == 1 else "s")
2018 @transactional
2019 def forget(self, refs: Iterable[DatasetRef]) -> None:
2020 # Docstring inherited.
2021 refs = list(refs)
2022 self.bridge.forget(refs)
2023 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2025 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
2026 logFailures: bool = False) -> None:
2027 """Validate some of the configuration for this datastore.
2029 Parameters
2030 ----------
2031 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2032 Entities to test against this configuration. Can be differing
2033 types.
2034 logFailures : `bool`, optional
2035 If `True`, output a log message for every validation error
2036 detected.
2038 Raises
2039 ------
2040 DatastoreValidationError
2041 Raised if there is a validation problem with a configuration.
2042 All the problems are reported in a single exception.
2044 Notes
2045 -----
2046 This method checks that all the supplied entities have valid file
2047 templates and also have formatters defined.
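Examples
--------
A minimal usage sketch, assuming ``datastore`` is an existing
`FileDatastore` and ``datasetType`` is a `DatasetType` it is expected
to handle:

>>> datastore.validateConfiguration([datasetType], logFailures=True)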
2048 """
2050 templateFailed = None
2051 try:
2052 self.templates.validateTemplates(entities, logFailures=logFailures)
2053 except FileTemplateValidationError as e:
2054 templateFailed = str(e)
2056 formatterFailed = []
2057 for entity in entities:
2058 try:
2059 self.formatterFactory.getFormatterClass(entity)
2060 except KeyError as e:
2061 formatterFailed.append(str(e))
2062 if logFailures: 2062 ↛ 2057line 2062 didn't jump to line 2057, because the condition on line 2062 was never false
2063 log.critical("Formatter failure: %s", e)
2065 if templateFailed or formatterFailed:
2066 messages = []
2067 if templateFailed: 2067 ↛ 2068line 2067 didn't jump to line 2068, because the condition on line 2067 was never true
2068 messages.append(templateFailed)
2069 if formatterFailed: 2069 ↛ 2071line 2069 didn't jump to line 2071, because the condition on line 2069 was never false
2070 messages.append(",".join(formatterFailed))
2071 msg = ";\n".join(messages)
2072 raise DatastoreValidationError(msg)
2074 def getLookupKeys(self) -> Set[LookupKey]:
2075 # Docstring is inherited from base class
2076 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
2077 self.constraints.getLookupKeys()
2079 def validateKey(self, lookupKey: LookupKey,
2080 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2081 # Docstring is inherited from base class
2082 # The key can be valid in either formatters or templates so we can
2083 # only check the template if it exists
2084 if lookupKey in self.templates:
2085 try:
2086 self.templates[lookupKey].validateTemplate(entity)
2087 except FileTemplateValidationError as e:
2088 raise DatastoreValidationError(e) from e
2090 def export(self, refs: Iterable[DatasetRef], *,
2091 directory: Optional[Union[ButlerURI, str]] = None,
2092 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
2093 # Docstring inherited from Datastore.export.
2094 if transfer is not None and directory is None: 2094 ↛ 2095line 2094 didn't jump to line 2095, because the condition on line 2094 was never true
2095 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
2096 "export directory given")
2098 # Force the directory to be a URI object
2099 directoryUri: Optional[ButlerURI] = None
2100 if directory is not None: 2100 ↛ 2103line 2100 didn't jump to line 2103, because the condition on line 2100 was never false
2101 directoryUri = ButlerURI(directory, forceDirectory=True)
2103 if transfer is not None and directoryUri is not None: 2103 ↛ 2108line 2103 didn't jump to line 2108, because the condition on line 2103 was never false
2104 # mypy needs the second test
2105 if not directoryUri.exists(): 2105 ↛ 2106line 2105 didn't jump to line 2106, because the condition on line 2105 was never true
2106 raise FileNotFoundError(f"Export location {directory} does not exist")
2108 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2109 for ref in progress.wrap(refs, "Exporting dataset files"):
2110 fileLocations = self._get_dataset_locations_info(ref)
2111 if not fileLocations: 2111 ↛ 2112line 2111 didn't jump to line 2112, because the condition on line 2111 was never true
2112 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2113 # For now we can not export disassembled datasets
2114 if len(fileLocations) > 1:
2115 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2116 location, storedFileInfo = fileLocations[0]
2118 pathInStore = location.pathInStore.path
2119 if transfer is None: 2119 ↛ 2123line 2119 didn't jump to line 2123, because the condition on line 2119 was never true
2120 # TODO: do we also need to return the readStorageClass somehow?
2121 # We will use the path in store directly. If this is an
2122 # absolute URI, preserve it.
2123 if location.pathInStore.isabs():
2124 pathInStore = str(location.uri)
2125 elif transfer == "direct": 2125 ↛ 2127line 2125 didn't jump to line 2127, because the condition on line 2125 was never true
2126 # Use full URIs to the remote store in the export
2127 pathInStore = str(location.uri)
2128 else:
2129 # mypy needs help
2130 assert directoryUri is not None, "directoryUri must be defined to get here"
2131 storeUri = ButlerURI(location.uri)
2133 # if the datastore has an absolute URI to a resource, we
2134 # have two options:
2135 # 1. Keep the absolute URI in the exported YAML
2136 # 2. Allocate a new name in the local datastore and transfer
2137 # it.
2138 # For now go with option 2
2139 if location.pathInStore.isabs(): 2139 ↛ 2140line 2139 didn't jump to line 2140, because the condition on line 2139 was never true
2140 template = self.templates.getTemplate(ref)
2141 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
2142 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2144 exportUri = directoryUri.join(pathInStore)
2145 exportUri.transfer_from(storeUri, transfer=transfer)
2147 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2149 @staticmethod
2150 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
2151 """Compute the checksum of the supplied file.
2153 Parameters
2154 ----------
2155 uri : `ButlerURI`
2156 Name of resource to calculate checksum from.
2157 algorithm : `str`, optional
2158 Name of algorithm to use. Must be one of the algorithms supported
2159 by the :py:mod:`hashlib` module.
2160 block_size : `int`
2161 Number of bytes to read from file at one time.
2163 Returns
2164 -------
2165 hexdigest : `str`
2166 Hex digest of the file.
2168 Notes
2169 -----
2170 Currently returns `None` if the URI is for a remote resource.
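Examples
--------
A minimal usage sketch with a local file (the path is illustrative):

>>> uri = ButlerURI("/tmp/example.dat")
>>> FileDatastore.computeChecksum(uri)  # doctest: +SKIP
'...'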
2171 """
2172 if algorithm not in hashlib.algorithms_guaranteed: 2172 ↛ 2173line 2172 didn't jump to line 2173, because the condition on line 2172 was never true
2173 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2175 if not uri.isLocal: 2175 ↛ 2176line 2175 didn't jump to line 2176, because the condition on line 2175 was never true
2176 return None
2178 hasher = hashlib.new(algorithm)
2180 with uri.as_local() as local_uri:
2181 with open(local_uri.ospath, "rb") as f:
2182 for chunk in iter(lambda: f.read(block_size), b""):
2183 hasher.update(chunk)
2185 return hasher.hexdigest()
2187 def needs_expanded_data_ids(
2188 self,
2189 transfer: Optional[str],
2190 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2191 ) -> bool:
2192 # Docstring inherited.
2193 # This _could_ also use entity to inspect whether the filename template
2194 # involves placeholders other than the required dimensions for its
2195 # dataset type, but that's not necessary for correctness; it just
2196 # enables more optimizations (perhaps only in theory).
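# File templates only need expanded data IDs when this datastore will
# construct a new path for the artifact; "direct" and no-transfer
# (None) ingest keep the original URI or path as-is.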
2197 return transfer not in ("direct", None)