Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 83%
833 statements
coverage.py v6.5.0, created at 2022-12-01 19:54 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
30from sqlalchemy import BigInteger, String
32from collections import defaultdict
33from dataclasses import dataclass
34from typing import (
35 TYPE_CHECKING,
36 Any,
37 ClassVar,
38 Dict,
39 Iterable,
40 List,
41 Mapping,
42 Optional,
43 Set,
44 Tuple,
45 Type,
46 Union,
47)
49from lsst.daf.butler import (
50 ButlerURI,
51 CompositesMap,
52 Config,
53 FileDataset,
54 DatasetId,
55 DatasetRef,
56 DatasetType,
57 DatasetTypeNotSupportedError,
58 Datastore,
59 DatastoreCacheManager,
60 DatastoreDisabledCacheManager,
61 DatastoreConfig,
62 DatastoreValidationError,
63 FileDescriptor,
64 FileTemplates,
65 FileTemplateValidationError,
66 Formatter,
67 FormatterFactory,
68 Location,
69 LocationFactory,
70 Progress,
71 StorageClass,
72 StoredFileInfo,
73 VERBOSE,
74)
76from lsst.daf.butler import ddl
77from lsst.daf.butler.registry.interfaces import (
78 ReadOnlyDatabaseError,
79 DatastoreRegistryBridge,
80)
82from lsst.daf.butler.core.repoRelocation import replaceRoot
83from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional, time_this, chunk_iterable
84from .genericDatastore import GenericBaseDatastore
86if TYPE_CHECKING:
87 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager
88 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
90log = logging.getLogger(__name__)
93class _IngestPrepData(Datastore.IngestPrepData):
94 """Helper class for FileDatastore ingest implementation.
96 Parameters
97 ----------
98 datasets : `list` of `FileDataset`
99 Files to be ingested by this datastore.
100 """
101 def __init__(self, datasets: List[FileDataset]):
102 super().__init__(ref for dataset in datasets for ref in dataset.refs)
103 self.datasets = datasets
106@dataclass(frozen=True)
107class DatastoreFileGetInformation:
108 """Collection of useful parameters needed to retrieve a file from
109 a Datastore.
110 """
112 location: Location
113 """The location from which to read the dataset."""
115 formatter: Formatter
116 """The `Formatter` to use to deserialize the dataset."""
118 info: StoredFileInfo
119 """Stored information about this file and its formatter."""
121 assemblerParams: Dict[str, Any]
122 """Parameters to use for post-processing the retrieved dataset."""
124 formatterParams: Dict[str, Any]
125 """Parameters that were understood by the associated formatter."""
127 component: Optional[str]
128 """The component to be retrieved (can be `None`)."""
130 readStorageClass: StorageClass
131 """The `StorageClass` of the dataset being read."""
134class FileDatastore(GenericBaseDatastore):
135 """Generic Datastore for file-based implementations.
137 Should always be sub-classed since key abstract methods are missing.
139 Parameters
140 ----------
141 config : `DatastoreConfig` or `str`
142 Configuration as either a `Config` object or URI to file.
143 bridgeManager : `DatastoreRegistryBridgeManager`
144 Object that manages the interface between `Registry` and datastores.
145 butlerRoot : `str`, optional
146 New datastore root to use to override the configuration value.
148 Raises
149 ------
150 ValueError
151 If root location does not exist and ``create`` is `False` in the
152 configuration.
153 """
155 defaultConfigFile: ClassVar[Optional[str]] = None
156 """Path to configuration defaults. Accessed within the ``config`` resource
157 or relative to a search path. Can be None if no defaults specified.
158 """
160 root: ButlerURI
161 """Root directory URI of this `Datastore`."""
163 locationFactory: LocationFactory
164 """Factory for creating locations relative to the datastore root."""
166 formatterFactory: FormatterFactory
167 """Factory for creating instances of formatters."""
169 templates: FileTemplates
170 """File templates that can be used by this `Datastore`."""
172 composites: CompositesMap
173 """Determines whether a dataset should be disassembled on put."""
175 defaultConfigFile = "datastores/fileDatastore.yaml"
176 """Path to configuration defaults. Accessed within the ``config`` resource
177 or relative to a search path. Can be None if no defaults specified.
178 """
180 @classmethod
181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
182 """Set any filesystem-dependent config options for this Datastore to
183 be appropriate for a new empty repository with the given root.
185 Parameters
186 ----------
187 root : `str`
188 URI to the root of the data repository.
189 config : `Config`
190 A `Config` to update. Only the subset understood by
191 this component will be updated. Will not expand
192 defaults.
193 full : `Config`
194 A complete config with all defaults expanded that can be
195 converted to a `DatastoreConfig`. Read-only and will not be
196 modified by this method.
197 Repository-specific options that should not be obtained
198 from defaults when Butler instances are constructed
199 should be copied from ``full`` to ``config``.
200 overwrite : `bool`, optional
201 If `False`, do not modify a value in ``config`` if the value
202 already exists. Default is always to overwrite with the provided
203 ``root``.
205 Notes
206 -----
207 If a keyword is explicitly defined in the supplied ``config`` it
208 will not be overridden by this method if ``overwrite`` is `False`.
209 This allows explicit values set in external configs to be retained.
210 """
211 Config.updateParameters(DatastoreConfig, config, full,
212 toUpdate={"root": root},
213 toCopy=("cls", ("records", "table")), overwrite=overwrite)
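# Illustrative sketch (hypothetical usage; the surrounding repo-creation code is
# not part of this file): when a new repository is created the butler
# configuration machinery calls this hook roughly as
#
#     FileDatastore.setConfigRoot("file:///path/to/repo", config, full)
#
# seeding "root" in ``config`` and copying the repository-specific "cls" and
# ("records", "table") entries from the expanded ``full`` defaults.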
215 @classmethod
216 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
217 return ddl.TableSpec(
218 fields=[
219 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
220 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
221 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
222 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
223 # Use empty string to indicate no component
224 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
225 # TODO: should checksum be Base64Bytes instead?
226 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
227 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
228 ],
229 unique=frozenset(),
230 indexes=[tuple(["path"])],
231 )
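# Illustrative sketch (comments only; the actual table name comes from the
# "records", "table" configuration entry): this spec backs the opaque records
# table registered in __init__, keyed by (dataset_id, component) with an empty
# string meaning "no component":
#
#     spec = FileDatastore.makeTableSpec(bridgeManager.datasetIdColumnType)
#     # fields: dataset_id (PK), path (indexed), formatter, storage_class,
#     #         component (PK), checksum, file_size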
233 def __init__(self, config: Union[DatastoreConfig, str],
234 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
235 super().__init__(config, bridgeManager)
236 if "root" not in self.config: 236 ↛ 237line 236 didn't jump to line 237, because the condition on line 236 was never true
237 raise ValueError("No root directory specified in configuration")
239 # Name ourselves either using an explicit name or a name
240 # derived from the (unexpanded) root
241 if "name" in self.config:
242 self.name = self.config["name"]
243 else:
244 # We use the unexpanded root in the name to indicate that this
245 # datastore can be moved without having to update registry.
246 self.name = "{}@{}".format(type(self).__name__,
247 self.config["root"])
249 # Support repository relocation in config
250 # Existence of self.root is checked in subclass
251 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
252 forceDirectory=True, forceAbsolute=True)
254 self.locationFactory = LocationFactory(self.root)
255 self.formatterFactory = FormatterFactory()
257 # Now associate formatters with storage classes
258 self.formatterFactory.registerFormatters(self.config["formatters"],
259 universe=bridgeManager.universe)
261 # Read the file naming templates
262 self.templates = FileTemplates(self.config["templates"],
263 universe=bridgeManager.universe)
265 # See if composites should be disassembled
266 self.composites = CompositesMap(self.config["composites"],
267 universe=bridgeManager.universe)
269 tableName = self.config["records", "table"]
270 try:
271 # Storage of paths and formatters, keyed by dataset_id
272 self._table = bridgeManager.opaque.register(
273 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType))
274 # Interface to Registry.
275 self._bridge = bridgeManager.register(self.name)
276 except ReadOnlyDatabaseError:
277 # If the database is read only and we just tried and failed to
278 # create a table, it means someone is trying to create a read-only
279 # butler client for an empty repo. That should be okay, as long
280 # as they then try to get any datasets before some other client
281 # creates the table. Chances are they're just validating
282 # configuration.
283 pass
285 # Determine whether checksums should be used - default to False
286 self.useChecksum = self.config.get("checksum", False)
288 # Determine whether we can fall back to configuration if a
289 # requested dataset is not known to registry
290 self.trustGetRequest = self.config.get("trust_get_request", False)
292 # Create a cache manager
293 self.cacheManager: AbstractDatastoreCacheManager
294 if "cached" in self.config: 294 ↛ 298line 294 didn't jump to line 298, because the condition on line 294 was never false
295 self.cacheManager = DatastoreCacheManager(self.config["cached"],
296 universe=bridgeManager.universe)
297 else:
298 self.cacheManager = DatastoreDisabledCacheManager("",
299 universe=bridgeManager.universe)
301 # Check existence and create directory structure if necessary
302 if not self.root.exists():
303 if "create" not in self.config or not self.config["create"]: 303 ↛ 304line 303 didn't jump to line 304, because the condition on line 303 was never true
304 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
305 try:
306 self.root.mkdir()
307 except Exception as e:
308 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
309 f" Got error: {e}") from e
311 def __str__(self) -> str:
312 return str(self.root)
314 @property
315 def bridge(self) -> DatastoreRegistryBridge:
316 return self._bridge
318 def _artifact_exists(self, location: Location) -> bool:
319 """Check that an artifact exists in this datastore at the specified
320 location.
322 Parameters
323 ----------
324 location : `Location`
325 Expected location of the artifact associated with this datastore.
327 Returns
328 -------
329 exists : `bool`
330 `True` if the location can be found, `False` otherwise.
331 """
332 log.debug("Checking if resource exists: %s", location.uri)
333 return location.uri.exists()
335 def _delete_artifact(self, location: Location) -> None:
336 """Delete the artifact from the datastore.
338 Parameters
339 ----------
340 location : `Location`
341 Location of the artifact associated with this datastore.
342 """
343 if location.pathInStore.isabs():
344 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
346 try:
347 location.uri.remove()
348 except FileNotFoundError:
349 log.debug("File %s did not exist and so could not be deleted.", location.uri)
350 raise
351 except Exception as e:
352 log.critical("Failed to delete file: %s (%s)", location.uri, e)
353 raise
354 log.debug("Successfully deleted file: %s", location.uri)
356 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
357 # Docstring inherited from GenericBaseDatastore
358 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
359 self._table.insert(*records)
361 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
362 # Docstring inherited from GenericBaseDatastore
364 # Look for the dataset_id -- there might be multiple matches
365 # if we have disassembled the dataset.
366 records = self._table.fetch(dataset_id=ref.id)
367 return [StoredFileInfo.from_record(record) for record in records]
369 def _get_stored_records_associated_with_refs(self,
370 refs: Iterable[DatasetIdRef]
371 ) -> Dict[DatasetId, List[StoredFileInfo]]:
372 """Retrieve all records associated with the provided refs.
374 Parameters
375 ----------
376 refs : iterable of `DatasetIdRef`
377 The refs for which records are to be retrieved.
379 Returns
380 -------
381 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
382 The matching records indexed by the ref ID. The number of entries
383 in the dict can be smaller than the number of requested refs.
384 """
385 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
387 # Uniqueness is dataset_id + component so can have multiple records
388 # per ref.
389 records_by_ref = defaultdict(list)
390 for record in records:
391 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
392 return records_by_ref
394 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str,
395 Set[DatasetId]]:
396 """Return paths and associated dataset refs.
398 Parameters
399 ----------
400 paths : `list` of `str` or `ButlerURI`
401 All the paths to include in search.
403 Returns
404 -------
405 mapping : `dict` of [`str`, `set` [`DatasetId`]]
406 Mapping of each path to a set of associated database IDs.
407 """
408 records = self._table.fetch(path=[str(path) for path in paths])
409 result = defaultdict(set)
410 for row in records:
411 result[row["path"]].add(row["dataset_id"])
412 return result
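# Illustrative sketch (hypothetical paths and IDs): the result maps each
# artifact path to every dataset ID that references it, e.g.
#
#     {"dir/shared.fits": {id1, id2}, "dir/solo.fits": {id3}}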
414 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]:
415 """Return all dataset refs associated with the supplied path.
417 Parameters
418 ----------
419 pathInStore : `ButlerURI`
420 Path of interest in the data store.
422 Returns
423 -------
424 ids : `set` [`DatasetId`]
425 All `DatasetRef` IDs associated with this path.
426 """
427 records = list(self._table.fetch(path=str(pathInStore)))
428 ids = {r["dataset_id"] for r in records}
429 return ids
431 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
432 # Docstring inherited from GenericBaseDatastore
433 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
435 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
436 r"""Find all the `Location`\ s of the requested dataset in the
437 `Datastore` and the associated stored file information.
439 Parameters
440 ----------
441 ref : `DatasetRef`
442 Reference to the required `Dataset`.
444 Returns
445 -------
446 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
447 Location of the dataset within the datastore and
448 stored information about each file and its formatter.
449 """
450 # Get the file information (this will fail if no file)
451 records = self.getStoredItemsInfo(ref)
453 # Use the path to determine the location -- we need to take
454 # into account absolute URIs in the datastore record
455 return [(r.file_location(self.locationFactory), r) for r in records]
457 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
458 """Check that there is only one dataset associated with the
459 specified artifact.
461 Parameters
462 ----------
463 ref : `DatasetRef` or `FakeDatasetRef`
464 Dataset to be removed.
465 location : `Location`
466 The location of the artifact to be removed.
468 Returns
469 -------
470 can_remove : `bool`
471 `True` if the artifact can be safely removed.
472 """
473 # Can't ever delete absolute URIs.
474 if location.pathInStore.isabs():
475 return False
477 # Get all entries associated with this path
478 allRefs = self._registered_refs_per_artifact(location.pathInStore)
479 if not allRefs:
480 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
482 # Remove these refs from all the refs and if there is nothing left
483 # then we can delete
484 remainingRefs = allRefs - {ref.id}
486 if remainingRefs:
487 return False
488 return True
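# Illustrative sketch (hypothetical IDs) of the rule above:
#
#     allRefs = {1, 2}, ref.id = 1  ->  remainingRefs = {2}    ->  False
#     allRefs = {1},    ref.id = 1  ->  remainingRefs = set()  ->  True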
490 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
491 StoredFileInfo]]:
492 """Predict the location and related file information of the requested
493 dataset in this datastore.
495 Parameters
496 ----------
497 ref : `DatasetRef`
498 Reference to the required `Dataset`.
500 Returns
501 -------
502 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
503 Expected Location of the dataset within the datastore and
504 placeholder information about each file and its formatter.
506 Notes
507 -----
508 Uses the current configuration to determine how we would expect the
509 datastore files to have been written if we couldn't ask registry.
510 This is safe so long as there has been no change to datastore
511 configuration between writing the dataset and wanting to read it.
512 Will not work for files that have been ingested without using the
513 standard file template or default formatter.
514 """
516 # If we have a component ref we always need to ask the questions
517 # of the composite. If the composite is disassembled this routine
518 # should return all components. If the composite was not
519 # disassembled the composite is what is stored regardless of
520 # component request. Note that if the caller has disassembled
521 # a composite there is no way for this guess to know that
522 # without trying both the composite and component ref and seeing
523 # if there is something at the component Location even without
524 # disassembly being enabled.
525 if ref.datasetType.isComponent():
526 ref = ref.makeCompositeRef()
528 # See if the ref is a composite that should be disassembled
529 doDisassembly = self.composites.shouldBeDisassembled(ref)
531 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
533 if doDisassembly:
534 for component, componentStorage in ref.datasetType.storageClass.components.items():
535 compRef = ref.makeComponentRef(component)
536 location, formatter = self._determine_put_formatter_location(compRef)
537 all_info.append((location, formatter, componentStorage, component))
539 else:
540 # Always use the composite ref if no disassembly
541 location, formatter = self._determine_put_formatter_location(ref)
542 all_info.append((location, formatter, ref.datasetType.storageClass, None))
544 # Convert the list of tuples to have StoredFileInfo as second element
545 return [(location, StoredFileInfo(formatter=formatter,
546 path=location.pathInStore.path,
547 storageClass=storageClass,
548 component=component,
549 checksum=None,
550 file_size=-1))
551 for location, formatter, storageClass, component in all_info]
553 def _prepare_for_get(self, ref: DatasetRef,
554 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
555 """Check parameters for ``get`` and obtain formatter and
556 location.
558 Parameters
559 ----------
560 ref : `DatasetRef`
561 Reference to the required Dataset.
562 parameters : `dict`
563 `StorageClass`-specific parameters that specify, for example,
564 a slice of the dataset to be loaded.
566 Returns
567 -------
568 getInfo : `list` [`DatastoreFileGetInformation`]
569 Parameters needed to retrieve each file.
570 """
571 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
573 # Get file metadata and internal metadata
574 fileLocations = self._get_dataset_locations_info(ref)
575 if not fileLocations:
576 if not self.trustGetRequest:
577 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
578 # Assume the dataset is where we think it should be
579 fileLocations = self._get_expected_dataset_locations_info(ref)
581 # The storage class we want to use eventually
582 refStorageClass = ref.datasetType.storageClass
584 if len(fileLocations) > 1:
585 disassembled = True
587 # If trust is involved it is possible that there will be
588 # components listed here that do not exist in the datastore.
589 # Explicitly check for file artifact existence and filter out any
590 # that are missing.
591 if self.trustGetRequest:
592 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
594 # For now complain only if we have no components at all. One
595 # component is probably a problem but we can punt that to the
596 # assembler.
597 if not fileLocations:
598 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
600 else:
601 disassembled = False
603 # Is this a component request?
604 refComponent = ref.datasetType.component()
606 fileGetInfo = []
607 for location, storedFileInfo in fileLocations:
609 # The storage class used to write the file
610 writeStorageClass = storedFileInfo.storageClass
612 # If this has been disassembled we need read to match the write
613 if disassembled:
614 readStorageClass = writeStorageClass
615 else:
616 readStorageClass = refStorageClass
618 formatter = getInstanceOf(storedFileInfo.formatter,
619 FileDescriptor(location, readStorageClass=readStorageClass,
620 storageClass=writeStorageClass, parameters=parameters),
621 ref.dataId)
623 formatterParams, notFormatterParams = formatter.segregateParameters()
625 # Of the remaining parameters, extract the ones supported by
626 # this StorageClass (for components not all will be handled)
627 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
629 # The ref itself could be a component if the dataset was
630 # disassembled by butler, or we disassembled in datastore and
631 # components came from the datastore records
632 component = storedFileInfo.component if storedFileInfo.component else refComponent
634 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
635 assemblerParams, formatterParams,
636 component, readStorageClass))
638 return fileGetInfo
640 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
641 """Check the arguments for ``put`` and obtain formatter and
642 location.
644 Parameters
645 ----------
646 inMemoryDataset : `object`
647 The dataset to store.
648 ref : `DatasetRef`
649 Reference to the associated Dataset.
651 Returns
652 -------
653 location : `Location`
654 The location to write the dataset.
655 formatter : `Formatter`
656 The `Formatter` to use to write the dataset.
658 Raises
659 ------
660 TypeError
661 Supplied object and storage class are inconsistent.
662 DatasetTypeNotSupportedError
663 The associated `DatasetType` is not handled by this datastore.
664 """
665 self._validate_put_parameters(inMemoryDataset, ref)
666 return self._determine_put_formatter_location(ref)
668 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
669 """Calculate the formatter and output location to use for put.
671 Parameters
672 ----------
673 ref : `DatasetRef`
674 Reference to the associated Dataset.
676 Returns
677 -------
678 location : `Location`
679 The location to write the dataset.
680 formatter : `Formatter`
681 The `Formatter` to use to write the dataset.
682 """
683 # Work out output file name
684 try:
685 template = self.templates.getTemplate(ref)
686 except KeyError as e:
687 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
689 # Validate the template to protect against filenames from different
690 # dataIds returning the same and causing overwrite confusion.
691 template.validateTemplate(ref)
693 location = self.locationFactory.fromPath(template.format(ref))
695 # Get the formatter based on the storage class
696 storageClass = ref.datasetType.storageClass
697 try:
698 formatter = self.formatterFactory.getFormatter(ref,
699 FileDescriptor(location,
700 storageClass=storageClass),
701 ref.dataId)
702 except KeyError as e:
703 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
704 f"{self.name}") from e
706 # Now that we know the formatter, update the location
707 location = formatter.makeUpdatedLocation(location)
709 return location, formatter
711 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
712 # Docstring inherited from base class
713 if transfer != "auto":
714 return transfer
716 # See if the paths are within the datastore or not
717 inside = [self._pathInStore(d.path) is not None for d in datasets]
719 if all(inside):
720 transfer = None
721 elif not any(inside):
722 # Allow ButlerURI to use its own knowledge
723 transfer = "auto"
724 else:
725 # This can happen when importing from a datastore that
726 # has had some datasets ingested using "direct" mode.
727 # Allow ButlerURI to sort it out but warn about it.
730 log.warning("Some datasets are inside the datastore and some are outside. Using 'split' "
731 "transfer mode. This assumes that the files outside the datastore are "
732 "still accessible to the new butler since they will not be copied into "
733 "the target datastore.")
734 transfer = "split"
736 return transfer
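# Illustrative summary of the "auto" resolution above (sketch only):
#
#     every path inside the datastore  -> transfer = None    (use in place)
#     no path inside the datastore     -> transfer = "auto"  (ButlerURI decides)
#     a mixture of inside and outside  -> transfer = "split" (with a warning)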
738 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
739 """Return path relative to datastore root
741 Parameters
742 ----------
743 path : `str` or `ButlerURI`
744 Path to dataset. Can be an absolute URI. If relative, it is
745 assumed to be relative to the datastore root.
748 Returns
749 -------
750 inStore : `str`
751 Path relative to datastore root. Returns `None` if the file is
752 outside the root.
753 """
754 # Relative path will always be relative to datastore
755 pathUri = ButlerURI(path, forceAbsolute=False)
756 return pathUri.relative_to(self.root)
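# Illustrative sketch (hypothetical paths, assuming root "file:///repo/"):
#
#     self._pathInStore("data/x.fits")               -> "data/x.fits"
#     self._pathInStore("file:///repo/data/x.fits")  -> "data/x.fits"
#     self._pathInStore("file:///elsewhere/x.fits")  -> None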
758 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
759 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
760 """Standardize the path of a to-be-ingested file.
762 Parameters
763 ----------
764 path : `str` or `ButlerURI`
765 Path of a file to be ingested.
766 transfer : `str`, optional
767 How (and whether) the dataset should be added to the datastore.
768 See `ingest` for details of transfer modes.
769 This implementation is provided only so
770 `NotImplementedError` can be raised if the mode is not supported;
771 actual transfers are deferred to `_extractIngestInfo`.
773 Returns
774 -------
775 path : `str` or `ButlerURI`
776 New path in what the datastore considers standard form. If an
777 absolute URI was given that will be returned unchanged.
779 Notes
780 -----
781 Subclasses of `FileDatastore` can implement this method instead
782 of `_prepIngest`. It should not modify the data repository or given
783 file in any way.
785 Raises
786 ------
787 NotImplementedError
788 Raised if the datastore does not support the given transfer mode
789 (including the case where ingest is not supported at all).
790 FileNotFoundError
791 Raised if one of the given files does not exist.
792 """
793 if transfer not in (None, "direct", "split") + self.root.transferModes:
794 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
796 # A relative URI indicates relative to datastore root
797 srcUri = ButlerURI(path, forceAbsolute=False)
798 if not srcUri.isabs():
799 srcUri = self.root.join(path)
801 if not srcUri.exists():
802 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
803 f"are assumed to be relative to {self.root} unless they are absolute.")
805 if transfer is None:
806 relpath = srcUri.relative_to(self.root)
807 if not relpath:
808 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
809 f"within datastore ({self.root})")
811 # Return the relative path within the datastore for internal
812 # transfer
813 path = relpath
815 return path
817 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
818 formatter: Union[Formatter, Type[Formatter]],
819 transfer: Optional[str] = None) -> StoredFileInfo:
820 """Relocate (if necessary) and extract `StoredFileInfo` from a
821 to-be-ingested file.
823 Parameters
824 ----------
825 path : `str` or `ButlerURI`
826 URI or path of a file to be ingested.
827 ref : `DatasetRef`
828 Reference for the dataset being ingested. Guaranteed to have
829 a ``dataset_id`` that is not `None`.
830 formatter : `type` or `Formatter`
831 `Formatter` subclass to use for this dataset or an instance.
832 transfer : `str`, optional
833 How (and whether) the dataset should be added to the datastore.
834 See `ingest` for details of transfer modes.
836 Returns
837 -------
838 info : `StoredFileInfo`
839 Internal datastore record for this file. This will be inserted by
840 the caller; `_extractIngestInfo` is only responsible for
841 creating and populating the struct.
843 Raises
844 ------
845 FileNotFoundError
846 Raised if one of the given files does not exist.
847 FileExistsError
848 Raised if transfer is not `None` but the (internal) location the
849 file would be moved to is already occupied.
850 """
851 if self._transaction is None:
852 raise RuntimeError("Ingest called without transaction enabled")
854 # Create URI of the source path, do not need to force a relative
855 # path to absolute.
856 srcUri = ButlerURI(path, forceAbsolute=False)
858 # Track whether we have read the size of the source yet
859 have_sized = False
861 tgtLocation: Optional[Location]
862 if transfer is None or transfer == "split":
863 # A relative path is assumed to be relative to the datastore
864 # in this context
865 if not srcUri.isabs():
866 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
867 else:
868 # Work out the path in the datastore from an absolute URI
869 # This is required to be within the datastore.
870 pathInStore = srcUri.relative_to(self.root)
871 if pathInStore is None and transfer is None:
872 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
873 f"not within datastore {self.root}")
874 if pathInStore:
875 tgtLocation = self.locationFactory.fromPath(pathInStore)
876 elif transfer == "split":
877 # Outside the datastore but treat that as a direct ingest
878 # instead.
879 tgtLocation = None
880 else:
881 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for"
882 f" URI {srcUri}")
883 elif transfer == "direct": 883 ↛ 888line 883 didn't jump to line 888, because the condition on line 883 was never true
884 # Want to store the full URI to the resource directly in
885 # datastore. This is useful for referring to permanent archive
886 # storage for raw data.
887 # Trust that people know what they are doing.
888 tgtLocation = None
889 else:
890 # Work out the name we want this ingested file to have
891 # inside the datastore
892 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
893 if not tgtLocation.uri.dirname().exists():
894 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
895 tgtLocation.uri.dirname().mkdir()
897 # if we are transferring from a local file to a remote location
898 # it may be more efficient to get the size and checksum of the
899 # local file rather than the transferred one
900 if not srcUri.scheme or srcUri.scheme == "file":
901 size = srcUri.size()
902 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
903 have_sized = True
905 # Transfer the resource to the destination.
906 # Allow overwrite of an existing file. This matches the behavior
907 # of datastore.put() in that it trusts that registry would not
908 # be asking to overwrite unless registry thought that the
909 # overwrite was allowed.
910 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction,
911 overwrite=True)
913 if tgtLocation is None:
914 # This means we are using direct mode
915 targetUri = srcUri
916 targetPath = str(srcUri)
917 else:
918 targetUri = tgtLocation.uri
919 targetPath = tgtLocation.pathInStore.path
921 # the file should exist in the datastore now
922 if not have_sized:
923 size = targetUri.size()
924 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
926 return StoredFileInfo(formatter=formatter, path=targetPath,
927 storageClass=ref.datasetType.storageClass,
928 component=ref.datasetType.component(),
929 file_size=size, checksum=checksum)
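# Illustrative summary of the target-location logic above (sketch only):
#
#     transfer None or "split" -> file must already be inside the datastore
#                                 ("split" treats outside files as direct)
#     transfer "direct"        -> the full source URI is recorded, nothing moves
#     other transfer modes     -> the file is transferred to a name derived
#                                 from the file template via
#                                 _calculate_ingested_datastore_name()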
931 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
932 # Docstring inherited from Datastore._prepIngest.
933 filtered = []
934 for dataset in datasets:
935 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
936 if not acceptable:
937 continue
938 else:
939 dataset.refs = acceptable
940 if dataset.formatter is None:
941 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
942 else:
943 assert isinstance(dataset.formatter, (type, str))
944 dataset.formatter = getClassOf(dataset.formatter)
945 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
946 filtered.append(dataset)
947 return _IngestPrepData(filtered)
949 @transactional
950 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
951 # Docstring inherited from Datastore._finishIngest.
952 refsAndInfos = []
953 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
954 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
955 # Do ingest as if the first dataset ref is associated with the file
956 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
957 transfer=transfer)
958 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
959 self._register_datasets(refsAndInfos)
961 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
962 formatter: Union[Formatter, Type[Formatter]]) -> Location:
963 """Given a source URI and a DatasetRef, determine the name the
964 dataset will have inside datastore.
966 Parameters
967 ----------
968 srcUri : `ButlerURI`
969 URI to the source dataset file.
970 ref : `DatasetRef`
971 Ref associated with the newly-ingested dataset artifact. This
972 is used to determine the name within the datastore.
973 formatter : `Formatter` or Formatter class.
974 Formatter to use for validation. Can be a class or an instance.
976 Returns
977 -------
978 location : `Location`
979 Target location for the newly-ingested dataset.
980 """
981 # Ingesting a file from outside the datastore.
982 # This involves a new name.
983 template = self.templates.getTemplate(ref)
984 location = self.locationFactory.fromPath(template.format(ref))
986 # Get the extension
987 ext = srcUri.getExtension()
989 # Update the destination to include that extension
990 location.updateExtension(ext)
992 # Ask the formatter to validate this extension
993 formatter.validateExtension(location)
995 return location
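# Illustrative sketch (template expansion and file name are hypothetical): a
# source file "raw_data.fits" whose template expands to "raw/r/exp012" would
# be ingested as
#
#     location.pathInStore.path == "raw/r/exp012.fits"
#
# i.e. the source extension is appended before the formatter validates it.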
997 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
998 """Write out in memory dataset to datastore.
1000 Parameters
1001 ----------
1002 inMemoryDataset : `object`
1003 Dataset to write to datastore.
1004 ref : `DatasetRef`
1005 Registry information associated with this dataset.
1007 Returns
1008 -------
1009 info : `StoredFileInfo`
1010 Information describing the artifact written to the datastore.
1011 """
1012 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1013 uri = location.uri
1015 if not uri.dirname().exists():
1016 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1017 uri.dirname().mkdir()
1019 if self._transaction is None:
1020 raise RuntimeError("Attempting to write artifact without transaction enabled")
1022 def _removeFileExists(uri: ButlerURI) -> None:
1023 """Remove a file and do not complain if it is not there.
1025 This is important since a formatter might fail before the file
1026 is written and we should not confuse people by writing spurious
1027 error messages to the log.
1028 """
1029 try:
1030 uri.remove()
1031 except FileNotFoundError:
1032 pass
1034 # Register a callback to try to delete the uploaded data if
1035 # something fails below
1036 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1038 # For a local file, simply use the formatter directly
1039 if uri.isLocal:
1040 try:
1041 formatter.write(inMemoryDataset)
1042 except Exception as e:
1043 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} "
1044 f"to location {uri}") from e
1045 log.debug("Successfully wrote python object to local file at %s", uri)
1046 else:
1047 # This is a remote URI. Some datasets can be serialized directly
1048 # to bytes and sent to the remote datastore without writing a
1049 # file. If the dataset is intended to be saved to the cache
1050 # a file is always written and direct write to the remote
1051 # datastore is bypassed.
1052 data_written = False
1053 if not self.cacheManager.should_be_cached(ref):
1054 try:
1055 serializedDataset = formatter.toBytes(inMemoryDataset)
1056 except NotImplementedError:
1057 # Fallback to the file writing option.
1058 pass
1059 except Exception as e:
1060 raise RuntimeError(f"Failed to serialize dataset {ref} "
1061 f"of type {type(inMemoryDataset)} to bytes.") from e
1062 else:
1063 log.debug("Writing bytes directly to %s", uri)
1064 uri.write(serializedDataset, overwrite=True)
1065 log.debug("Successfully wrote bytes directly to %s", uri)
1066 data_written = True
1068 if not data_written:
1069 # Did not write the bytes directly to object store so instead
1070 # write to temporary file.
1071 with ButlerURI.temporary_uri(suffix=uri.getExtension()) as temporary_uri:
1072 # Need to configure the formatter to write to a different
1073 # location and that needs us to overwrite internals
1074 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1075 with formatter._updateLocation(Location(None, temporary_uri)):
1076 try:
1077 formatter.write(inMemoryDataset)
1078 except Exception as e:
1079 raise RuntimeError(f"Failed to serialize dataset {ref} of type"
1080 f" {type(inMemoryDataset)} to "
1081 f"temporary location {temporary_uri}") from e
1082 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True)
1084 # Cache if required
1085 self.cacheManager.move_to_cache(temporary_uri, ref)
1087 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1089 # URI is needed to resolve what ingest case are we dealing with
1090 return self._extractIngestInfo(uri, ref, formatter=formatter)
1092 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1093 ref: DatasetRef, isComponent: bool = False,
1094 cache_ref: Optional[DatasetRef] = None) -> Any:
1095 """Read the artifact from datastore into in memory object.
1097 Parameters
1098 ----------
1099 getInfo : `DatastoreFileGetInformation`
1100 Information about the artifact within the datastore.
1101 ref : `DatasetRef`
1102 The registry information associated with this artifact.
1103 isComponent : `bool`
1104 Flag to indicate if a component is being read from this artifact.
1105 cache_ref : `DatasetRef`, optional
1106 The DatasetRef to use when looking up the file in the cache.
1107 This ref must have the same ID as the supplied ref but can
1108 be a parent ref or component ref to indicate to the cache whether
1109 a composite file is being requested from the cache or a component
1110 file. Without this the cache will default to the supplied ref but
1111 it can get confused with read-only derived components for
1112 disassembled composites.
1114 Returns
1115 -------
1116 inMemoryDataset : `object`
1117 The artifact as a python object.
1118 """
1119 location = getInfo.location
1120 uri = location.uri
1121 log.debug("Accessing data from %s", uri)
1123 if cache_ref is None:
1124 cache_ref = ref
1125 if cache_ref.id != ref.id:
1126 raise ValueError("The supplied cache dataset ref refers to a different dataset than expected:"
1127 f" {ref.id} != {cache_ref.id}")
1129 # Cannot recalculate checksum but can compare size as a quick check
1130 # Do not do this if the size is negative since that indicates
1131 # we do not know.
1132 recorded_size = getInfo.info.file_size
1133 resource_size = uri.size()
1134 if recorded_size >= 0 and resource_size != recorded_size:
1135 raise RuntimeError("Integrity failure in Datastore. "
1136 f"Size of file {uri} ({resource_size}) "
1137 f"does not match size recorded in registry of {recorded_size}")
1139 # For the general case we have choices for how to proceed.
1140 # 1. Always use a local file (downloading the remote resource to a
1141 # temporary file if needed).
1142 # 2. Use a threshold size and read into memory and use bytes.
1143 # Use both for now with an arbitrary hand off size.
1144 # This allows small datasets to be downloaded from remote object
1145 # stores without requiring a temporary file.
1147 formatter = getInfo.formatter
1148 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1149 if resource_size <= nbytes_max and formatter.can_read_bytes():
1150 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1151 if cached_file is not None:
1152 desired_uri = cached_file
1153 msg = f" (cached version of {uri})"
1154 else:
1155 desired_uri = uri
1156 msg = ""
1157 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1158 serializedDataset = desired_uri.read()
1159 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1160 f"component {getInfo.component}" if isComponent else "",
1161 len(serializedDataset), uri, formatter.name())
1162 try:
1163 result = formatter.fromBytes(serializedDataset,
1164 component=getInfo.component if isComponent else None)
1165 except Exception as e:
1166 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1167 f" ({ref.datasetType.name} from {uri}): {e}") from e
1168 else:
1169 # Read from file.
1171 # Have to update the Location associated with the formatter
1172 # because formatter.read does not allow an override.
1173 # This could be improved.
1174 location_updated = False
1175 msg = ""
1177 # First check in cache for local version.
1178 # The cache will only be relevant for remote resources but
1179 # no harm in always asking. Context manager ensures that cache
1180 # file is not deleted during cache expiration.
1181 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1182 if cached_file is not None:
1183 msg = f"(via cache read of remote file {uri})"
1184 uri = cached_file
1185 location_updated = True
1187 with uri.as_local() as local_uri:
1189 can_be_cached = False
1190 if uri != local_uri:
1191 # URI was remote and file was downloaded
1192 cache_msg = ""
1193 location_updated = True
1195 if self.cacheManager.should_be_cached(cache_ref):
1196 # In this scenario we want to ask if the downloaded
1197 # file should be cached but we should not cache
1198 # it until after we've used it (to ensure it can't
1199 # be expired whilst we are using it).
1200 can_be_cached = True
1202 # Say that it is "likely" to be cached because
1203 # if the formatter read fails we will not be
1204 # caching this file.
1205 cache_msg = " and likely cached"
1207 msg = f"(via download to local file{cache_msg})"
1209 # Calculate the (possibly) new location for the formatter
1210 # to use.
1211 newLocation = Location(*local_uri.split()) if location_updated else None
1213 log.debug("Reading%s from location %s %s with formatter %s",
1214 f" component {getInfo.component}" if isComponent else "",
1215 uri, msg, formatter.name())
1216 try:
1217 with formatter._updateLocation(newLocation):
1218 with time_this(log, msg="Reading%s from location %s %s with formatter %s",
1219 args=(f" component {getInfo.component}" if isComponent else "",
1220 uri, msg, formatter.name())):
1221 result = formatter.read(component=getInfo.component if isComponent else None)
1222 except Exception as e:
1223 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1224 f" ({ref.datasetType.name} from {uri}): {e}") from e
1226 # File was read successfully so can move to cache
1227 if can_be_cached:
1228 self.cacheManager.move_to_cache(local_uri, cache_ref)
1230 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1231 isComponent=isComponent)
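# Illustrative sketch of the read-path decision above (threshold from the code):
#
#     use_bytes = resource_size <= 10_000_000 and formatter.can_read_bytes()
#     # True  -> read bytes (from cache if present) and use formatter.fromBytes()
#     # False -> obtain a local file (cache or download) and use formatter.read()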
1233 def knows(self, ref: DatasetRef) -> bool:
1234 """Check if the dataset is known to the datastore.
1236 Does not check for existence of any artifact.
1238 Parameters
1239 ----------
1240 ref : `DatasetRef`
1241 Reference to the required dataset.
1243 Returns
1244 -------
1245 exists : `bool`
1246 `True` if the dataset is known to the datastore.
1247 """
1248 fileLocations = self._get_dataset_locations_info(ref)
1249 if fileLocations:
1250 return True
1251 return False
1253 def _process_mexists_records(self, id_to_ref: Dict[DatasetId, DatasetRef],
1254 records: Dict[DatasetId, List[StoredFileInfo]],
1255 all_required: bool,
1256 artifact_existence: Optional[Dict[ButlerURI,
1257 bool]] = None) -> Dict[DatasetRef, bool]:
1258 """Helper function for mexists that checks the given records.
1260 Parameters
1261 ----------
1262 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1263 Mapping of the dataset ID to the dataset ref itself.
1264 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1265 Records as generally returned by
1266 ``_get_stored_records_associated_with_refs``.
1267 all_required : `bool`
1268 Flag to indicate whether existence requires all artifacts
1269 associated with a dataset ID to exist or not for existence.
1270 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional
1271 Mapping of datastore artifact to existence. Updated by this
1272 method with details of all artifacts tested. Can be `None`
1273 if the caller is not interested.
1275 Returns
1276 -------
1277 existence : `dict` of [`DatasetRef`, `bool`]
1278 Mapping from dataset to boolean indicating existence.
1279 """
1280 # The URIs to be checked and a mapping of those URIs to
1281 # the dataset ID.
1282 uris_to_check: List[ButlerURI] = []
1283 location_map: Dict[ButlerURI, DatasetId] = {}
1285 location_factory = self.locationFactory
1287 for ref_id, info in records.items():
1288 # Key is the dataset ID, value is a list of StoredFileInfo
1289 uris = [info.file_location(location_factory).uri for info in info]
1290 uris_to_check.extend(uris)
1291 location_map.update({uri: ref_id for uri in uris})
1293 uri_existence: Dict[ButlerURI, bool] = {}
1294 if artifact_existence is not None:
1295 # If a URI has already been checked remove it from the list
1296 # and immediately add the status to the output dict.
1297 filtered_uris_to_check = []
1298 for uri in uris_to_check:
1299 if uri in artifact_existence:
1300 uri_existence[uri] = artifact_existence[uri]
1301 else:
1302 filtered_uris_to_check.append(uri)
1303 uris_to_check = filtered_uris_to_check
1305 # Results.
1306 dataset_existence: Dict[DatasetRef, bool] = {}
1308 uri_existence.update(ButlerURI.mexists(uris_to_check))
1309 for uri, exists in uri_existence.items():
1310 dataset_id = location_map[uri]
1311 ref = id_to_ref[dataset_id]
1313 # Disassembled composite needs to check all locations.
1314 # all_required indicates whether all need to exist or not.
1315 if ref in dataset_existence:
1316 if all_required:
1317 exists = dataset_existence[ref] and exists
1318 else:
1319 exists = dataset_existence[ref] or exists
1320 dataset_existence[ref] = exists
1322 if artifact_existence is not None:
1323 artifact_existence.update(uri_existence)
1325 return dataset_existence
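# Illustrative sketch of the all_required flag (hypothetical artifacts A and B
# of a disassembled composite):
#
#     all_required=True   -> exists(A) and exists(B)  (registry-backed records)
#     all_required=False  -> exists(A) or exists(B)   (guessed/trusted records)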
1327 def mexists(self, refs: Iterable[DatasetRef],
1328 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> Dict[DatasetRef, bool]:
1329 """Check the existence of multiple datasets at once.
1331 Parameters
1332 ----------
1333 refs : iterable of `DatasetRef`
1334 The datasets to be checked.
1335 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional
1336 Mapping of datastore artifact to existence. Updated by this
1337 method with details of all artifacts tested. Can be `None`
1338 if the caller is not interested.
1340 Returns
1341 -------
1342 existence : `dict` of [`DatasetRef`, `bool`]
1343 Mapping from dataset to boolean indicating existence.
1344 """
1345 chunk_size = 10_000
1346 dataset_existence: Dict[DatasetRef, bool] = {}
1347 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d",
1348 chunk_size)
1349 n_found_total = 0
1350 n_checked = 0
1351 n_chunks = 0
1352 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1353 chunk_result = self._mexists(chunk, artifact_existence)
1354 if log.isEnabledFor(VERBOSE):
1355 n_results = len(chunk_result)
1356 n_checked += n_results
1357 # Can treat the booleans as 0, 1 integers and sum them.
1358 n_found = sum(chunk_result.values())
1359 n_found_total += n_found
1360 log.log(VERBOSE, "Number of datasets found in datastore for chunk %d = %d/%d"
1361 " (running total: %d/%d)",
1362 n_chunks, n_found, n_results, n_found_total, n_checked)
1363 dataset_existence.update(chunk_result)
1364 n_chunks += 1
1366 return dataset_existence
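# Illustrative usage (``datastore`` and ``refs`` are hypothetical): existence is
# resolved for many refs at once, in chunks of 10,000, e.g.
#
#     existence = datastore.mexists(refs)
#     missing = [ref for ref, found in existence.items() if not found]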
1368 def _mexists(self, refs: Iterable[DatasetRef],
1369 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> Dict[DatasetRef, bool]:
1370 """Check the existence of multiple datasets at once.
1372 Parameters
1373 ----------
1374 refs : iterable of `DatasetRef`
1375 The datasets to be checked.
1377 Returns
1378 -------
1379 existence : `dict` of [`DatasetRef`, `bool`]
1380 Mapping from dataset to boolean indicating existence.
1381 """
1382 # Need a mapping of dataset_id to dataset ref since the API
1383 # works with dataset_id
1384 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1386 # Set of all IDs we are checking for.
1387 requested_ids = set(id_to_ref.keys())
1389 # The records themselves. Could be missing some entries.
1390 records = self._get_stored_records_associated_with_refs(refs)
1392 dataset_existence = self._process_mexists_records(id_to_ref, records, True,
1393 artifact_existence=artifact_existence)
1395 # Set of IDs that have been handled.
1396 handled_ids = {ref.id for ref in dataset_existence.keys()}
1398 missing_ids = requested_ids - handled_ids
1399 if missing_ids:
1400 if not self.trustGetRequest:
1401 # Must assume these do not exist
1402 for missing in missing_ids:
1403 dataset_existence[id_to_ref[missing]] = False
1404 else:
1405 log.debug("%d out of %d datasets were not known to datastore during initial existence check.",
1406 len(missing_ids), len(requested_ids))
1408 # Construct data structure identical to that returned
1409 # by _get_stored_records_associated_with_refs() but using
1410 # guessed names.
1411 records = {}
1412 for missing in missing_ids:
1413 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1414 records[missing] = [info for _, info in expected]
1416 dataset_existence.update(self._process_mexists_records(id_to_ref, records, False,
1417 artifact_existence=artifact_existence))
1419 return dataset_existence
1421 def exists(self, ref: DatasetRef) -> bool:
1422 """Check if the dataset exists in the datastore.
1424 Parameters
1425 ----------
1426 ref : `DatasetRef`
1427 Reference to the required dataset.
1429 Returns
1430 -------
1431 exists : `bool`
1432 `True` if the entity exists in the `Datastore`.
1433 """
1434 fileLocations = self._get_dataset_locations_info(ref)
1436 # if we are being asked to trust that registry might not be correct
1437 # we ask for the expected locations and check them explicitly
1438 if not fileLocations:
1439 if not self.trustGetRequest:
1440 return False
1442 # When we are guessing a dataset location we can not check
1443 # for the existence of every component since we can not
1444 # know if every component was written. Instead we check
1445 # for the existence of any of the expected locations.
1446 for location, _ in self._get_expected_dataset_locations_info(ref):
1447 if self._artifact_exists(location):
1448 return True
1449 return False
1451 # All listed artifacts must exist.
1452 for location, _ in fileLocations:
1453 if not self._artifact_exists(location):
1454 return False
1456 return True
1458 def getURIs(self, ref: DatasetRef,
1459 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1460 """Return URIs associated with dataset.
1462 Parameters
1463 ----------
1464 ref : `DatasetRef`
1465 Reference to the required dataset.
1466 predict : `bool`, optional
1467 If the datastore does not know about the dataset, should it
1468 return a predicted URI or not?
1470 Returns
1471 -------
1472 primary : `ButlerURI`
1473 The URI to the primary artifact associated with this dataset.
1474 If the dataset was disassembled within the datastore this
1475 may be `None`.
1476 components : `dict`
1477 URIs to any components associated with the dataset artifact.
1478 Can be empty if there are no components.
1479 """
1481 primary: Optional[ButlerURI] = None
1482 components: Dict[str, ButlerURI] = {}
1484 # if this has never been written then we have to guess
1485 if not self.exists(ref):
1486 if not predict:
1487 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1489 doDisassembly = self.composites.shouldBeDisassembled(ref)
1491 if doDisassembly:
1493 for component, componentStorage in ref.datasetType.storageClass.components.items():
1494 compRef = ref.makeComponentRef(component)
1495 compLocation, _ = self._determine_put_formatter_location(compRef)
1497 # Add a URI fragment to indicate this is a guess
1498 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1500 else:
1502 location, _ = self._determine_put_formatter_location(ref)
1504 # Add a URI fragment to indicate this is a guess
1505 primary = ButlerURI(location.uri.geturl() + "#predicted")
1507 return primary, components
1509 # If this is a ref that we have written we can get the path.
1510 # Get file metadata and internal metadata
1511 fileLocations = self._get_dataset_locations_info(ref)
1513 guessing = False
1514 if not fileLocations:
1515 if not self.trustGetRequest:
1516 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1517 fileLocations = self._get_expected_dataset_locations_info(ref)
1518 guessing = True
1520 if len(fileLocations) == 1:
1521 # No disassembly so this is the primary URI
1522 uri = fileLocations[0][0].uri
1523 if guessing and not uri.exists(): 1523 ↛ 1524line 1523 didn't jump to line 1524, because the condition on line 1523 was never true
1524 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1525 primary = uri
1527 else:
1528 for location, storedFileInfo in fileLocations:
1529 if storedFileInfo.component is None: 1529 ↛ 1530line 1529 didn't jump to line 1530, because the condition on line 1529 was never true
1530 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1531 uri = location.uri
1532 if guessing and not uri.exists(): 1532 ↛ 1536line 1532 didn't jump to line 1536, because the condition on line 1532 was never true
1533 # If we are trusting then it is entirely possible for
1534 # some components to be missing. In that case we skip
1535 # to the next component.
1536 if self.trustGetRequest:
1537 continue
1538 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1539 components[storedFileInfo.component] = uri
1541 return primary, components
1543 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1544 """URI to the Dataset.
1546 Parameters
1547 ----------
1548 ref : `DatasetRef`
1549 Reference to the required Dataset.
1550 predict : `bool`
1551 If `True`, allow URIs to be returned of datasets that have not
1552 been written.
1554 Returns
1555 -------
1556 uri : `ButlerURI`
1557 URI pointing to the dataset within the datastore. If the
1558 dataset does not exist in the datastore, and if ``predict`` is
1559 `True`, the URI will be a prediction and will include a URI
1560 fragment "#predicted".
1561 If the datastore does not have entities that relate well
1562 to the concept of a URI, the returned URI will be descriptive
1563 and the resource it points to is not guaranteed to be accessible.
1565 Raises
1566 ------
1567 FileNotFoundError
1568 Raised if a URI has been requested for a dataset that does not
1569 exist and guessing is not allowed.
1570 RuntimeError
1571 Raised if a request is made for a single URI but multiple URIs
1572 are associated with this dataset.
1574 Notes
1575 -----
1576 When a predicted URI is requested an attempt will be made to form
1577 a reasonable URI based on file templates and the expected formatter.
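Examples
--------
A minimal sketch, assuming a resolved ``ref`` (names are
illustrative)::

    uri = datastore.getURI(ref, predict=True)
    if uri.geturl().endswith("#predicted"):
        print(f"{ref} has not been written yet")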
1578 """
1579 primary, components = self.getURIs(ref, predict)
1580 if primary is None or components: 1580 ↛ 1581line 1580 didn't jump to line 1581, because the condition on line 1580 was never true
1581 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1582 "Use Dataastore.getURIs() instead.")
1583 return primary
1585 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1586 destination: ButlerURI, transfer: str = "auto",
1587 preserve_path: bool = True,
1588 overwrite: bool = False) -> List[ButlerURI]:
1589 """Retrieve the file artifacts associated with the supplied refs.
1591 Parameters
1592 ----------
1593 refs : iterable of `DatasetRef`
1594 The datasets for which file artifacts are to be retrieved.
1595 A single ref can result in multiple files. The refs must
1596 be resolved.
1597 destination : `ButlerURI`
1598 Location to write the file artifacts.
1599 transfer : `str`, optional
1600 Method to use to transfer the artifacts. Must be one of the options
1601 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1602 preserve_path : `bool`, optional
1603 If `True` the full path of the file artifact within the datastore
1604 is preserved. If `False` the final file component of the path
1605 is used.
1606 overwrite : `bool`, optional
1607 If `True` allow transfers to overwrite existing files at the
1608 destination.
1610 Returns
1611 -------
1612 targets : `list` of `ButlerURI`
1613 URIs of file artifacts in destination location. Order is not
1614 preserved.
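Examples
--------
A minimal sketch, assuming ``refs`` is an iterable of resolved
`DatasetRef`; the destination path is illustrative::

    destination = ButlerURI("/tmp/artifact-export/", forceDirectory=True)
    transferred = datastore.retrieveArtifacts(refs, destination,
                                              transfer="copy",
                                              preserve_path=True)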
1615 """
1616 if not destination.isdir(): 1616 ↛ 1617line 1616 didn't jump to line 1617, because the condition on line 1616 was never true
1617 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1619 if transfer == "move":
1620 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1622 # Source -> Destination
1623 # This also helps filter out duplicate DatasetRef in the request
1624 # that will map to the same underlying file transfer.
1625 to_transfer: Dict[ButlerURI, ButlerURI] = {}
1627 for ref in refs:
1628 locations = self._get_dataset_locations_info(ref)
1629 for location, _ in locations:
1630 source_uri = location.uri
1631 target_path: Union[str, ButlerURI]
1632 if preserve_path:
1633 target_path = location.pathInStore
1634 if target_path.isabs(): 1634 ↛ 1637line 1634 didn't jump to line 1637, because the condition on line 1634 was never true
1635 # This is an absolute path to an external file.
1636 # Use the full path.
1637 target_path = target_path.relativeToPathRoot
1638 else:
1639 target_path = source_uri.basename()
1640 target_uri = destination.join(target_path)
1641 to_transfer[source_uri] = target_uri
1643 # In theory can now parallelize the transfer
1644 log.debug("Number of artifacts to transfer to %s: %d",
1645 str(destination), len(to_transfer))
1646 for source_uri, target_uri in to_transfer.items():
1647 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1649 return list(to_transfer.values())
1651 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1652 """Load an InMemoryDataset from the store.
1654 Parameters
1655 ----------
1656 ref : `DatasetRef`
1657 Reference to the required Dataset.
1658 parameters : `dict`
1659 `StorageClass`-specific parameters that specify, for example,
1660 a slice of the dataset to be loaded.
1662 Returns
1663 -------
1664 inMemoryDataset : `object`
1665 Requested dataset or slice thereof as an InMemoryDataset.
1667 Raises
1668 ------
1669 FileNotFoundError
1670 Requested dataset can not be retrieved.
1671 TypeError
1672 Return value from formatter has unexpected type.
1673 ValueError
1674 Formatter failed to process the dataset.
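Examples
--------
A minimal sketch; the ``parameters`` key shown is illustrative and
must be one supported by the dataset's `StorageClass`::

    inMemoryDataset = datastore.get(ref)
    subset = datastore.get(ref, parameters={"bbox": bbox})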
1675 """
1676 allGetInfo = self._prepare_for_get(ref, parameters)
1677 refComponent = ref.datasetType.component()
1679 # Supplied storage class for the component being read
1680 refStorageClass = ref.datasetType.storageClass
1682 # Create mapping from component name to related info
1683 allComponents = {i.component: i for i in allGetInfo}
1685 # By definition the dataset is disassembled if we have more
1686 # than one record for it.
1687 isDisassembled = len(allGetInfo) > 1
1689 # Look for the special case where we are disassembled but the
1690 # component is a derived component that was not written during
1691 # disassembly. For this scenario we need to check that the
1692 # component requested is listed as a derived component for the
1693 # composite storage class
1694 isDisassembledReadOnlyComponent = False
1695 if isDisassembled and refComponent:
1696 # The composite storage class should be accessible through
1697 # the component dataset type
1698 compositeStorageClass = ref.datasetType.parentStorageClass
1700 # In the unlikely scenario where the composite storage
1701 # class is not known, we can only assume that this is a
1702 # normal component. If that assumption is wrong then the
1703 # branch below that reads a persisted component will fail
1704 # so there is no need to complain here.
1705 if compositeStorageClass is not None: 1705 ↛ 1708line 1705 didn't jump to line 1708, because the condition on line 1705 was never false
1706 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1708 if isDisassembled and not refComponent:
1709 # This was a disassembled dataset spread over multiple files
1710 # and we need to put them all back together again.
1711 # Read into memory and then assemble
1713 # Check that the supplied parameters are suitable for the type read
1714 refStorageClass.validateParameters(parameters)
1716 # We want to keep track of all the parameters that were not used
1717 # by formatters. We assume that if any of the component formatters
1718 # use a parameter then we do not need to apply it again in the
1719 # assembler.
1720 usedParams = set()
1722 components: Dict[str, Any] = {}
1723 for getInfo in allGetInfo:
1724 # assemblerParams are parameters not understood by the
1725 # associated formatter.
1726 usedParams.update(set(getInfo.formatterParams))
1728 component = getInfo.component
1730 if component is None: 1730 ↛ 1731line 1730 didn't jump to line 1731, because the condition on line 1730 was never true
1731 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1733 # We do not want the formatter to think it's reading
1734 # a component though because it is really reading a
1735 # standalone dataset -- always tell the reader it is not a
1736 # component.
1737 components[component] = self._read_artifact_into_memory(getInfo,
1738 ref.makeComponentRef(component),
1739 isComponent=False)
1741 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1743 # Any unused parameters will have to be passed to the assembler
1744 if parameters:
1745 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1746 else:
1747 unusedParams = {}
1749 # Process parameters
1750 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1751 parameters=unusedParams)
1753 elif isDisassembledReadOnlyComponent:
1755 compositeStorageClass = ref.datasetType.parentStorageClass
1756 if compositeStorageClass is None: 1756 ↛ 1757line 1756 didn't jump to line 1757, because the condition on line 1756 was never true
1757 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1758 "no composite storage class is available.")
1760 if refComponent is None: 1760 ↛ 1762line 1760 didn't jump to line 1762, because the condition on line 1760 was never true
1761 # Mainly for mypy
1762 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1764 # Assume that every derived component can be calculated by
1765 # forwarding the request to a single read/write component.
1766 # Rather than guessing which rw component is the right one by
1767 # scanning each for a derived component of the same name,
1768 # we ask the storage class delegate directly which one is best to
1769 # use.
1770 compositeDelegate = compositeStorageClass.delegate()
1771 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1772 set(allComponents))
1774 # Select the relevant component
1775 rwInfo = allComponents[forwardedComponent]
1777 # For now assume that read parameters are validated against
1778 # the real component and not the requested component
1779 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1780 forwardedStorageClass.validateParameters(parameters)
1782 # The reference to use for the caching must refer to the forwarded
1783 # component and not the derived component.
1784 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
1786 # Unfortunately the FileDescriptor inside the formatter will have
1787 # the wrong write storage class so we need to create a new one
1788 # given the immutability constraint.
1789 writeStorageClass = rwInfo.info.storageClass
1791 # We may need to put some thought into parameters for read
1792 # components but for now forward them on as is
1793 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1794 readStorageClass=refStorageClass,
1795 storageClass=writeStorageClass,
1796 parameters=parameters),
1797 ref.dataId)
1799 # The assembler can not receive any parameter requests for a
1800 # derived component at this time since the assembler will
1801 # see the storage class of the derived component and those
1802 # parameters will have to be handled by the formatter on the
1803 # forwarded storage class.
1804 assemblerParams: Dict[str, Any] = {}
1806 # Need to create a new info that specifies the derived
1807 # component and associated storage class
1808 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1809 rwInfo.info, assemblerParams, {},
1810 refComponent, refStorageClass)
1812 return self._read_artifact_into_memory(readInfo, ref, isComponent=True,
1813 cache_ref=cache_ref)
1815 else:
1816 # Single file request or component from that composite file
1817 for lookup in (refComponent, None): 1817 ↛ 1822line 1817 didn't jump to line 1822, because the loop on line 1817 didn't complete
1818 if lookup in allComponents: 1818 ↛ 1817line 1818 didn't jump to line 1817, because the condition on line 1818 was never false
1819 getInfo = allComponents[lookup]
1820 break
1821 else:
1822 raise FileNotFoundError(f"Component {refComponent} not found "
1823 f"for ref {ref} in datastore {self.name}")
1825 # Do not need the component itself if already disassembled
1826 if isDisassembled:
1827 isComponent = False
1828 else:
1829 isComponent = getInfo.component is not None
1831 # For a component read of a composite we want the cache to
1832 # be looking at the composite ref itself.
1833 cache_ref = ref.makeCompositeRef() if isComponent else ref
1835 # For a disassembled component we can validate parameters against
1836 # the component storage class directly
1837 if isDisassembled:
1838 refStorageClass.validateParameters(parameters)
1839 else:
1840 # For an assembled composite this could be a derived
1841 # component derived from a real component. The validity
1842 # of the parameters is not clear. For now validate against
1843 # the composite storage class
1844 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1846 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent,
1847 cache_ref=cache_ref)
1849 @transactional
1850 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1851 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1853 Parameters
1854 ----------
1855 inMemoryDataset : `object`
1856 The dataset to store.
1857 ref : `DatasetRef`
1858 Reference to the associated Dataset.
1860 Raises
1861 ------
1862 TypeError
1863 Supplied object and storage class are inconsistent.
1864 DatasetTypeNotSupportedError
1865 The associated `DatasetType` is not handled by this datastore.
1867 Notes
1868 -----
1869 If the datastore is configured to reject certain dataset types it
1870 is possible that the put will fail and raise a
1871 `DatasetTypeNotSupportedError`. The main use case for this is to
1872 allow `ChainedDatastore` to put to multiple datastores without
1873 requiring that every datastore accepts the dataset.
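Examples
--------
A minimal sketch, assuming ``inMemoryDataset`` is compatible with the
storage class of the resolved ``ref`` (names are illustrative)::

    try:
        datastore.put(inMemoryDataset, ref)
    except DatasetTypeNotSupportedError:
        # This datastore's constraints reject the dataset type.
        pass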
1874 """
1876 doDisassembly = self.composites.shouldBeDisassembled(ref)
1877 # doDisassembly = True
1879 artifacts = []
1880 if doDisassembly:
1881 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1882 for component, componentInfo in components.items():
1883 # Don't recurse because we want to take advantage of
1884 # bulk insert -- we need a new DatasetRef that refers to the
1885 # same dataset_id but has the component DatasetType.
1886 # DatasetType does not refer to the types of components,
1887 # so we construct one ourselves.
1888 compRef = ref.makeComponentRef(component)
1889 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1890 artifacts.append((compRef, storedInfo))
1891 else:
1892 # Write the entire thing out
1893 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1894 artifacts.append((ref, storedInfo))
1896 self._register_datasets(artifacts)
1898 @transactional
1899 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
1900 # At this point we can safely remove these datasets from the cache
1901 # to avoid confusion later on. If they are not trashed later
1902 # the cache will simply be refilled.
1903 self.cacheManager.remove_from_cache(ref)
1905 # If we are in trust mode there will be nothing to move to
1906 # the trash table and we will have to try to delete the file
1907 # immediately.
1908 if self.trustGetRequest:
1909 # Try to keep the logic below for a single file trash.
1910 if isinstance(ref, DatasetRef):
1911 refs = {ref}
1912 else:
1913 # Will recreate ref at the end of this branch.
1914 refs = set(ref)
1916 # Determine which datasets are known to datastore directly.
1917 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1918 existing_ids = self._get_stored_records_associated_with_refs(refs)
1919 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
1921 missing = refs - existing_refs
1922 if missing:
1923 # Do an explicit existence check on these refs.
1924 # We only care about the artifacts at this point and not
1925 # the dataset existence.
1926 artifact_existence: Dict[ButlerURI, bool] = {}
1927 _ = self.mexists(missing, artifact_existence)
1928 uris = [uri for uri, exists in artifact_existence.items() if exists]
1930 # FUTURE UPGRADE: Implement a parallelized bulk remove.
1931 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
1932 for uri in uris:
1933 try:
1934 uri.remove()
1935 except Exception as e:
1936 if ignore_errors:
1937 log.debug("Artifact %s could not be removed: %s", uri, e)
1938 continue
1939 raise
1941 # There is no point asking the code below to remove refs we
1942 # know are missing, so update the variable with the list of
1943 # existing refs. Try to retain the one-vs-many logic.
1944 if not existing_refs:
1945 # Nothing more to do since none of the datasets were
1946 # known to the datastore record table.
1947 return
1948 ref = list(existing_refs)
1949 if len(ref) == 1:
1950 ref = ref[0]
1952 # Get file metadata and internal metadata
1953 if not isinstance(ref, DatasetRef):
1954 log.debug("Doing multi-dataset trash in datastore %s", self.name)
1955 # Assumed to be an iterable of refs so bulk mode enabled.
1956 try:
1957 self.bridge.moveToTrash(ref)
1958 except Exception as e:
1959 if ignore_errors:
1960 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
1961 else:
1962 raise
1963 return
1965 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
1967 fileLocations = self._get_dataset_locations_info(ref)
1969 if not fileLocations:
1970 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1971 if ignore_errors:
1972 log.warning(err_msg)
1973 return
1974 else:
1975 raise FileNotFoundError(err_msg)
1977 for location, storedFileInfo in fileLocations:
1978 if not self._artifact_exists(location): 1978 ↛ 1979line 1978 didn't jump to line 1979, because the condition on line 1978 was never true
1979 err_msg = f"Dataset is known to datastore {self.name} but " \
1980 f"associated artifact ({location.uri}) is missing"
1981 if ignore_errors:
1982 log.warning(err_msg)
1983 return
1984 else:
1985 raise FileNotFoundError(err_msg)
1987 # Mark dataset as trashed
1988 try:
1989 self.bridge.moveToTrash([ref])
1990 except Exception as e:
1991 if ignore_errors:
1992 log.warning("Attempted to mark dataset (%s) to be trashed in datastore %s "
1993 "but encountered an error: %s", ref, self.name, e)
1994 pass
1995 else:
1996 raise
1998 @transactional
1999 def emptyTrash(self, ignore_errors: bool = True) -> None:
2000 """Remove all datasets from the trash.
2002 Parameters
2003 ----------
2004 ignore_errors : `bool`
2005 If `True` return without error even if something went wrong.
2006 Problems could occur if another process is simultaneously trying
2007 to delete.
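Examples
--------
A minimal sketch of the two-step removal flow (names are
illustrative)::

    datastore.trash(refs_to_remove)
    datastore.emptyTrash(ignore_errors=True)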
2008 """
2009 log.debug("Emptying trash in datastore %s", self.name)
2011 # Context manager will empty trash iff we finish it without raising.
2012 # It will also automatically delete the relevant rows from the
2013 # trash table and the records table.
2014 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo,
2015 record_column="path") as trash_data:
2016 # Removing the artifacts themselves requires that the files are
2017 # not also associated with refs that are not to be trashed.
2018 # Therefore we need to do a query with the file paths themselves
2019 # and return all the refs associated with them. Can only delete
2020 # a file if the refs to be trashed are the only refs associated
2021 # with the file.
2022 # This requires multiple copies of the trashed items
2023 trashed, artifacts_to_keep = trash_data
2025 if artifacts_to_keep is None:
2026 # The bridge is not helping us so have to work it out
2027 # ourselves. This is not going to be as efficient.
2028 trashed = list(trashed)
2030 # The instance check is for mypy since up to this point it
2031 # does not know the type of info.
2032 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed
2033 if isinstance(info, StoredFileInfo)])
2035 for ref, info in trashed:
2037 # Mypy needs to know this is not the base class
2038 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2040 # Check for mypy
2041 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2043 path_map[info.path].remove(ref.id)
2044 if not path_map[info.path]: 2044 ↛ 2035line 2044 didn't jump to line 2035, because the condition on line 2044 was never false
2045 del path_map[info.path]
2047 artifacts_to_keep = set(path_map)
2049 for ref, info in trashed:
2051 # Should not happen for this implementation but need
2052 # to keep mypy happy.
2053 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2055 # Mypy needs to know this is not the base class
2056 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2058 # Check for mypy
2059 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2061 if info.path in artifacts_to_keep:
2062 # This is a multi-dataset artifact and we are not
2063 # removing all associated refs.
2064 continue
2066 # Only trashed refs still known to datastore will be returned.
2067 location = info.file_location(self.locationFactory)
2069 # Point of no return for this artifact
2070 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2071 try:
2072 self._delete_artifact(location)
2073 except FileNotFoundError:
2074 # If the file itself has been deleted there is nothing
2075 # we can do about it. It is possible that trash has
2076 # been run in parallel in another process or someone
2077 # decided to delete the file. It is unlikely to come
2078 # back and so we should still continue with the removal
2079 # of the entry from the trash table. It is also possible
2080 # we removed it in a previous iteration if it was
2081 # a multi-dataset artifact. The delete artifact method
2082 # will log a debug message in this scenario.
2083 # Distinguishing a file that was missing before the trash
2084 # started from one already removed earlier in this trash
2085 # operation is not worth the potential memory cost of
2086 # making that distinction.
2087 pass
2088 except Exception as e:
2089 if ignore_errors:
2090 # Use a debug message here even though it's not
2091 # a good situation. In some cases this can be
2092 # caused by a race between user A and user B
2093 # and neither of them has permissions for the
2094 # other's files. Butler does not know about users
2095 # and trash has no idea what collections these
2096 # files were in (without guessing from a path).
2097 log.debug("Encountered error removing artifact %s from datastore %s: %s",
2098 location.uri, self.name, e)
2099 else:
2100 raise
2102 @transactional
2103 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef],
2104 local_refs: Optional[Iterable[DatasetRef]] = None,
2105 transfer: str = "auto",
2106 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> None:
2107 # Docstring inherited
2108 if type(self) is not type(source_datastore):
2109 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the "
2110 f"source datastore ({type(source_datastore)}).")
2112 # Be explicit for mypy
2113 if not isinstance(source_datastore, FileDatastore): 2113 ↛ 2114line 2113 didn't jump to line 2114, because the condition on line 2113 was never true
2114 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not"
2115 f" {type(source_datastore)}")
2117 # Stop early if "direct" transfer mode is requested. That would
2118 # require that the URI inside the source datastore should be stored
2119 # directly in the target datastore, which seems unlikely to be useful
2120 # since at any moment the source datastore could delete the file.
2121 if transfer in ("direct", "split"):
2122 raise ValueError(f"Can not transfer from a source datastore using {transfer} mode since"
2123 " those files are controlled by the other datastore.")
2125 # Empty existence lookup if none given.
2126 if artifact_existence is None:
2127 artifact_existence = {}
2129 # We will go through the list multiple times so must convert
2130 # generators to lists.
2131 refs = list(refs)
2133 if local_refs is None:
2134 local_refs = refs
2135 else:
2136 local_refs = list(local_refs)
2138 # In order to handle disassembled composites the code works
2139 # at the records level since it can assume that internal APIs
2140 # can be used.
2141 # - If the record already exists in the destination this is assumed
2142 # to be okay.
2143 # - If there is no record but the source and destination URIs are
2144 # identical no transfer is done but the record is added.
2145 # - If the source record refers to an absolute URI currently assume
2146 # that that URI should remain absolute and will be visible to the
2147 # destination butler. May need to have a flag to indicate whether
2148 # the dataset should be transferred. This will only happen if
2149 # the detached Butler has had a local ingest.
2151 # What we really want is all the records in the source datastore
2152 # associated with these refs. Or derived ones if they don't exist
2153 # in the source.
2154 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2156 # The source dataset_ids are the keys in these records
2157 source_ids = set(source_records)
2158 log.debug("Number of datastore records found in source: %d", len(source_ids))
2160 # The not None check is to appease mypy
2161 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2162 missing_ids = requested_ids - source_ids
2164 # Missing IDs can be okay if the source datastore has allowed
2165 # gets based on file existence. Should we transfer what we can
2166 # or complain about it and warn?
2167 if missing_ids and not source_datastore.trustGetRequest: 2167 ↛ 2168line 2167 didn't jump to line 2168, because the condition on line 2167 was never true
2168 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:"
2169 f" {missing_ids}")
2171 # Need to map these missing IDs to a DatasetRef so we can guess
2172 # the details.
2173 if missing_ids:
2174 log.info("Number of expected datasets missing from source datastore records: %d out of %d",
2175 len(missing_ids), len(requested_ids))
2176 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2178 # This should be chunked in case we end up having to check
2179 # the file store since we need some log output to show
2180 # progress.
2181 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2182 records = {}
2183 for missing in missing_ids_chunk:
2184 # Ask the source datastore where the missing artifacts
2185 # should be. An execution butler might not know about the
2186 # artifacts even if they are there.
2187 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2188 records[missing] = [info for _, info in expected]
2190 # Call the mexists helper method in case we have not already
2191 # checked these artifacts such that artifact_existence is
2192 # empty. This allows us to benefit from parallelism.
2193 # datastore.mexists() itself does not give us access to the
2194 # derived datastore record.
2195 log.log(VERBOSE, "Checking existence of %d datasets unknown to datastore",
2196 len(records))
2197 ref_exists = source_datastore._process_mexists_records(id_to_ref, records, False,
2198 artifact_existence=artifact_existence)
2200 # Now go through the records and propagate the ones that exist.
2201 location_factory = source_datastore.locationFactory
2202 for missing, record_list in records.items():
2203 # Skip completely if the ref does not exist.
2204 ref = id_to_ref[missing]
2205 if not ref_exists[ref]:
2206 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.",
2207 ref)
2208 continue
2209 # Check for file artifact to decide which parts of a
2210 # disassembled composite do exist. If there is only a
2211 # single record we don't even need to look because it can't
2212 # be a composite and must exist.
2213 if len(record_list) == 1:
2214 dataset_records = record_list
2215 else:
2216 dataset_records = [record for record in record_list
2217 if artifact_existence[record.file_location(location_factory).uri]]
2218 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2220 # Rely on source_records being a defaultdict.
2221 source_records[missing].extend(dataset_records)
2223 # See if we already have these records
2224 target_records = self._get_stored_records_associated_with_refs(local_refs)
2226 # The artifacts to register
2227 artifacts = []
2229 # Refs that already exist
2230 already_present = []
2232 # Now can transfer the artifacts
2233 for source_ref, target_ref in zip(refs, local_refs):
2234 if target_ref.id in target_records:
2235 # Already have an artifact for this.
2236 already_present.append(target_ref)
2237 continue
2239 # mypy needs to know these are always resolved refs
2240 for info in source_records[source_ref.getCheckedId()]:
2241 source_location = info.file_location(source_datastore.locationFactory)
2242 target_location = info.file_location(self.locationFactory)
2243 if source_location == target_location: 2243 ↛ 2247line 2243 didn't jump to line 2247, because the condition on line 2243 was never true
2244 # Either the dataset is already in the target datastore
2245 # (which is how execution butler currently runs) or
2246 # it is an absolute URI.
2247 if source_location.pathInStore.isabs():
2248 # Just because we can see the artifact when running
2249 # the transfer doesn't mean it will be generally
2250 # accessible to a user of this butler. For now warn
2251 # but assume it will be accessible.
2252 log.warning("Transfer request for an outside-datastore artifact has been found at %s",
2253 source_location)
2254 else:
2255 # Need to transfer it to the new location.
2256 # Assume we should always overwrite. If the artifact
2257 # is there this might indicate that a previous transfer
2258 # was interrupted but was not able to be rolled back
2259 # completely (e.g. pre-emption) so follow the Datastore default
2260 # and overwrite.
2261 target_location.uri.transfer_from(source_location.uri, transfer=transfer,
2262 overwrite=True, transaction=self._transaction)
2264 artifacts.append((target_ref, info))
2266 self._register_datasets(artifacts)
2268 if already_present:
2269 n_skipped = len(already_present)
2270 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped,
2271 "" if n_skipped == 1 else "s")
2273 @transactional
2274 def forget(self, refs: Iterable[DatasetRef]) -> None:
2275 # Docstring inherited.
2276 refs = list(refs)
2277 self.bridge.forget(refs)
2278 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2280 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
2281 logFailures: bool = False) -> None:
2282 """Validate some of the configuration for this datastore.
2284 Parameters
2285 ----------
2286 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2287 Entities to test against this configuration. Can be differing
2288 types.
2289 logFailures : `bool`, optional
2290 If `True`, output a log message for every validation error
2291 detected.
2293 Raises
2294 ------
2295 DatastoreValidationError
2296 Raised if there is a validation problem with a configuration.
2297 All the problems are reported in a single exception.
2299 Notes
2300 -----
2301 This method checks that all the supplied entities have valid file
2302 templates and also have formatters defined.
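Examples
--------
A minimal sketch, assuming ``dataset_types`` is an iterable of
`DatasetType` (names are illustrative)::

    try:
        datastore.validateConfiguration(dataset_types, logFailures=True)
    except DatastoreValidationError as e:
        print(f"Configuration problems: {e}")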
2303 """
2305 templateFailed = None
2306 try:
2307 self.templates.validateTemplates(entities, logFailures=logFailures)
2308 except FileTemplateValidationError as e:
2309 templateFailed = str(e)
2311 formatterFailed = []
2312 for entity in entities:
2313 try:
2314 self.formatterFactory.getFormatterClass(entity)
2315 except KeyError as e:
2316 formatterFailed.append(str(e))
2317 if logFailures: 2317 ↛ 2312line 2317 didn't jump to line 2312, because the condition on line 2317 was never false
2318 log.critical("Formatter failure: %s", e)
2320 if templateFailed or formatterFailed:
2321 messages = []
2322 if templateFailed: 2322 ↛ 2323line 2322 didn't jump to line 2323, because the condition on line 2322 was never true
2323 messages.append(templateFailed)
2324 if formatterFailed: 2324 ↛ 2326line 2324 didn't jump to line 2326, because the condition on line 2324 was never false
2325 messages.append(",".join(formatterFailed))
2326 msg = ";\n".join(messages)
2327 raise DatastoreValidationError(msg)
2329 def getLookupKeys(self) -> Set[LookupKey]:
2330 # Docstring is inherited from base class
2331 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
2332 self.constraints.getLookupKeys()
2334 def validateKey(self, lookupKey: LookupKey,
2335 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2336 # Docstring is inherited from base class
2337 # The key can be valid in either formatters or templates so we can
2338 # only check the template if it exists
2339 if lookupKey in self.templates:
2340 try:
2341 self.templates[lookupKey].validateTemplate(entity)
2342 except FileTemplateValidationError as e:
2343 raise DatastoreValidationError(e) from e
2345 def export(self, refs: Iterable[DatasetRef], *,
2346 directory: Optional[Union[ButlerURI, str]] = None,
2347 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
2348 # Docstring inherited from Datastore.export.
2349 if transfer is not None and directory is None: 2349 ↛ 2350line 2349 didn't jump to line 2350, because the condition on line 2349 was never true
2350 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
2351 "export directory given")
2353 # Force the directory to be a URI object
2354 directoryUri: Optional[ButlerURI] = None
2355 if directory is not None: 2355 ↛ 2358line 2355 didn't jump to line 2358, because the condition on line 2355 was never false
2356 directoryUri = ButlerURI(directory, forceDirectory=True)
2358 if transfer is not None and directoryUri is not None: 2358 ↛ 2363line 2358 didn't jump to line 2363, because the condition on line 2358 was never false
2359 # mypy needs the second test
2360 if not directoryUri.exists(): 2360 ↛ 2361line 2360 didn't jump to line 2361, because the condition on line 2360 was never true
2361 raise FileNotFoundError(f"Export location {directory} does not exist")
2363 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2364 for ref in progress.wrap(refs, "Exporting dataset files"):
2365 fileLocations = self._get_dataset_locations_info(ref)
2366 if not fileLocations: 2366 ↛ 2367line 2366 didn't jump to line 2367, because the condition on line 2366 was never true
2367 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2368 # For now we can not export disassembled datasets
2369 if len(fileLocations) > 1:
2370 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2371 location, storedFileInfo = fileLocations[0]
2373 pathInStore = location.pathInStore.path
2374 if transfer is None: 2374 ↛ 2378line 2374 didn't jump to line 2378, because the condition on line 2374 was never true
2375 # TODO: do we also need to return the readStorageClass somehow?
2376 # We will use the path in store directly. If this is an
2377 # absolute URI, preserve it.
2378 if location.pathInStore.isabs():
2379 pathInStore = str(location.uri)
2380 elif transfer == "direct": 2380 ↛ 2382line 2380 didn't jump to line 2382, because the condition on line 2380 was never true
2381 # Use full URIs to the remote store in the export
2382 pathInStore = str(location.uri)
2383 else:
2384 # mypy needs help
2385 assert directoryUri is not None, "directoryUri must be defined to get here"
2386 storeUri = ButlerURI(location.uri)
2388 # if the datastore has an absolute URI to a resource, we
2389 # have two options:
2390 # 1. Keep the absolute URI in the exported YAML
2391 # 2. Allocate a new name in the local datastore and transfer
2392 # it.
2393 # For now go with option 2
2394 if location.pathInStore.isabs(): 2394 ↛ 2395line 2394 didn't jump to line 2395, because the condition on line 2394 was never true
2395 template = self.templates.getTemplate(ref)
2396 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
2397 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2399 exportUri = directoryUri.join(pathInStore)
2400 exportUri.transfer_from(storeUri, transfer=transfer)
2402 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2404 @staticmethod
2405 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
2406 """Compute the checksum of the supplied file.
2408 Parameters
2409 ----------
2410 uri : `ButlerURI`
2411 Name of resource to calculate checksum from.
2412 algorithm : `str`, optional
2413 Name of algorithm to use. Must be one of the algorithms supported
2414 by :py:mod:`hashlib`.
2415 block_size : `int`
2416 Number of bytes to read from file at one time.
2418 Returns
2419 -------
2420 hexdigest : `str`
2421 Hex digest of the file.
2423 Notes
2424 -----
2425 Currently returns None if the URI is for a remote resource.
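Examples
--------
A minimal sketch with an illustrative local file path::

    checksum = FileDatastore.computeChecksum(ButlerURI("/tmp/data.fits"),
                                             algorithm="blake2b")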
2426 """
2427 if algorithm not in hashlib.algorithms_guaranteed: 2427 ↛ 2428line 2427 didn't jump to line 2428, because the condition on line 2427 was never true
2428 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2430 if not uri.isLocal: 2430 ↛ 2431line 2430 didn't jump to line 2431, because the condition on line 2430 was never true
2431 return None
2433 hasher = hashlib.new(algorithm)
2435 with uri.as_local() as local_uri:
2436 with open(local_uri.ospath, "rb") as f:
2437 for chunk in iter(lambda: f.read(block_size), b""):
2438 hasher.update(chunk)
2440 return hasher.hexdigest()
2442 def needs_expanded_data_ids(
2443 self,
2444 transfer: Optional[str],
2445 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2446 ) -> bool:
2447 # Docstring inherited.
2448 # This _could_ also use entity to inspect whether the filename template
2449 # involves placeholders other than the required dimensions for its
2450 # dataset type, but that's not necessary for correctness; it just
2451 # enables more optimizations (perhaps only in theory).
2452 return transfer not in ("direct", None)