Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 83%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore", )
27import hashlib
28import logging
30from sqlalchemy import BigInteger, String
32from collections import defaultdict
33from dataclasses import dataclass
34from typing import (
35 TYPE_CHECKING,
36 Any,
37 ClassVar,
38 Dict,
39 Iterable,
40 List,
41 Mapping,
42 Optional,
43 Set,
44 Tuple,
45 Type,
46 Union,
47)
49from lsst.utils.iteration import chunk_iterable
50from lsst.utils.introspection import get_class_of, get_instance_of
51from lsst.utils.timer import time_this
53# For VERBOSE logging usage.
54from lsst.utils.logging import getLogger, VERBOSE
56from lsst.daf.butler import (
57 ButlerURI,
58 CompositesMap,
59 Config,
60 FileDataset,
61 DatasetId,
62 DatasetRef,
63 DatasetType,
64 DatasetTypeNotSupportedError,
65 Datastore,
66 DatastoreCacheManager,
67 DatastoreDisabledCacheManager,
68 DatastoreConfig,
69 DatastoreValidationError,
70 FileDescriptor,
71 FileTemplates,
72 FileTemplateValidationError,
73 Formatter,
74 FormatterFactory,
75 Location,
76 LocationFactory,
77 Progress,
78 StorageClass,
79 StoredFileInfo,
80)
82from lsst.daf.butler import ddl
83from lsst.daf.butler.registry.interfaces import (
84 ReadOnlyDatabaseError,
85 DatastoreRegistryBridge,
86)
88from lsst.daf.butler.core.repoRelocation import replaceRoot
89from lsst.daf.butler.core.utils import transactional
90from .genericDatastore import GenericBaseDatastore
92if TYPE_CHECKING:
93 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager
94 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
96log = getLogger(__name__)
99class _IngestPrepData(Datastore.IngestPrepData):
100 """Helper class for FileDatastore ingest implementation.
102 Parameters
103 ----------
104 datasets : `list` of `FileDataset`
105 Files to be ingested by this datastore.
106 """
107 def __init__(self, datasets: List[FileDataset]):
108 super().__init__(ref for dataset in datasets for ref in dataset.refs)
109 self.datasets = datasets
112@dataclass(frozen=True)
113class DatastoreFileGetInformation:
114 """Collection of useful parameters needed to retrieve a file from
115 a Datastore.
116 """
118 location: Location
119 """The location from which to read the dataset."""
121 formatter: Formatter
122 """The `Formatter` to use to deserialize the dataset."""
124 info: StoredFileInfo
125 """Stored information about this file and its formatter."""
127 assemblerParams: Dict[str, Any]
128 """Parameters to use for post-processing the retrieved dataset."""
130 formatterParams: Dict[str, Any]
131 """Parameters that were understood by the associated formatter."""
133 component: Optional[str]
134 """The component to be retrieved (can be `None`)."""
136 readStorageClass: StorageClass
137 """The `StorageClass` of the dataset being read."""
140class FileDatastore(GenericBaseDatastore):
141 """Generic Datastore for file-based implementations.
143 Should always be sub-classed since key abstract methods are missing.
145 Parameters
146 ----------
147 config : `DatastoreConfig` or `str`
148 Configuration as either a `Config` object or URI to file.
149 bridgeManager : `DatastoreRegistryBridgeManager`
150 Object that manages the interface between `Registry` and datastores.
151 butlerRoot : `str`, optional
152 New datastore root to use to override the configuration value.
154 Raises
155 ------
156 ValueError
157 If root location does not exist and ``create`` is `False` in the
158 configuration.
159 """
161 defaultConfigFile: ClassVar[Optional[str]] = None
162 """Path to configuration defaults. Accessed within the ``config`` resource
163 or relative to a search path. Can be None if no defaults specified.
164 """
166 root: ButlerURI
167 """Root directory URI of this `Datastore`."""
169 locationFactory: LocationFactory
170 """Factory for creating locations relative to the datastore root."""
172 formatterFactory: FormatterFactory
173 """Factory for creating instances of formatters."""
175 templates: FileTemplates
176 """File templates that can be used by this `Datastore`."""
178 composites: CompositesMap
179 """Determines whether a dataset should be disassembled on put."""
181 defaultConfigFile = "datastores/fileDatastore.yaml"
182 """Path to configuration defaults. Accessed within the ``config`` resource
183 or relative to a search path. Can be None if no defaults specified.
184 """
186 @classmethod
187 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
188 """Set any filesystem-dependent config options for this Datastore to
189 be appropriate for a new empty repository with the given root.
191 Parameters
192 ----------
193 root : `str`
194 URI to the root of the data repository.
195 config : `Config`
196 A `Config` to update. Only the subset understood by
197 this component will be updated. Will not expand
198 defaults.
199 full : `Config`
200 A complete config with all defaults expanded that can be
201 converted to a `DatastoreConfig`. Read-only and will not be
202 modified by this method.
203 Repository-specific options that should not be obtained
204 from defaults when Butler instances are constructed
205 should be copied from ``full`` to ``config``.
206 overwrite : `bool`, optional
207 If `False`, do not modify a value in ``config`` if the value
208 already exists. Default is always to overwrite with the provided
209 ``root``.
211 Notes
212 -----
213 If a keyword is explicitly defined in the supplied ``config`` it
214 will not be overridden by this method if ``overwrite`` is `False`.
215 This allows explicit values set in external configs to be retained.
216 """
217 Config.updateParameters(DatastoreConfig, config, full,
218 toUpdate={"root": root},
219 toCopy=("cls", ("records", "table")), overwrite=overwrite)
221 @classmethod
222 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
223 return ddl.TableSpec(
224 fields=[
225 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
226 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
227 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
228 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
229 # Use empty string to indicate no component
230 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
231 # TODO: should checksum be Base64Bytes instead?
232 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
233 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
234 ],
235 unique=frozenset(),
236 indexes=[tuple(["path"])],
237 )
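    # Illustrative sketch (not from the source): a row in the records table
    # defined above, as produced by StoredFileInfo.to_record(), might look
    # like the following dictionary; the concrete values are hypothetical.
    #
    #     {"dataset_id": ...,                    # primary key (with component)
    #      "path": "raw/r/some_file.fits",       # path relative to datastore root
    #      "formatter": "<fully qualified Formatter class>",
    #      "storage_class": "Exposure",
    #      "component": "",                      # empty string means "no component"
    #      "checksum": None,
    #      "file_size": 123456}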
239 def __init__(self, config: Union[DatastoreConfig, str],
240 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
241 super().__init__(config, bridgeManager)
242 if "root" not in self.config: 242 ↛ 243line 242 didn't jump to line 243, because the condition on line 242 was never true
243 raise ValueError("No root directory specified in configuration")
245 # Name ourselves either using an explicit name or a name
246 # derived from the (unexpanded) root
247 if "name" in self.config:
248 self.name = self.config["name"]
249 else:
250 # We use the unexpanded root in the name to indicate that this
251 # datastore can be moved without having to update registry.
252 self.name = "{}@{}".format(type(self).__name__,
253 self.config["root"])
255 # Support repository relocation in config
256 # Existence of self.root is checked in subclass
257 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot),
258 forceDirectory=True, forceAbsolute=True)
260 self.locationFactory = LocationFactory(self.root)
261 self.formatterFactory = FormatterFactory()
263 # Now associate formatters with storage classes
264 self.formatterFactory.registerFormatters(self.config["formatters"],
265 universe=bridgeManager.universe)
267 # Read the file naming templates
268 self.templates = FileTemplates(self.config["templates"],
269 universe=bridgeManager.universe)
271 # See if composites should be disassembled
272 self.composites = CompositesMap(self.config["composites"],
273 universe=bridgeManager.universe)
275 tableName = self.config["records", "table"]
276 try:
277 # Storage of paths and formatters, keyed by dataset_id
278 self._table = bridgeManager.opaque.register(
279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType))
280 # Interface to Registry.
281 self._bridge = bridgeManager.register(self.name)
282 except ReadOnlyDatabaseError:
283 # If the database is read only and we just tried and failed to
284 # create a table, it means someone is trying to create a read-only
285 # butler client for an empty repo. That should be okay, as long
286 # as they then try to get any datasets before some other client
287 creates the table. Chances are they're just validating
288 # configuration.
289 pass
291 # Determine whether checksums should be used - default to False
292 self.useChecksum = self.config.get("checksum", False)
294 # Determine whether we can fall back to configuration if a
295 # requested dataset is not known to registry
296 self.trustGetRequest = self.config.get("trust_get_request", False)
298 # Create a cache manager
299 self.cacheManager: AbstractDatastoreCacheManager
300 if "cached" in self.config: 300 ↛ 304line 300 didn't jump to line 304, because the condition on line 300 was never false
301 self.cacheManager = DatastoreCacheManager(self.config["cached"],
302 universe=bridgeManager.universe)
303 else:
304 self.cacheManager = DatastoreDisabledCacheManager("",
305 universe=bridgeManager.universe)
307 # Check existence and create directory structure if necessary
308 if not self.root.exists():
309 if "create" not in self.config or not self.config["create"]: 309 ↛ 310line 309 didn't jump to line 310, because the condition on line 309 was never true
310 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
311 try:
312 self.root.mkdir()
313 except Exception as e:
314 raise ValueError(f"Can not create datastore root '{self.root}', check permissions."
315 f" Got error: {e}") from e
317 def __str__(self) -> str:
318 return str(self.root)
320 @property
321 def bridge(self) -> DatastoreRegistryBridge:
322 return self._bridge
324 def _artifact_exists(self, location: Location) -> bool:
325 """Check that an artifact exists in this datastore at the specified
326 location.
328 Parameters
329 ----------
330 location : `Location`
331 Expected location of the artifact associated with this datastore.
333 Returns
334 -------
335 exists : `bool`
336 `True` if the location can be found, `False` otherwise.
337 """
338 log.debug("Checking if resource exists: %s", location.uri)
339 return location.uri.exists()
341 def _delete_artifact(self, location: Location) -> None:
342 """Delete the artifact from the datastore.
344 Parameters
345 ----------
346 location : `Location`
347 Location of the artifact associated with this datastore.
348 """
349 if location.pathInStore.isabs():
350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
352 try:
353 location.uri.remove()
354 except FileNotFoundError:
355 log.debug("File %s did not exist and so could not be deleted.", location.uri)
356 raise
357 except Exception as e:
358 log.critical("Failed to delete file: %s (%s)", location.uri, e)
359 raise
360 log.debug("Successfully deleted file: %s", location.uri)
362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
363 # Docstring inherited from GenericBaseDatastore
364 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
365 self._table.insert(*records)
367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
368 # Docstring inherited from GenericBaseDatastore
370 # Look for the dataset_id -- there might be multiple matches
371 # if we have disassembled the dataset.
372 records = self._table.fetch(dataset_id=ref.id)
373 return [StoredFileInfo.from_record(record) for record in records]
375 def _get_stored_records_associated_with_refs(self,
376 refs: Iterable[DatasetIdRef]
377 ) -> Dict[DatasetId, List[StoredFileInfo]]:
378 """Retrieve all records associated with the provided refs.
380 Parameters
381 ----------
382 refs : iterable of `DatasetIdRef`
383 The refs for which records are to be retrieved.
385 Returns
386 -------
387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
388 The matching records indexed by the ref ID. The number of entries
389 in the dict can be smaller than the number of requested refs.
390 """
391 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
393 # Uniqueness is dataset_id + component so can have multiple records
394 # per ref.
395 records_by_ref = defaultdict(list)
396 for record in records:
397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
398 return records_by_ref
400 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str,
401 Set[DatasetId]]:
402 """Return paths and associated dataset refs.
404 Parameters
405 ----------
406 paths : `list` of `str` or `ButlerURI`
407 All the paths to include in search.
409 Returns
410 -------
411 mapping : `dict` of [`str`, `set` [`DatasetId`]]
412 Mapping of each path to a set of associated database IDs.
413 """
414 records = self._table.fetch(path=[str(path) for path in paths])
415 result = defaultdict(set)
416 for row in records:
417 result[row["path"]].add(row["dataset_id"])
418 return result
420 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]:
421 """Return all dataset refs associated with the supplied path.
423 Parameters
424 ----------
425 pathInStore : `ButlerURI`
426 Path of interest in the data store.
428 Returns
429 -------
430 ids : `set` of `DatasetId`
431 All `DatasetRef` IDs associated with this path.
432 """
433 records = list(self._table.fetch(path=str(pathInStore)))
434 ids = {r["dataset_id"] for r in records}
435 return ids
437 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
438 # Docstring inherited from GenericBaseDatastore
439 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
441 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
442 r"""Find all the `Location`\ s of the requested dataset in the
443 `Datastore` and the associated stored file information.
445 Parameters
446 ----------
447 ref : `DatasetRef`
448 Reference to the required `Dataset`.
450 Returns
451 -------
452 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
453 Location of the dataset within the datastore and
454 stored information about each file and its formatter.
455 """
456 # Get the file information (this will fail if no file)
457 records = self.getStoredItemsInfo(ref)
459 # Use the path to determine the location -- we need to take
460 # into account absolute URIs in the datastore record
461 return [(r.file_location(self.locationFactory), r) for r in records]
463 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
464 """Check that there is only one dataset associated with the
465 specified artifact.
467 Parameters
468 ----------
469 ref : `DatasetRef` or `FakeDatasetRef`
470 Dataset to be removed.
471 location : `Location`
472 The location of the artifact to be removed.
474 Returns
475 -------
476 can_remove : `bool`
477 True if the artifact can be safely removed.
478 """
479 # Can't ever delete absolute URIs.
480 if location.pathInStore.isabs():
481 return False
483 # Get all entries associated with this path
484 allRefs = self._registered_refs_per_artifact(location.pathInStore)
485 if not allRefs:
486 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
488 # Remove these refs from all the refs and if there is nothing left
489 # then we can delete
490 remainingRefs = allRefs - {ref.id}
492 if remainingRefs:
493 return False
494 return True
496 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location,
497 StoredFileInfo]]:
498 """Predict the location and related file information of the requested
499 dataset in this datastore.
501 Parameters
502 ----------
503 ref : `DatasetRef`
504 Reference to the required `Dataset`.
506 Returns
507 -------
508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
509 Expected Location of the dataset within the datastore and
510 placeholder information about each file and its formatter.
512 Notes
513 -----
514 Uses the current configuration to determine how we would expect the
515 datastore files to have been written if we couldn't ask registry.
516 This is safe so long as there has been no change to datastore
517 configuration between writing the dataset and wanting to read it.
518 Will not work for files that have been ingested without using the
519 standard file template or default formatter.
520 """
522 # If we have a component ref we always need to ask the questions
523 # of the composite. If the composite is disassembled this routine
524 # should return all components. If the composite was not
525 # disassembled the composite is what is stored regardless of
526 # component request. Note that if the caller has disassembled
527 # a composite there is no way for this guess to know that
528 # without trying both the composite and component ref and seeing
529 # if there is something at the component Location even without
530 # disassembly being enabled.
531 if ref.datasetType.isComponent():
532 ref = ref.makeCompositeRef()
534 # See if the ref is a composite that should be disassembled
535 doDisassembly = self.composites.shouldBeDisassembled(ref)
537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
539 if doDisassembly:
540 for component, componentStorage in ref.datasetType.storageClass.components.items():
541 compRef = ref.makeComponentRef(component)
542 location, formatter = self._determine_put_formatter_location(compRef)
543 all_info.append((location, formatter, componentStorage, component))
545 else:
546 # Always use the composite ref if no disassembly
547 location, formatter = self._determine_put_formatter_location(ref)
548 all_info.append((location, formatter, ref.datasetType.storageClass, None))
550 # Convert the list of tuples to have StoredFileInfo as second element
551 return [(location, StoredFileInfo(formatter=formatter,
552 path=location.pathInStore.path,
553 storageClass=storageClass,
554 component=component,
555 checksum=None,
556 file_size=-1))
557 for location, formatter, storageClass, component in all_info]
559 def _prepare_for_get(self, ref: DatasetRef,
560 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]:
561 """Check parameters for ``get`` and obtain formatter and
562 location.
564 Parameters
565 ----------
566 ref : `DatasetRef`
567 Reference to the required Dataset.
568 parameters : `dict`
569 `StorageClass`-specific parameters that specify, for example,
570 a slice of the dataset to be loaded.
572 Returns
573 -------
574 getInfo : `list` [`DatastoreFileGetInformation`]
575 Parameters needed to retrieve each file.
576 """
577 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
579 # Get file metadata and internal metadata
580 fileLocations = self._get_dataset_locations_info(ref)
581 if not fileLocations:
582 if not self.trustGetRequest:
583 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
584 # Assume the dataset is where we think it should be
585 fileLocations = self._get_expected_dataset_locations_info(ref)
587 # The storage class we want to use eventually
588 refStorageClass = ref.datasetType.storageClass
590 if len(fileLocations) > 1:
591 disassembled = True
593 # If trust is involved it is possible that there will be
594 # components listed here that do not exist in the datastore.
595 # Explicitly check for file artifact existence and filter out any
596 # that are missing.
597 if self.trustGetRequest:
598 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
600 # For now complain only if we have no components at all. One
601 # component is probably a problem but we can punt that to the
602 # assembler.
603 if not fileLocations:
604 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
606 else:
607 disassembled = False
609 # Is this a component request?
610 refComponent = ref.datasetType.component()
612 fileGetInfo = []
613 for location, storedFileInfo in fileLocations:
615 # The storage class used to write the file
616 writeStorageClass = storedFileInfo.storageClass
618 # If this has been disassembled we need read to match the write
619 if disassembled:
620 readStorageClass = writeStorageClass
621 else:
622 readStorageClass = refStorageClass
624 formatter = get_instance_of(storedFileInfo.formatter,
625 FileDescriptor(location, readStorageClass=readStorageClass,
626 storageClass=writeStorageClass, parameters=parameters),
627 ref.dataId)
629 formatterParams, notFormatterParams = formatter.segregateParameters()
631 # Of the remaining parameters, extract the ones supported by
632 # this StorageClass (for components not all will be handled)
633 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
635 # The ref itself could be a component if the dataset was
636 # disassembled by butler, or we disassembled in datastore and
637 # components came from the datastore records
638 component = storedFileInfo.component if storedFileInfo.component else refComponent
640 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
641 assemblerParams, formatterParams,
642 component, readStorageClass))
644 return fileGetInfo
646 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
647 """Check the arguments for ``put`` and obtain formatter and
648 location.
650 Parameters
651 ----------
652 inMemoryDataset : `object`
653 The dataset to store.
654 ref : `DatasetRef`
655 Reference to the associated Dataset.
657 Returns
658 -------
659 location : `Location`
660 The location to write the dataset.
661 formatter : `Formatter`
662 The `Formatter` to use to write the dataset.
664 Raises
665 ------
666 TypeError
667 Supplied object and storage class are inconsistent.
668 DatasetTypeNotSupportedError
669 The associated `DatasetType` is not handled by this datastore.
670 """
671 self._validate_put_parameters(inMemoryDataset, ref)
672 return self._determine_put_formatter_location(ref)
674 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
675 """Calculate the formatter and output location to use for put.
677 Parameters
678 ----------
679 ref : `DatasetRef`
680 Reference to the associated Dataset.
682 Returns
683 -------
684 location : `Location`
685 The location to write the dataset.
686 formatter : `Formatter`
687 The `Formatter` to use to write the dataset.
688 """
689 # Work out output file name
690 try:
691 template = self.templates.getTemplate(ref)
692 except KeyError as e:
693 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
695 # Validate the template to protect against filenames from different
696 # dataIds returning the same and causing overwrite confusion.
697 template.validateTemplate(ref)
699 location = self.locationFactory.fromPath(template.format(ref))
701 # Get the formatter based on the storage class
702 storageClass = ref.datasetType.storageClass
703 try:
704 formatter = self.formatterFactory.getFormatter(ref,
705 FileDescriptor(location,
706 storageClass=storageClass),
707 ref.dataId)
708 except KeyError as e:
709 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore "
710 f"{self.name}") from e
712 # Now that we know the formatter, update the location
713 location = formatter.makeUpdatedLocation(location)
715 return location, formatter
717 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
718 # Docstring inherited from base class
719 if transfer != "auto":
720 return transfer
722 # See if the paths are within the datastore or not
723 inside = [self._pathInStore(d.path) is not None for d in datasets]
725 if all(inside):
726 transfer = None
727 elif not any(inside):
728 # Allow ButlerURI to use its own knowledge
729 transfer = "auto"
730 else:
731 # This can happen when importing from a datastore that
732 # has had some datasets ingested using "direct" mode.
733 # Also allow ButlerURI to sort it out but warn about it.
736 log.warning("Some datasets are inside the datastore and some are outside. Using 'split' "
737 "transfer mode. This assumes that the files outside the datastore are "
738 "still accessible to the new butler since they will not be copied into "
739 "the target datastore.")
740 transfer = "split"
742 return transfer
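    # Summary sketch of the "auto" transfer-mode resolution implemented
    # above (comment only; not part of the API):
    #
    #     all dataset paths inside the datastore root -> transfer = None
    #         (files are used in place)
    #     no dataset paths inside the root            -> transfer = "auto"
    #         (ButlerURI decides how to transfer)
    #     a mixture of inside and outside             -> transfer = "split"
    #         (a warning is logged; external files are referenced rather
    #          than copied into the datastore)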
744 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
745 """Return path relative to datastore root
747 Parameters
748 ----------
749 path : `str` or `ButlerURI`
750 Path to dataset. Can be absolute URI. If relative assumed to
751 be relative to the datastore. Returns the path within the
752 datastore or `None` if the path is outside it.
754 Returns
755 -------
756 inStore : `str`
757 Path relative to datastore root. Returns `None` if the file is
758 outside the root.
759 """
760 # Relative path will always be relative to datastore
761 pathUri = ButlerURI(path, forceAbsolute=False)
762 return pathUri.relative_to(self.root)
764 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *,
765 transfer: Optional[str] = None) -> Union[str, ButlerURI]:
766 """Standardize the path of a to-be-ingested file.
768 Parameters
769 ----------
770 path : `str` or `ButlerURI`
771 Path of a file to be ingested.
772 transfer : `str`, optional
773 How (and whether) the dataset should be added to the datastore.
774 See `ingest` for details of transfer modes.
775 This implementation is provided only so
776 `NotImplementedError` can be raised if the mode is not supported;
777 actual transfers are deferred to `_extractIngestInfo`.
779 Returns
780 -------
781 path : `str` or `ButlerURI`
782 New path in what the datastore considers standard form. If an
783 absolute URI was given that will be returned unchanged.
785 Notes
786 -----
787 Subclasses of `FileDatastore` can implement this method instead
788 of `_prepIngest`. It should not modify the data repository or given
789 file in any way.
791 Raises
792 ------
793 NotImplementedError
794 Raised if the datastore does not support the given transfer mode
795 (including the case where ingest is not supported at all).
796 FileNotFoundError
797 Raised if one of the given files does not exist.
798 """
799 if transfer not in (None, "direct", "split") + self.root.transferModes:
800 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
802 # A relative URI indicates relative to datastore root
803 srcUri = ButlerURI(path, forceAbsolute=False)
804 if not srcUri.isabs():
805 srcUri = self.root.join(path)
807 if not srcUri.exists():
808 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest "
809 f"are assumed to be relative to {self.root} unless they are absolute.")
811 if transfer is None:
812 relpath = srcUri.relative_to(self.root)
813 if not relpath:
814 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
815 f"within datastore ({self.root})")
817 # Return the relative path within the datastore for internal
818 # transfer
819 path = relpath
821 return path
823 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
824 formatter: Union[Formatter, Type[Formatter]],
825 transfer: Optional[str] = None) -> StoredFileInfo:
826 """Relocate (if necessary) and extract `StoredFileInfo` from a
827 to-be-ingested file.
829 Parameters
830 ----------
831 path : `str` or `ButlerURI`
832 URI or path of a file to be ingested.
833 ref : `DatasetRef`
834 Reference for the dataset being ingested. Guaranteed to have
835 ``dataset_id not None``.
836 formatter : `type` or `Formatter`
837 `Formatter` subclass to use for this dataset or an instance.
838 transfer : `str`, optional
839 How (and whether) the dataset should be added to the datastore.
840 See `ingest` for details of transfer modes.
842 Returns
843 -------
844 info : `StoredFileInfo`
845 Internal datastore record for this file. This will be inserted by
846 the caller; the `_extractIngestInfo` is only responsible for
847 creating and populating the struct.
849 Raises
850 ------
851 FileNotFoundError
852 Raised if one of the given files does not exist.
853 FileExistsError
854 Raised if transfer is not `None` but the (internal) location the
855 file would be moved to is already occupied.
856 """
857 if self._transaction is None:
858 raise RuntimeError("Ingest called without transaction enabled")
860 # Create URI of the source path, do not need to force a relative
861 # path to absolute.
862 srcUri = ButlerURI(path, forceAbsolute=False)
864 # Track whether we have read the size of the source yet
865 have_sized = False
867 tgtLocation: Optional[Location]
868 if transfer is None or transfer == "split":
869 # A relative path is assumed to be relative to the datastore
870 # in this context
871 if not srcUri.isabs():
872 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
873 else:
874 # Work out the path in the datastore from an absolute URI
875 # This is required to be within the datastore.
876 pathInStore = srcUri.relative_to(self.root)
877 if pathInStore is None and transfer is None:
878 raise RuntimeError(f"Unexpectedly learned that {srcUri} is "
879 f"not within datastore {self.root}")
880 if pathInStore:
881 tgtLocation = self.locationFactory.fromPath(pathInStore)
882 elif transfer == "split":
883 # Outside the datastore but treat that as a direct ingest
884 # instead.
885 tgtLocation = None
886 else:
887 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for"
888 f" URI {srcUri}")
889 elif transfer == "direct":
890 # Want to store the full URI to the resource directly in
891 # datastore. This is useful for referring to permanent archive
892 # storage for raw data.
893 # Trust that people know what they are doing.
894 tgtLocation = None
895 else:
896 # Work out the name we want this ingested file to have
897 # inside the datastore
898 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
899 if not tgtLocation.uri.dirname().exists():
900 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
901 tgtLocation.uri.dirname().mkdir()
903 # if we are transferring from a local file to a remote location
904 # it may be more efficient to get the size and checksum of the
905 # local file rather than the transferred one
906 if not srcUri.scheme or srcUri.scheme == "file":
907 size = srcUri.size()
908 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
909 have_sized = True
911 # Transfer the resource to the destination.
912 # Allow overwrite of an existing file. This matches the behavior
913 # of datastore.put() in that it trusts that registry would not
914 # be asking to overwrite unless registry thought that the
915 # overwrite was allowed.
916 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction,
917 overwrite=True)
919 if tgtLocation is None:
920 # This means we are using direct mode
921 targetUri = srcUri
922 targetPath = str(srcUri)
923 else:
924 targetUri = tgtLocation.uri
925 targetPath = tgtLocation.pathInStore.path
927 # the file should exist in the datastore now
928 if not have_sized:
929 size = targetUri.size()
930 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
932 return StoredFileInfo(formatter=formatter, path=targetPath,
933 storageClass=ref.datasetType.storageClass,
934 component=ref.datasetType.component(),
935 file_size=size, checksum=checksum)
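    # Comment-only summary (hedged; derived from the branches above) of how
    # the transfer mode determines the path recorded in StoredFileInfo:
    #
    #     transfer=None, or "split" with a file under the root -> the path
    #         relative to the datastore root is stored
    #     transfer="direct", or "split" with a file outside the root -> the
    #         full source URI is stored unchanged (tgtLocation is None)
    #     any other mode (e.g. "copy", "move", "link") -> the file is
    #         transferred to a template-derived location inside the root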
937 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
938 # Docstring inherited from Datastore._prepIngest.
939 filtered = []
940 for dataset in datasets:
941 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
942 if not acceptable:
943 continue
944 else:
945 dataset.refs = acceptable
946 if dataset.formatter is None:
947 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
948 else:
949 assert isinstance(dataset.formatter, (type, str))
950 formatter_class = get_class_of(dataset.formatter)
951 if not issubclass(formatter_class, Formatter):
952 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
953 dataset.formatter = formatter_class
954 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
955 filtered.append(dataset)
956 return _IngestPrepData(filtered)
958 @transactional
959 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
960 # Docstring inherited from Datastore._finishIngest.
961 refsAndInfos = []
962 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
963 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
964 # Do ingest as if the first dataset ref is associated with the file
965 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
966 transfer=transfer)
967 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
968 self._register_datasets(refsAndInfos)
970 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef,
971 formatter: Union[Formatter, Type[Formatter]]) -> Location:
972 """Given a source URI and a DatasetRef, determine the name the
973 dataset will have inside datastore.
975 Parameters
976 ----------
977 srcUri : `ButlerURI`
978 URI to the source dataset file.
979 ref : `DatasetRef`
980 Ref associated with the newly-ingested dataset artifact. This
981 is used to determine the name within the datastore.
982 formatter : `Formatter` or Formatter class.
983 Formatter to use for validation. Can be a class or an instance.
985 Returns
986 -------
987 location : `Location`
988 Target location for the newly-ingested dataset.
989 """
990 # Ingesting a file from outside the datastore.
991 # This involves a new name.
992 template = self.templates.getTemplate(ref)
993 location = self.locationFactory.fromPath(template.format(ref))
995 # Get the extension
996 ext = srcUri.getExtension()
998 # Update the destination to include that extension
999 location.updateExtension(ext)
1001 # Ask the formatter to validate this extension
1002 formatter.validateExtension(location)
1004 return location
1006 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1007 """Write out in memory dataset to datastore.
1009 Parameters
1010 ----------
1011 inMemoryDataset : `object`
1012 Dataset to write to datastore.
1013 ref : `DatasetRef`
1014 Registry information associated with this dataset.
1016 Returns
1017 -------
1018 info : `StoredFileInfo`
1019 Information describing the artifact written to the datastore.
1020 """
1021 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1022 uri = location.uri
1024 if not uri.dirname().exists():
1025 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1026 uri.dirname().mkdir()
1028 if self._transaction is None:
1029 raise RuntimeError("Attempting to write artifact without transaction enabled")
1031 def _removeFileExists(uri: ButlerURI) -> None:
1032 """Remove a file and do not complain if it is not there.
1034 This is important since a formatter might fail before the file
1035 is written and we should not confuse people by writing spurious
1036 error messages to the log.
1037 """
1038 try:
1039 uri.remove()
1040 except FileNotFoundError:
1041 pass
1043 # Register a callback to try to delete the uploaded data if
1044 # something fails below
1045 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1047 # For a local file, simply use the formatter directly
1048 if uri.isLocal:
1049 try:
1050 formatter.write(inMemoryDataset)
1051 except Exception as e:
1052 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} "
1053 f"to location {uri}") from e
1054 log.debug("Successfully wrote python object to local file at %s", uri)
1055 else:
1056 # This is a remote URI. Some datasets can be serialized directly
1057 # to bytes and sent to the remote datastore without writing a
1058 # file. If the dataset is intended to be saved to the cache
1059 # a file is always written and direct write to the remote
1060 # datastore is bypassed.
1061 data_written = False
1062 if not self.cacheManager.should_be_cached(ref):
1063 try:
1064 serializedDataset = formatter.toBytes(inMemoryDataset)
1065 except NotImplementedError:
1066 # Fallback to the file writing option.
1067 pass
1068 except Exception as e:
1069 raise RuntimeError(f"Failed to serialize dataset {ref} "
1070 f"of type {type(inMemoryDataset)} to bytes.") from e
1071 else:
1072 log.debug("Writing bytes directly to %s", uri)
1073 uri.write(serializedDataset, overwrite=True)
1074 log.debug("Successfully wrote bytes directly to %s", uri)
1075 data_written = True
1077 if not data_written:
1078 # Did not write the bytes directly to object store so instead
1079 # write to temporary file.
1080 with ButlerURI.temporary_uri(suffix=uri.getExtension()) as temporary_uri:
1081 # Need to configure the formatter to write to a different
1082 # location and that needs us to overwrite internals
1083 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1084 with formatter._updateLocation(Location(None, temporary_uri)):
1085 try:
1086 formatter.write(inMemoryDataset)
1087 except Exception as e:
1088 raise RuntimeError(f"Failed to serialize dataset {ref} of type"
1089 f" {type(inMemoryDataset)} to "
1090 f"temporary location {temporary_uri}") from e
1091 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True)
1093 # Cache if required
1094 self.cacheManager.move_to_cache(temporary_uri, ref)
1096 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1098 # URI is needed to resolve what ingest case are we dealing with
1099 return self._extractIngestInfo(uri, ref, formatter=formatter)
1101 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
1102 ref: DatasetRef, isComponent: bool = False,
1103 cache_ref: Optional[DatasetRef] = None) -> Any:
1104 """Read the artifact from datastore into in memory object.
1106 Parameters
1107 ----------
1108 getInfo : `DatastoreFileGetInformation`
1109 Information about the artifact within the datastore.
1110 ref : `DatasetRef`
1111 The registry information associated with this artifact.
1112 isComponent : `bool`
1113 Flag to indicate if a component is being read from this artifact.
1114 cache_ref : `DatasetRef`, optional
1115 The DatasetRef to use when looking up the file in the cache.
1116 This ref must have the same ID as the supplied ref but can
1117 be a parent ref or component ref to indicate to the cache whether
1118 a composite file is being requested from the cache or a component
1119 file. Without this the cache will default to the supplied ref but
1120 it can get confused with read-only derived components for
1121 disassembled composites.
1123 Returns
1124 -------
1125 inMemoryDataset : `object`
1126 The artifact as a python object.
1127 """
1128 location = getInfo.location
1129 uri = location.uri
1130 log.debug("Accessing data from %s", uri)
1132 if cache_ref is None:
1133 cache_ref = ref
1134 if cache_ref.id != ref.id:
1135 raise ValueError("The supplied cache dataset ref refers to a different dataset than expected:"
1136 f" {ref.id} != {cache_ref.id}")
1138 # Cannot recalculate checksum but can compare size as a quick check
1139 # Do not do this if the size is negative since that indicates
1140 # we do not know.
1141 recorded_size = getInfo.info.file_size
1142 resource_size = uri.size()
1143 if recorded_size >= 0 and resource_size != recorded_size:
1144 raise RuntimeError("Integrity failure in Datastore. "
1145 f"Size of file {uri} ({resource_size}) "
1146 f"does not match size recorded in registry of {recorded_size}")
1148 # For the general case we have choices for how to proceed.
1149 # 1. Always use a local file (downloading the remote resource to a
1150 # temporary file if needed).
1151 # 2. Use a threshold size and read into memory and use bytes.
1152 # Use both for now with an arbitrary hand off size.
1153 # This allows small datasets to be downloaded from remote object
1154 # stores without requiring a temporary file.
1156 formatter = getInfo.formatter
1157 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1158 if resource_size <= nbytes_max and formatter.can_read_bytes():
1159 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1160 if cached_file is not None:
1161 desired_uri = cached_file
1162 msg = f" (cached version of {uri})"
1163 else:
1164 desired_uri = uri
1165 msg = ""
1166 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1167 serializedDataset = desired_uri.read()
1168 log.debug("Deserializing %s from %d bytes from location %s with formatter %s",
1169 f"component {getInfo.component}" if isComponent else "",
1170 len(serializedDataset), uri, formatter.name())
1171 try:
1172 result = formatter.fromBytes(serializedDataset,
1173 component=getInfo.component if isComponent else None)
1174 except Exception as e:
1175 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1176 f" ({ref.datasetType.name} from {uri}): {e}") from e
1177 else:
1178 # Read from file.
1180 # Have to update the Location associated with the formatter
1181 # because formatter.read does not allow an override.
1182 # This could be improved.
1183 location_updated = False
1184 msg = ""
1186 # First check in cache for local version.
1187 # The cache will only be relevant for remote resources but
1188 # no harm in always asking. Context manager ensures that cache
1189 # file is not deleted during cache expiration.
1190 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1191 if cached_file is not None:
1192 msg = f"(via cache read of remote file {uri})"
1193 uri = cached_file
1194 location_updated = True
1196 with uri.as_local() as local_uri:
1198 can_be_cached = False
1199 if uri != local_uri:
1200 # URI was remote and file was downloaded
1201 cache_msg = ""
1202 location_updated = True
1204 if self.cacheManager.should_be_cached(cache_ref):
1205 # In this scenario we want to ask if the downloaded
1206 # file should be cached but we should not cache
1207 # it until after we've used it (to ensure it can't
1208 # be expired whilst we are using it).
1209 can_be_cached = True
1211 # Say that it is "likely" to be cached because
1212 # if the formatter read fails we will not be
1213 # caching this file.
1214 cache_msg = " and likely cached"
1216 msg = f"(via download to local file{cache_msg})"
1218 # Calculate the (possibly) new location for the formatter
1219 # to use.
1220 newLocation = Location(*local_uri.split()) if location_updated else None
1222 log.debug("Reading%s from location %s %s with formatter %s",
1223 f" component {getInfo.component}" if isComponent else "",
1224 uri, msg, formatter.name())
1225 try:
1226 with formatter._updateLocation(newLocation):
1227 with time_this(log, msg="Reading%s from location %s %s with formatter %s",
1228 args=(f" component {getInfo.component}" if isComponent else "",
1229 uri, msg, formatter.name())):
1230 result = formatter.read(component=getInfo.component if isComponent else None)
1231 except Exception as e:
1232 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1233 f" ({ref.datasetType.name} from {uri}): {e}") from e
1235 # File was read successfully so can move to cache
1236 if can_be_cached:
1237 self.cacheManager.move_to_cache(local_uri, cache_ref)
1239 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
1240 isComponent=isComponent)
1242 def knows(self, ref: DatasetRef) -> bool:
1243 """Check if the dataset is known to the datastore.
1245 Does not check for existence of any artifact.
1247 Parameters
1248 ----------
1249 ref : `DatasetRef`
1250 Reference to the required dataset.
1252 Returns
1253 -------
1254 exists : `bool`
1255 `True` if the dataset is known to the datastore.
1256 """
1257 fileLocations = self._get_dataset_locations_info(ref)
1258 if fileLocations:
1259 return True
1260 return False
1262 def _process_mexists_records(self, id_to_ref: Dict[DatasetId, DatasetRef],
1263 records: Dict[DatasetId, List[StoredFileInfo]],
1264 all_required: bool,
1265 artifact_existence: Optional[Dict[ButlerURI,
1266 bool]] = None) -> Dict[DatasetRef, bool]:
1267 """Helper function for mexists that checks the given records.
1269 Parameters
1270 ----------
1271 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1272 Mapping of the dataset ID to the dataset ref itself.
1273 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1274 Records as generally returned by
1275 ``_get_stored_records_associated_with_refs``.
1276 all_required : `bool`
1277 Flag to indicate whether existence requires all artifacts
1278 associated with a dataset ID to exist or not for existence.
1279 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional
1280 Mapping of datastore artifact to existence. Updated by this
1281 method with details of all artifacts tested. Can be `None`
1282 if the caller is not interested.
1284 Returns
1285 -------
1286 existence : `dict` of [`DatasetRef`, `bool`]
1287 Mapping from dataset to boolean indicating existence.
1288 """
1289 # The URIs to be checked and a mapping of those URIs to
1290 # the dataset ID.
1291 uris_to_check: List[ButlerURI] = []
1292 location_map: Dict[ButlerURI, DatasetId] = {}
1294 location_factory = self.locationFactory
1296 for ref_id, info in records.items():
1297 # Key is the dataset ID, value is a list of StoredFileInfo
1298 uris = [info.file_location(location_factory).uri for info in info]
1299 uris_to_check.extend(uris)
1300 location_map.update({uri: ref_id for uri in uris})
1302 uri_existence: Dict[ButlerURI, bool] = {}
1303 if artifact_existence is not None:
1304 # If a URI has already been checked remove it from the list
1305 # and immediately add the status to the output dict.
1306 filtered_uris_to_check = []
1307 for uri in uris_to_check:
1308 if uri in artifact_existence:
1309 uri_existence[uri] = artifact_existence[uri]
1310 else:
1311 filtered_uris_to_check.append(uri)
1312 uris_to_check = filtered_uris_to_check
1314 # Results.
1315 dataset_existence: Dict[DatasetRef, bool] = {}
1317 uri_existence.update(ButlerURI.mexists(uris_to_check))
1318 for uri, exists in uri_existence.items():
1319 dataset_id = location_map[uri]
1320 ref = id_to_ref[dataset_id]
1322 # Disassembled composite needs to check all locations.
1323 # all_required indicates whether all need to exist or not.
1324 if ref in dataset_existence:
1325 if all_required:
1326 exists = dataset_existence[ref] and exists
1327 else:
1328 exists = dataset_existence[ref] or exists
1329 dataset_existence[ref] = exists
1331 if artifact_existence is not None:
1332 artifact_existence.update(uri_existence)
1334 return dataset_existence
1336 def mexists(self, refs: Iterable[DatasetRef],
1337 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> Dict[DatasetRef, bool]:
1338 """Check the existence of multiple datasets at once.
1340 Parameters
1341 ----------
1342 refs : iterable of `DatasetRef`
1343 The datasets to be checked.
1344 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional
1345 Mapping of datastore artifact to existence. Updated by this
1346 method with details of all artifacts tested. Can be `None`
1347 if the caller is not interested.
1349 Returns
1350 -------
1351 existence : `dict` of [`DatasetRef`, `bool`]
1352 Mapping from dataset to boolean indicating existence.
1353 """
1354 chunk_size = 10_000
1355 dataset_existence: Dict[DatasetRef, bool] = {}
1356 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d",
1357 chunk_size)
1358 n_found_total = 0
1359 n_checked = 0
1360 n_chunks = 0
1361 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1362 chunk_result = self._mexists(chunk, artifact_existence)
1363 if log.isEnabledFor(VERBOSE):
1364 n_results = len(chunk_result)
1365 n_checked += n_results
1366 # Can treat the booleans as 0, 1 integers and sum them.
1367 n_found = sum(chunk_result.values())
1368 n_found_total += n_found
1369 log.verbose("Number of datasets found in datastore for chunk %d = %d/%d"
1370 " (running total: %d/%d)",
1371 n_chunks, n_found, n_results, n_found_total, n_checked)
1372 dataset_existence.update(chunk_result)
1373 n_chunks += 1
1375 return dataset_existence
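    # Minimal usage sketch (hypothetical names; assumes ``datastore`` is a
    # FileDatastore and ``refs`` are resolved DatasetRefs):
    #
    #     artifact_cache: Dict[ButlerURI, bool] = {}
    #     existence = datastore.mexists(refs, artifact_existence=artifact_cache)
    #     missing = [ref for ref, found in existence.items() if not found]
    #
    # Reusing the same ``artifact_cache`` dict in later calls lets
    # _process_mexists_records skip URIs that have already been checked.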
1377 def _mexists(self, refs: Iterable[DatasetRef],
1378 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> Dict[DatasetRef, bool]:
1379 """Check the existence of multiple datasets at once.
1381 Parameters
1382 ----------
1383 refs : iterable of `DatasetRef`
1384 The datasets to be checked.
1386 Returns
1387 -------
1388 existence : `dict` of [`DatasetRef`, `bool`]
1389 Mapping from dataset to boolean indicating existence.
1390 """
1391 # Need a mapping of dataset_id to dataset ref since the API
1392 # works with dataset_id
1393 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1395 # Set of all IDs we are checking for.
1396 requested_ids = set(id_to_ref.keys())
1398 # The records themselves. Could be missing some entries.
1399 records = self._get_stored_records_associated_with_refs(refs)
1401 dataset_existence = self._process_mexists_records(id_to_ref, records, True,
1402 artifact_existence=artifact_existence)
1404 # Set of IDs that have been handled.
1405 handled_ids = {ref.id for ref in dataset_existence.keys()}
1407 missing_ids = requested_ids - handled_ids
1408 if missing_ids:
1409 if not self.trustGetRequest:
1410 # Must assume these do not exist
1411 for missing in missing_ids:
1412 dataset_existence[id_to_ref[missing]] = False
1413 else:
1414 log.debug("%d out of %d datasets were not known to datastore during initial existence check.",
1415 len(missing_ids), len(requested_ids))
1417 # Construct data structure identical to that returned
1418 # by _get_stored_records_associated_with_refs() but using
1419 # guessed names.
1420 records = {}
1421 for missing in missing_ids:
1422 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1423 records[missing] = [info for _, info in expected]
1425 dataset_existence.update(self._process_mexists_records(id_to_ref, records, False,
1426 artifact_existence=artifact_existence))
1428 return dataset_existence
1430 def exists(self, ref: DatasetRef) -> bool:
1431 """Check if the dataset exists in the datastore.
1433 Parameters
1434 ----------
1435 ref : `DatasetRef`
1436 Reference to the required dataset.
1438 Returns
1439 -------
1440 exists : `bool`
1441 `True` if the entity exists in the `Datastore`.
1442 """
1443 fileLocations = self._get_dataset_locations_info(ref)
1445 # if we are being asked to trust that registry might not be correct
1446 # we ask for the expected locations and check them explicitly
1447 if not fileLocations:
1448 if not self.trustGetRequest:
1449 return False
1451 # When we are guessing a dataset location we can not check
1452 # for the existence of every component since we can not
1453 # know if every component was written. Instead we check
1454 # for the existence of any of the expected locations.
1455 for location, _ in self._get_expected_dataset_locations_info(ref):
1456 if self._artifact_exists(location):
1457 return True
1458 return False
1460 # All listed artifacts must exist.
1461 for location, _ in fileLocations:
1462 if not self._artifact_exists(location):
1463 return False
1465 return True
1467 def getURIs(self, ref: DatasetRef,
1468 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1469 """Return URIs associated with dataset.
1471 Parameters
1472 ----------
1473 ref : `DatasetRef`
1474 Reference to the required dataset.
1475 predict : `bool`, optional
1476 If the datastore does not know about the dataset, should it
1477 return a predicted URI or not?
1479 Returns
1480 -------
1481 primary : `ButlerURI`
1482 The URI to the primary artifact associated with this dataset.
1483 If the dataset was disassembled within the datastore this
1484 may be `None`.
1485 components : `dict`
1486 URIs to any components associated with the dataset artifact.
1487 Can be empty if there are no components.
1488 """
1490 primary: Optional[ButlerURI] = None
1491 components: Dict[str, ButlerURI] = {}
1493 # if this has never been written then we have to guess
1494 if not self.exists(ref):
1495 if not predict:
1496 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1498 doDisassembly = self.composites.shouldBeDisassembled(ref)
1500 if doDisassembly:
1502 for component, componentStorage in ref.datasetType.storageClass.components.items():
1503 compRef = ref.makeComponentRef(component)
1504 compLocation, _ = self._determine_put_formatter_location(compRef)
1506 # Add a URI fragment to indicate this is a guess
1507 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1509 else:
1511 location, _ = self._determine_put_formatter_location(ref)
1513 # Add a URI fragment to indicate this is a guess
1514 primary = ButlerURI(location.uri.geturl() + "#predicted")
1516 return primary, components
1518 # If this is a ref that we have written we can get the path.
1519 # Get file metadata and internal metadata
1520 fileLocations = self._get_dataset_locations_info(ref)
1522 guessing = False
1523 if not fileLocations:
1524 if not self.trustGetRequest: 1524 ↛ 1525line 1524 didn't jump to line 1525, because the condition on line 1524 was never true
1525 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1526 fileLocations = self._get_expected_dataset_locations_info(ref)
1527 guessing = True
1529 if len(fileLocations) == 1:
1530 # No disassembly so this is the primary URI
1531 uri = fileLocations[0][0].uri
1532 if guessing and not uri.exists(): 1532 ↛ 1533line 1532 didn't jump to line 1533, because the condition on line 1532 was never true
1533 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1534 primary = uri
1536 else:
1537 for location, storedFileInfo in fileLocations:
1538 if storedFileInfo.component is None: 1538 ↛ 1539line 1538 didn't jump to line 1539, because the condition on line 1538 was never true
1539 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1540 uri = location.uri
1541 if guessing and not uri.exists(): 1541 ↛ 1545line 1541 didn't jump to line 1545, because the condition on line 1541 was never true
1542 # If we are trusting then it is entirely possible for
1543 # some components to be missing. In that case we skip
1544 # to the next component.
1545 if self.trustGetRequest:
1546 continue
1547 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1548 components[storedFileInfo.component] = uri
1550 return primary, components
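# Usage sketch (hypothetical ``datastore`` and ``ref``): predicted URIs
# carry a "#predicted" fragment so callers can tell them apart from
# URIs of artifacts that have actually been written.
#
#     primary, components = datastore.getURIs(ref, predict=True)
#     for name, uri in components.items():
#         if uri.geturl().endswith("#predicted"):
#             log.debug("Component %s has not been written yet", name)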
1552 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1553 """URI to the Dataset.
1555 Parameters
1556 ----------
1557 ref : `DatasetRef`
1558 Reference to the required Dataset.
1559 predict : `bool`
1560 If `True`, allow URIs to be returned of datasets that have not
1561 been written.
1563 Returns
1564 -------
1565 uri : `ButlerURI`
1566 URI pointing to the dataset within the datastore. If the
1567 dataset does not exist in the datastore, and if ``predict`` is
1568 `True`, the URI will be a prediction and will include a URI
1569 fragment "#predicted".
1570 If the datastore does not have entities that relate well
1571 to the concept of a URI the returned URI will be
1572 descriptive. The returned URI is not guaranteed to be obtainable.
1574 Raises
1575 ------
1576 FileNotFoundError
1577 Raised if a URI has been requested for a dataset that does not
1578 exist and guessing is not allowed.
1579 RuntimeError
1580 Raised if a request is made for a single URI but multiple URIs
1581 are associated with this dataset.
1583 Notes
1584 -----
1585 When a predicted URI is requested an attempt will be made to form
1586 a reasonable URI based on file templates and the expected formatter.
1587 """
1588 primary, components = self.getURIs(ref, predict)
1589 if primary is None or components: 1589 ↛ 1590line 1589 didn't jump to line 1590, because the condition on line 1589 was never true
1590 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
1591 "Use Datastore.getURIs() instead.")
1592 return primary
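# Usage sketch (assumes ``ref`` may refer to a disassembled dataset):
# getURI() only works for single-artifact datasets, so fall back to
# getURIs() when it raises RuntimeError.
#
#     try:
#         uri = datastore.getURI(ref)
#     except RuntimeError:
#         _, component_uris = datastore.getURIs(ref)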
1594 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1595 destination: ButlerURI, transfer: str = "auto",
1596 preserve_path: bool = True,
1597 overwrite: bool = False) -> List[ButlerURI]:
1598 """Retrieve the file artifacts associated with the supplied refs.
1600 Parameters
1601 ----------
1602 refs : iterable of `DatasetRef`
1603 The datasets for which file artifacts are to be retrieved.
1604 A single ref can result in multiple files. The refs must
1605 be resolved.
1606 destination : `ButlerURI`
1607 Location to write the file artifacts.
1608 transfer : `str`, optional
1609 Method to use to transfer the artifacts. Must be one of the options
1610 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1611 preserve_path : `bool`, optional
1612 If `True` the full path of the file artifact within the datastore
1613 is preserved. If `False` the final file component of the path
1614 is used.
1615 overwrite : `bool`, optional
1616 If `True` allow transfers to overwrite existing files at the
1617 destination.
1619 Returns
1620 -------
1621 targets : `list` of `ButlerURI`
1622 URIs of file artifacts in destination location. Order is not
1623 preserved.
1624 """
1625 if not destination.isdir(): 1625 ↛ 1626line 1625 didn't jump to line 1626, because the condition on line 1625 was never true
1626 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1628 if transfer == "move":
1629 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1631 # Source -> Destination
1632 # This also helps filter out duplicate DatasetRef in the request
1633 # that will map to the same underlying file transfer.
1634 to_transfer: Dict[ButlerURI, ButlerURI] = {}
1636 for ref in refs:
1637 locations = self._get_dataset_locations_info(ref)
1638 for location, _ in locations:
1639 source_uri = location.uri
1640 target_path: Union[str, ButlerURI]
1641 if preserve_path:
1642 target_path = location.pathInStore
1643 if target_path.isabs(): 1643 ↛ 1646line 1643 didn't jump to line 1646, because the condition on line 1643 was never true
1644 # This is an absolute path to an external file.
1645 # Use the full path.
1646 target_path = target_path.relativeToPathRoot
1647 else:
1648 target_path = source_uri.basename()
1649 target_uri = destination.join(target_path)
1650 to_transfer[source_uri] = target_uri
1652 # In theory can now parallelize the transfer
1653 log.debug("Number of artifacts to transfer to %s: %d",
1654 str(destination), len(to_transfer))
1655 for source_uri, target_uri in to_transfer.items():
1656 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1658 return list(to_transfer.values())
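# Usage sketch (hypothetical destination path): copy the file artifacts
# for some refs into a local directory, preserving their paths within
# the datastore; "copy" is a typical transfer mode here.
#
#     destination = ButlerURI("/tmp/export_dir", forceDirectory=True)
#     targets = datastore.retrieveArtifacts(refs, destination,
#                                           transfer="copy",
#                                           preserve_path=True)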
1660 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1661 """Load an InMemoryDataset from the store.
1663 Parameters
1664 ----------
1665 ref : `DatasetRef`
1666 Reference to the required Dataset.
1667 parameters : `dict`
1668 `StorageClass`-specific parameters that specify, for example,
1669 a slice of the dataset to be loaded.
1671 Returns
1672 -------
1673 inMemoryDataset : `object`
1674 Requested dataset or slice thereof as an InMemoryDataset.
1676 Raises
1677 ------
1678 FileNotFoundError
1679 Requested dataset can not be retrieved.
1680 TypeError
1681 Return value from formatter has unexpected type.
1682 ValueError
1683 Formatter failed to process the dataset.
1684 """
1685 allGetInfo = self._prepare_for_get(ref, parameters)
1686 refComponent = ref.datasetType.component()
1688 # Supplied storage class for the component being read
1689 refStorageClass = ref.datasetType.storageClass
1691 # Create mapping from component name to related info
1692 allComponents = {i.component: i for i in allGetInfo}
1694 # By definition the dataset is disassembled if we have more
1695 # than one record for it.
1696 isDisassembled = len(allGetInfo) > 1
1698 # Look for the special case where we are disassembled but the
1699 # component is a derived component that was not written during
1700 # disassembly. For this scenario we need to check that the
1701 # component requested is listed as a derived component for the
1702 # composite storage class
1703 isDisassembledReadOnlyComponent = False
1704 if isDisassembled and refComponent:
1705 # The composite storage class should be accessible through
1706 # the component dataset type
1707 compositeStorageClass = ref.datasetType.parentStorageClass
1709 # In the unlikely scenario where the composite storage
1710 # class is not known, we can only assume that this is a
1711 # normal component. If that assumption is wrong then the
1712 # branch below that reads a persisted component will fail
1713 # so there is no need to complain here.
1714 if compositeStorageClass is not None: 1714 ↛ 1717line 1714 didn't jump to line 1717, because the condition on line 1714 was never false
1715 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1717 if isDisassembled and not refComponent:
1718 # This was a disassembled dataset spread over multiple files
1719 # and we need to put them all back together again.
1720 # Read into memory and then assemble
1722 # Check that the supplied parameters are suitable for the type read
1723 refStorageClass.validateParameters(parameters)
1725 # We want to keep track of all the parameters that were not used
1726 # by formatters. We assume that if any of the component formatters
1727 # use a parameter, we do not need to apply it again in the
1728 # assembler.
1729 usedParams = set()
1731 components: Dict[str, Any] = {}
1732 for getInfo in allGetInfo:
1733 # assemblerParams are parameters not understood by the
1734 # associated formatter.
1735 usedParams.update(set(getInfo.formatterParams))
1737 component = getInfo.component
1739 if component is None: 1739 ↛ 1740line 1739 didn't jump to line 1740, because the condition on line 1739 was never true
1740 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1742 # We do not want the formatter to think it's reading
1743 # a component though because it is really reading a
1744 # standalone dataset -- always tell reader it is not a
1745 # component.
1746 components[component] = self._read_artifact_into_memory(getInfo,
1747 ref.makeComponentRef(component),
1748 isComponent=False)
1750 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1752 # Any unused parameters will have to be passed to the assembler
1753 if parameters:
1754 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1755 else:
1756 unusedParams = {}
1758 # Process parameters
1759 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset,
1760 parameters=unusedParams)
1762 elif isDisassembledReadOnlyComponent:
1764 compositeStorageClass = ref.datasetType.parentStorageClass
1765 if compositeStorageClass is None: 1765 ↛ 1766line 1765 didn't jump to line 1766, because the condition on line 1765 was never true
1766 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since"
1767 "no composite storage class is available.")
1769 if refComponent is None: 1769 ↛ 1771line 1769 didn't jump to line 1771, because the condition on line 1769 was never true
1770 # Mainly for mypy
1771 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1773 # Assume that every derived component can be calculated by
1774 # forwarding the request to a single read/write component.
1775 # Rather than guessing which rw component is the right one by
1776 # scanning each for a derived component of the same name,
1777 # we ask the storage class delegate directly which one is best to
1778 # use.
1779 compositeDelegate = compositeStorageClass.delegate()
1780 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent,
1781 set(allComponents))
1783 # Select the relevant component
1784 rwInfo = allComponents[forwardedComponent]
1786 # For now assume that read parameters are validated against
1787 # the real component and not the requested component
1788 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1789 forwardedStorageClass.validateParameters(parameters)
1791 # The reference to use for the caching must refer to the forwarded
1792 # component and not the derived component.
1793 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
1795 # Unfortunately the FileDescriptor inside the formatter will have
1796 # the wrong write storage class so we need to create a new one
1797 # given the immutability constraint.
1798 writeStorageClass = rwInfo.info.storageClass
1800 # We may need to put some thought into parameters for read
1801 # components but for now forward them on as is
1802 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location,
1803 readStorageClass=refStorageClass,
1804 storageClass=writeStorageClass,
1805 parameters=parameters),
1806 ref.dataId)
1808 # The assembler can not receive any parameter requests for a
1809 # derived component at this time since the assembler will
1810 # see the storage class of the derived component and those
1811 # parameters will have to be handled by the formatter on the
1812 # forwarded storage class.
1813 assemblerParams: Dict[str, Any] = {}
1815 # Need to create a new info that specifies the derived
1816 # component and associated storage class
1817 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter,
1818 rwInfo.info, assemblerParams, {},
1819 refComponent, refStorageClass)
1821 return self._read_artifact_into_memory(readInfo, ref, isComponent=True,
1822 cache_ref=cache_ref)
1824 else:
1825 # Single file request or component from that composite file
1826 for lookup in (refComponent, None): 1826 ↛ 1831line 1826 didn't jump to line 1831, because the loop on line 1826 didn't complete
1827 if lookup in allComponents: 1827 ↛ 1826line 1827 didn't jump to line 1826, because the condition on line 1827 was never false
1828 getInfo = allComponents[lookup]
1829 break
1830 else:
1831 raise FileNotFoundError(f"Component {refComponent} not found "
1832 f"for ref {ref} in datastore {self.name}")
1834 # Do not need the component itself if already disassembled
1835 if isDisassembled:
1836 isComponent = False
1837 else:
1838 isComponent = getInfo.component is not None
1840 # For a component read of a composite we want the cache to
1841 # be looking at the composite ref itself.
1842 cache_ref = ref.makeCompositeRef() if isComponent else ref
1844 # For a disassembled component we can validate parameters against
1845 # the component storage class directly
1846 if isDisassembled:
1847 refStorageClass.validateParameters(parameters)
1848 else:
1849 # For an assembled composite this could be a derived
1850 # component derived from a real component. The validity
1851 # of the parameters is not clear. For now validate against
1852 # the composite storage class
1853 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1855 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent,
1856 cache_ref=cache_ref)
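# Usage sketch (hypothetical component name "wcs"): read a full dataset
# and a single component; unsupported parameters are rejected by
# StorageClass.validateParameters() in the paths above.
#
#     exposure = datastore.get(ref)
#     wcs = datastore.get(ref.makeComponentRef("wcs"))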
1858 @transactional
1859 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1860 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1862 Parameters
1863 ----------
1864 inMemoryDataset : `object`
1865 The dataset to store.
1866 ref : `DatasetRef`
1867 Reference to the associated Dataset.
1869 Raises
1870 ------
1871 TypeError
1872 Supplied object and storage class are inconsistent.
1873 DatasetTypeNotSupportedError
1874 The associated `DatasetType` is not handled by this datastore.
1876 Notes
1877 -----
1878 If the datastore is configured to reject certain dataset types it
1879 is possible that the put will fail and raise a
1880 `DatasetTypeNotSupportedError`. The main use case for this is to
1881 allow `ChainedDatastore` to put to multiple datastores without
1882 requiring that every datastore accepts the dataset.
1883 """
1885 doDisassembly = self.composites.shouldBeDisassembled(ref)
1886 # doDisassembly = True
1888 artifacts = []
1889 if doDisassembly:
1890 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1891 for component, componentInfo in components.items():
1892 # Don't recurse because we want to take advantage of
1893 # bulk insert -- need a new DatasetRef that refers to the
1894 # same dataset_id but has the component DatasetType.
1895 # The parent DatasetType does not refer to the types of its
1896 # components, so we construct the component ref ourselves.
1897 compRef = ref.makeComponentRef(component)
1898 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
1899 artifacts.append((compRef, storedInfo))
1900 else:
1901 # Write the entire thing out
1902 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
1903 artifacts.append((ref, storedInfo))
1905 self._register_datasets(artifacts)
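# Usage sketch (not from the original source): a put() either writes a
# single artifact or, if the composites configuration says the storage
# class should be disassembled, one artifact per component.
#
#     datastore.put(in_memory_dataset, ref)
#     assert datastore.exists(ref)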
1907 @transactional
1908 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
1909 # At this point we can safely remove these datasets from the cache
1910 # to avoid confusion later on. If they are not trashed later
1911 # the cache will simply be refilled.
1912 self.cacheManager.remove_from_cache(ref)
1914 # If we are in trust mode there will be nothing to move to
1915 # the trash table and we will have to try to delete the file
1916 # immediately.
1917 if self.trustGetRequest:
1918 # Try to keep the logic below for a single file trash.
1919 if isinstance(ref, DatasetRef):
1920 refs = {ref}
1921 else:
1922 # Will recreate ref at the end of this branch.
1923 refs = set(ref)
1925 # Determine which datasets are known to datastore directly.
1926 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1927 existing_ids = self._get_stored_records_associated_with_refs(refs)
1928 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
1930 missing = refs - existing_refs
1931 if missing:
1932 # Do an explicit existence check on these refs.
1933 # We only care about the artifacts at this point and not
1934 # the dataset existence.
1935 artifact_existence: Dict[ButlerURI, bool] = {}
1936 _ = self.mexists(missing, artifact_existence)
1937 uris = [uri for uri, exists in artifact_existence.items() if exists]
1939 # FUTURE UPGRADE: Implement a parallelized bulk remove.
1940 log.debug("Removing %d artifacts that are unknown to the datastore records", len(uris))
1941 for uri in uris:
1942 try:
1943 uri.remove()
1944 except Exception as e:
1945 if ignore_errors:
1946 log.debug("Artifact %s could not be removed: %s", uri, e)
1947 continue
1948 raise
1950 # There is no point asking the code below to remove refs we
1951 # know are missing so update it with the list of existing
1952 # records. Try to retain one vs many logic.
1953 if not existing_refs:
1954 # Nothing more to do since none of the datasets were
1955 # known to the datastore record table.
1956 return
1957 ref = list(existing_refs)
1958 if len(ref) == 1:
1959 ref = ref[0]
1961 # Get file metadata and internal metadata
1962 if not isinstance(ref, DatasetRef):
1963 log.debug("Doing multi-dataset trash in datastore %s", self.name)
1964 # Assumed to be an iterable of refs so bulk mode enabled.
1965 try:
1966 self.bridge.moveToTrash(ref)
1967 except Exception as e:
1968 if ignore_errors:
1969 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
1970 else:
1971 raise
1972 return
1974 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
1976 fileLocations = self._get_dataset_locations_info(ref)
1978 if not fileLocations:
1979 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
1980 if ignore_errors:
1981 log.warning(err_msg)
1982 return
1983 else:
1984 raise FileNotFoundError(err_msg)
1986 for location, storedFileInfo in fileLocations:
1987 if not self._artifact_exists(location): 1987 ↛ 1988line 1987 didn't jump to line 1988, because the condition on line 1987 was never true
1988 err_msg = f"Dataset is known to datastore {self.name} but " \
1989 f"associated artifact ({location.uri}) is missing"
1990 if ignore_errors:
1991 log.warning(err_msg)
1992 return
1993 else:
1994 raise FileNotFoundError(err_msg)
1996 # Mark dataset as trashed
1997 try:
1998 self.bridge.moveToTrash([ref])
1999 except Exception as e:
2000 if ignore_errors:
2001 log.warning("Attempted to mark dataset (%s) to be trashed in datastore %s "
2002 "but encountered an error: %s", ref, self.name, e)
2003 pass
2004 else:
2005 raise
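# Usage sketch (hypothetical refs): trash() accepts either a single ref
# or an iterable for bulk mode; artifacts are only removed from disk
# later by emptyTrash().
#
#     datastore.trash(ref)
#     datastore.trash(refs_to_remove, ignore_errors=False)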
2007 @transactional
2008 def emptyTrash(self, ignore_errors: bool = True) -> None:
2009 """Remove all datasets from the trash.
2011 Parameters
2012 ----------
2013 ignore_errors : `bool`
2014 If `True` return without error even if something went wrong.
2015 Problems could occur if another process is simultaneously trying
2016 to delete.
2017 """
2018 log.debug("Emptying trash in datastore %s", self.name)
2020 # Context manager will empty trash iff we finish it without raising.
2021 # It will also automatically delete the relevant rows from the
2022 # trash table and the records table.
2023 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo,
2024 record_column="path") as trash_data:
2025 # Removing the artifacts themselves requires that the files are
2026 # not also associated with refs that are not to be trashed.
2027 # Therefore need to do a query with the file paths themselves
2028 # and return all the refs associated with them. Can only delete
2029 # a file if the refs to be trashed are the only refs associated
2030 # with the file.
2031 # This requires multiple copies of the trashed items
2032 trashed, artifacts_to_keep = trash_data
2034 if artifacts_to_keep is None:
2035 # The bridge is not helping us so have to work it out
2036 # ourselves. This is not going to be as efficient.
2037 trashed = list(trashed)
2039 # The instance check is for mypy since up to this point it
2040 # does not know the type of info.
2041 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed
2042 if isinstance(info, StoredFileInfo)])
2044 for ref, info in trashed:
2046 # Mypy needs to know this is not the base class
2047 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2049 # Check for mypy
2050 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2052 path_map[info.path].remove(ref.id)
2053 if not path_map[info.path]: 2053 ↛ 2044line 2053 didn't jump to line 2044, because the condition on line 2053 was never false
2054 del path_map[info.path]
2056 artifacts_to_keep = set(path_map)
2058 for ref, info in trashed:
2060 # Should not happen for this implementation but need
2061 # to keep mypy happy.
2062 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2064 # Mypy needs to know this is not the base class
2065 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2067 # Check for mypy
2068 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2070 if info.path in artifacts_to_keep:
2071 # This is a multi-dataset artifact and we are not
2072 # removing all associated refs.
2073 continue
2075 # Only trashed refs still known to datastore will be returned.
2076 location = info.file_location(self.locationFactory)
2078 # Point of no return for this artifact
2079 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2080 try:
2081 self._delete_artifact(location)
2082 except FileNotFoundError:
2083 # If the file itself has been deleted there is nothing
2084 # we can do about it. It is possible that trash has
2085 # been run in parallel in another process or someone
2086 # decided to delete the file. It is unlikely to come
2087 # back and so we should still continue with the removal
2088 # of the entry from the trash table. It is also possible
2089 # we removed it in a previous iteration if it was
2090 # a multi-dataset artifact. The delete artifact method
2091 # will log a debug message in this scenario.
2092 # Distinguishing a file that was missing before trash started
2093 # from a file already removed earlier in this trash run
2094 # is not worth the potential memory cost of tracking it.
2096 pass
2097 except Exception as e:
2098 if ignore_errors:
2099 # Use a debug message here even though it's not
2100 # a good situation. In some cases this can be
2101 # caused by a race between user A and user B
2102 # and neither of them has permissions for the
2103 # other's files. Butler does not know about users
2104 # and trash has no idea what collections these
2105 # files were in (without guessing from a path).
2106 log.debug("Encountered error removing artifact %s from datastore %s: %s",
2107 location.uri, self.name, e)
2108 else:
2109 raise
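# Usage sketch (assumption): the usual two-step removal, where trash()
# marks datasets and emptyTrash() deletes the artifacts unless another
# ref still points at the same file.
#
#     datastore.trash(refs_to_remove)
#     datastore.emptyTrash(ignore_errors=True)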
2111 @transactional
2112 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef],
2113 local_refs: Optional[Iterable[DatasetRef]] = None,
2114 transfer: str = "auto",
2115 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> None:
2116 # Docstring inherited
2117 if type(self) is not type(source_datastore):
2118 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the "
2119 f"source datastore ({type(source_datastore)}).")
2121 # Be explicit for mypy
2122 if not isinstance(source_datastore, FileDatastore): 2122 ↛ 2123line 2122 didn't jump to line 2123, because the condition on line 2122 was never true
2123 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not"
2124 f" {type(source_datastore)}")
2126 # Stop early if "direct" transfer mode is requested. That would
2127 # require that the URI inside the source datastore should be stored
2128 # directly in the target datastore, which seems unlikely to be useful
2129 # since at any moment the source datastore could delete the file.
2130 if transfer in ("direct", "split"):
2131 raise ValueError(f"Can not transfer from a source datastore using {transfer} mode since"
2132 " those files are controlled by the other datastore.")
2134 # Empty existence lookup if none given.
2135 if artifact_existence is None:
2136 artifact_existence = {}
2138 # We will go through the list multiple times so must convert
2139 # generators to lists.
2140 refs = list(refs)
2142 if local_refs is None:
2143 local_refs = refs
2144 else:
2145 local_refs = list(local_refs)
2147 # In order to handle disassembled composites the code works
2148 # at the records level since it can assume that internal APIs
2149 # can be used.
2150 # - If the record already exists in the destination this is assumed
2151 # to be okay.
2152 # - If there is no record but the source and destination URIs are
2153 # identical no transfer is done but the record is added.
2154 # - If the source record refers to an absolute URI currently assume
2155 # that that URI should remain absolute and will be visible to the
2156 # destination butler. May need to have a flag to indicate whether
2157 # the dataset should be transferred. This will only happen if
2158 # the detached Butler has had a local ingest.
2160 # What we really want is all the records in the source datastore
2161 # associated with these refs. Or derived ones if they don't exist
2162 # in the source.
2163 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2165 # The source dataset_ids are the keys in these records
2166 source_ids = set(source_records)
2167 log.debug("Number of datastore records found in source: %d", len(source_ids))
2169 # The not None check is to appease mypy
2170 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2171 missing_ids = requested_ids - source_ids
2173 # Missing IDs can be okay if that datastore has allowed
2174 # gets based on file existence. Should we transfer what we can
2175 # or complain about it and warn?
2176 if missing_ids and not source_datastore.trustGetRequest: 2176 ↛ 2177line 2176 didn't jump to line 2177, because the condition on line 2176 was never true
2177 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:"
2178 f" {missing_ids}")
2180 # Need to map these missing IDs to a DatasetRef so we can guess
2181 # the details.
2182 if missing_ids:
2183 log.info("Number of expected datasets missing from source datastore records: %d out of %d",
2184 len(missing_ids), len(requested_ids))
2185 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2187 # This should be chunked in case we end up having to check
2188 # the file store since we need some log output to show
2189 # progress.
2190 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2191 records = {}
2192 for missing in missing_ids_chunk:
2193 # Ask the source datastore where the missing artifacts
2194 # should be. An execution butler might not know about the
2195 # artifacts even if they are there.
2196 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2197 records[missing] = [info for _, info in expected]
2199 # Call the mexists helper method in case we have not already
2200 # checked these artifacts such that artifact_existence is
2201 # empty. This allows us to benefit from parallelism.
2202 # datastore.mexists() itself does not give us access to the
2203 # derived datastore record.
2204 log.verbose("Checking existence of %d datasets unknown to datastore",
2205 len(records))
2206 ref_exists = source_datastore._process_mexists_records(id_to_ref, records, False,
2207 artifact_existence=artifact_existence)
2209 # Now go through the records and propagate the ones that exist.
2210 location_factory = source_datastore.locationFactory
2211 for missing, record_list in records.items():
2212 # Skip completely if the ref does not exist.
2213 ref = id_to_ref[missing]
2214 if not ref_exists[ref]:
2215 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.",
2216 ref)
2217 continue
2218 # Check for file artifact to decide which parts of a
2219 # disassembled composite do exist. If there is only a
2220 # single record we don't even need to look because it can't
2221 # be a composite and must exist.
2222 if len(record_list) == 1:
2223 dataset_records = record_list
2224 else:
2225 dataset_records = [record for record in record_list
2226 if artifact_existence[record.file_location(location_factory).uri]]
2227 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2229 # Rely on source_records being a defaultdict.
2230 source_records[missing].extend(dataset_records)
2232 # See if we already have these records
2233 target_records = self._get_stored_records_associated_with_refs(local_refs)
2235 # The artifacts to register
2236 artifacts = []
2238 # Refs that already exist
2239 already_present = []
2241 # Now can transfer the artifacts
2242 for source_ref, target_ref in zip(refs, local_refs):
2243 if target_ref.id in target_records:
2244 # Already have an artifact for this.
2245 already_present.append(target_ref)
2246 continue
2248 # mypy needs to know these are always resolved refs
2249 for info in source_records[source_ref.getCheckedId()]:
2250 source_location = info.file_location(source_datastore.locationFactory)
2251 target_location = info.file_location(self.locationFactory)
2252 if source_location == target_location: 2252 ↛ 2256line 2252 didn't jump to line 2256, because the condition on line 2252 was never true
2253 # Either the dataset is already in the target datastore
2254 # (which is how execution butler currently runs) or
2255 # it is an absolute URI.
2256 if source_location.pathInStore.isabs():
2257 # Just because we can see the artifact when running
2258 # the transfer doesn't mean it will be generally
2259 # accessible to a user of this butler. For now warn
2260 # but assume it will be accessible.
2261 log.warning("Transfer request found for an artifact outside the datastore at %s",
2262 source_location)
2263 else:
2264 # Need to transfer it to the new location.
2265 # Assume we should always overwrite. If the artifact
2266 # is there this might indicate that a previous transfer
2267 # was interrupted but was not able to be rolled back
2268 # completely (eg pre-emption) so follow Datastore default
2269 # and overwrite.
2270 target_location.uri.transfer_from(source_location.uri, transfer=transfer,
2271 overwrite=True, transaction=self._transaction)
2273 artifacts.append((target_ref, info))
2275 self._register_datasets(artifacts)
2277 if already_present:
2278 n_skipped = len(already_present)
2279 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped,
2280 "" if n_skipped == 1 else "s")
2282 @transactional
2283 def forget(self, refs: Iterable[DatasetRef]) -> None:
2284 # Docstring inherited.
2285 refs = list(refs)
2286 self.bridge.forget(refs)
2287 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2289 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
2290 logFailures: bool = False) -> None:
2291 """Validate some of the configuration for this datastore.
2293 Parameters
2294 ----------
2295 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2296 Entities to test against this configuration. Can be differing
2297 types.
2298 logFailures : `bool`, optional
2299 If `True`, output a log message for every validation error
2300 detected.
2302 Raises
2303 ------
2304 DatastoreValidationError
2305 Raised if there is a validation problem with a configuration.
2306 All the problems are reported in a single exception.
2308 Notes
2309 -----
2310 This method checks that all the supplied entities have valid file
2311 templates and also have formatters defined.
2312 """
2314 templateFailed = None
2315 try:
2316 self.templates.validateTemplates(entities, logFailures=logFailures)
2317 except FileTemplateValidationError as e:
2318 templateFailed = str(e)
2320 formatterFailed = []
2321 for entity in entities:
2322 try:
2323 self.formatterFactory.getFormatterClass(entity)
2324 except KeyError as e:
2325 formatterFailed.append(str(e))
2326 if logFailures: 2326 ↛ 2321line 2326 didn't jump to line 2321, because the condition on line 2326 was never false
2327 log.critical("Formatter failure: %s", e)
2329 if templateFailed or formatterFailed:
2330 messages = []
2331 if templateFailed: 2331 ↛ 2332line 2331 didn't jump to line 2332, because the condition on line 2331 was never true
2332 messages.append(templateFailed)
2333 if formatterFailed: 2333 ↛ 2335line 2333 didn't jump to line 2335, because the condition on line 2333 was never false
2334 messages.append(",".join(formatterFailed))
2335 msg = ";\n".join(messages)
2336 raise DatastoreValidationError(msg)
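# Usage sketch (hypothetical ``dataset_types`` iterable): check that
# templates and formatters are configured for the entities we intend
# to store.
#
#     try:
#         datastore.validateConfiguration(dataset_types, logFailures=True)
#     except DatastoreValidationError as e:
#         log.critical("Datastore configuration problem: %s", e)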
2338 def getLookupKeys(self) -> Set[LookupKey]:
2339 # Docstring is inherited from base class
2340 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
2341 self.constraints.getLookupKeys()
2343 def validateKey(self, lookupKey: LookupKey,
2344 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2345 # Docstring is inherited from base class
2346 # The key can be valid in either formatters or templates so we can
2347 # only check the template if it exists
2348 if lookupKey in self.templates:
2349 try:
2350 self.templates[lookupKey].validateTemplate(entity)
2351 except FileTemplateValidationError as e:
2352 raise DatastoreValidationError(e) from e
2354 def export(self, refs: Iterable[DatasetRef], *,
2355 directory: Optional[Union[ButlerURI, str]] = None,
2356 transfer: Optional[str] = "auto") -> Iterable[FileDataset]:
2357 # Docstring inherited from Datastore.export.
2358 if transfer is not None and directory is None: 2358 ↛ 2359line 2358 didn't jump to line 2359, because the condition on line 2358 was never true
2359 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no "
2360 "export directory given")
2362 # Force the directory to be a URI object
2363 directoryUri: Optional[ButlerURI] = None
2364 if directory is not None: 2364 ↛ 2367line 2364 didn't jump to line 2367, because the condition on line 2364 was never false
2365 directoryUri = ButlerURI(directory, forceDirectory=True)
2367 if transfer is not None and directoryUri is not None: 2367 ↛ 2372line 2367 didn't jump to line 2372, because the condition on line 2367 was never false
2368 # mypy needs the second test
2369 if not directoryUri.exists(): 2369 ↛ 2370line 2369 didn't jump to line 2370, because the condition on line 2369 was never true
2370 raise FileNotFoundError(f"Export location {directory} does not exist")
2372 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2373 for ref in progress.wrap(refs, "Exporting dataset files"):
2374 fileLocations = self._get_dataset_locations_info(ref)
2375 if not fileLocations: 2375 ↛ 2376line 2375 didn't jump to line 2376, because the condition on line 2375 was never true
2376 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2377 # For now we can not export disassembled datasets
2378 if len(fileLocations) > 1:
2379 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2380 location, storedFileInfo = fileLocations[0]
2382 pathInStore = location.pathInStore.path
2383 if transfer is None: 2383 ↛ 2387line 2383 didn't jump to line 2387, because the condition on line 2383 was never true
2384 # TODO: do we also need to return the readStorageClass somehow?
2385 # We will use the path in store directly. If this is an
2386 # absolute URI, preserve it.
2387 if location.pathInStore.isabs():
2388 pathInStore = str(location.uri)
2389 elif transfer == "direct": 2389 ↛ 2391line 2389 didn't jump to line 2391, because the condition on line 2389 was never true
2390 # Use full URIs to the remote store in the export
2391 pathInStore = str(location.uri)
2392 else:
2393 # mypy needs help
2394 assert directoryUri is not None, "directoryUri must be defined to get here"
2395 storeUri = ButlerURI(location.uri)
2397 # if the datastore has an absolute URI to a resource, we
2398 # have two options:
2399 # 1. Keep the absolute URI in the exported YAML
2400 # 2. Allocate a new name in the local datastore and transfer
2401 # it.
2402 # For now go with option 2
2403 if location.pathInStore.isabs(): 2403 ↛ 2404line 2403 didn't jump to line 2404, because the condition on line 2403 was never true
2404 template = self.templates.getTemplate(ref)
2405 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
2406 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2408 exportUri = directoryUri.join(pathInStore)
2409 exportUri.transfer_from(storeUri, transfer=transfer)
2411 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
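# Usage sketch (hypothetical export directory): export yields one
# FileDataset per ref and transfers each artifact into the directory.
#
#     export_dir = ButlerURI("/tmp/export", forceDirectory=True)
#     file_datasets = list(datastore.export(refs, directory=export_dir,
#                                           transfer="copy"))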
2413 @staticmethod
2414 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
2415 """Compute the checksum of the supplied file.
2417 Parameters
2418 ----------
2419 uri : `ButlerURI`
2420 Name of resource to calculate checksum from.
2421 algorithm : `str`, optional
2422 Name of algorithm to use. Must be one of the algorithms supported
2423 by :py:mod:`hashlib`.
2424 block_size : `int`
2425 Number of bytes to read from file at one time.
2427 Returns
2428 -------
2429 hexdigest : `str`
2430 Hex digest of the file.
2432 Notes
2433 -----
2434 Currently returns `None` if the URI is for a remote resource.
2435 """
2436 if algorithm not in hashlib.algorithms_guaranteed: 2436 ↛ 2437line 2436 didn't jump to line 2437, because the condition on line 2436 was never true
2437 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2439 if not uri.isLocal: 2439 ↛ 2440line 2439 didn't jump to line 2440, because the condition on line 2439 was never true
2440 return None
2442 hasher = hashlib.new(algorithm)
2444 with uri.as_local() as local_uri:
2445 with open(local_uri.ospath, "rb") as f:
2446 for chunk in iter(lambda: f.read(block_size), b""):
2447 hasher.update(chunk)
2449 return hasher.hexdigest()
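# Usage sketch (hypothetical local file): compute a hex digest for a
# local artifact; remote URIs currently return `None`.
#
#     digest = FileDatastore.computeChecksum(ButlerURI("/tmp/data.fits"),
#                                            algorithm="sha256")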
2451 def needs_expanded_data_ids(
2452 self,
2453 transfer: Optional[str],
2454 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2455 ) -> bool:
2456 # Docstring inherited.
2457 # This _could_ also use entity to inspect whether the filename template
2458 # involves placeholders other than the required dimensions for its
2459 # dataset type, but that's not necessary for correctness; it just
2460 # enables more optimizations (perhaps only in theory).
2461 return transfer not in ("direct", None)