Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 83%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 Any,
34 ClassVar,
35 Dict,
36 Iterable,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Tuple,
42 Type,
43 Union,
44)
46from lsst.daf.butler import (
47 ButlerURI,
48 CompositesMap,
49 Config,
50 DatasetId,
51 DatasetRef,
52 DatasetType,
53 DatasetTypeNotSupportedError,
54 Datastore,
55 DatastoreCacheManager,
56 DatastoreConfig,
57 DatastoreDisabledCacheManager,
58 DatastoreValidationError,
59 FileDataset,
60 FileDescriptor,
61 FileTemplates,
62 FileTemplateValidationError,
63 Formatter,
64 FormatterFactory,
65 Location,
66 LocationFactory,
67 Progress,
68 StorageClass,
69 StoredFileInfo,
70 ddl,
71)
72from lsst.daf.butler.core.repoRelocation import replaceRoot
73from lsst.daf.butler.core.utils import transactional
74from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
75from lsst.utils.introspection import get_class_of, get_instance_of
76from lsst.utils.iteration import chunk_iterable
78# For VERBOSE logging usage.
79from lsst.utils.logging import VERBOSE, getLogger
80from lsst.utils.timer import time_this
81from sqlalchemy import BigInteger, String
83from .genericDatastore import GenericBaseDatastore
85if TYPE_CHECKING:
86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
89log = getLogger(__name__)
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
101 def __init__(self, datasets: List[FileDataset]):
102 super().__init__(ref for dataset in datasets for ref in dataset.refs)
103 self.datasets = datasets
106@dataclass(frozen=True)
107class DatastoreFileGetInformation:
108 """Collection of useful parameters needed to retrieve a file from
109 a Datastore.
110 """
112 location: Location
113 """The location from which to read the dataset."""
115 formatter: Formatter
116 """The `Formatter` to use to deserialize the dataset."""
118 info: StoredFileInfo
119 """Stored information about this file and its formatter."""
121 assemblerParams: Dict[str, Any]
122 """Parameters to use for post-processing the retrieved dataset."""
124 formatterParams: Dict[str, Any]
125 """Parameters that were understood by the associated formatter."""
127 component: Optional[str]
128 """The component to be retrieved (can be `None`)."""
130 readStorageClass: StorageClass
131 """The `StorageClass` of the dataset being read."""
134class FileDatastore(GenericBaseDatastore):
135 """Generic Datastore for file-based implementations.
137 Should always be sub-classed since key abstract methods are missing.
139 Parameters
140 ----------
141 config : `DatastoreConfig` or `str`
142 Configuration as either a `Config` object or URI to file.
143 bridgeManager : `DatastoreRegistryBridgeManager`
144 Object that manages the interface between `Registry` and datastores.
145 butlerRoot : `str`, optional
146 New datastore root to use to override the configuration value.
148 Raises
149 ------
150 ValueError
151 If root location does not exist and ``create`` is `False` in the
152 configuration.
153 """
155 defaultConfigFile: ClassVar[Optional[str]] = None
156 """Path to configuration defaults. Accessed within the ``config`` resource
157 or relative to a search path. Can be None if no defaults specified.
158 """
160 root: ButlerURI
161 """Root directory URI of this `Datastore`."""
163 locationFactory: LocationFactory
164 """Factory for creating locations relative to the datastore root."""
166 formatterFactory: FormatterFactory
167 """Factory for creating instances of formatters."""
169 templates: FileTemplates
170 """File templates that can be used by this `Datastore`."""
172 composites: CompositesMap
173 """Determines whether a dataset should be disassembled on put."""
175 defaultConfigFile = "datastores/fileDatastore.yaml"
176 """Path to configuration defaults. Accessed within the ``config`` resource
177 or relative to a search path. Can be None if no defaults specified.
178 """
180 @classmethod
181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
182 """Set any filesystem-dependent config options for this Datastore to
183 be appropriate for a new empty repository with the given root.
185 Parameters
186 ----------
187 root : `str`
188 URI to the root of the data repository.
189 config : `Config`
190 A `Config` to update. Only the subset understood by
191 this component will be updated. Will not expand
192 defaults.
193 full : `Config`
194 A complete config with all defaults expanded that can be
195 converted to a `DatastoreConfig`. Read-only and will not be
196 modified by this method.
197 Repository-specific options that should not be obtained
198 from defaults when Butler instances are constructed
199 should be copied from ``full`` to ``config``.
200 overwrite : `bool`, optional
201 If `False`, do not modify a value in ``config`` if the value
202 already exists. Default is always to overwrite with the provided
203 ``root``.
205 Notes
206 -----
207 If a keyword is explicitly defined in the supplied ``config`` it
208 will not be overridden by this method if ``overwrite`` is `False`.
209 This allows explicit values set in external configs to be retained.
210 """
211 Config.updateParameters(
212 DatastoreConfig,
213 config,
214 full,
215 toUpdate={"root": root},
216 toCopy=("cls", ("records", "table")),
217 overwrite=overwrite,
218 )
220 @classmethod
221 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
222 return ddl.TableSpec(
223 fields=[
224 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
225 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
226 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
227 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
228 # Use empty string to indicate no component
229 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
230 # TODO: should checksum be Base64Bytes instead?
231 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
232 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
233 ],
234 unique=frozenset(),
235 indexes=[tuple(["path"])],
236 )
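# Illustrative sketch (not part of the original module): the same record
# layout expressed directly in SQLAlchemy Core rather than via ddl.TableSpec.
# The table name and the Integer dataset_id type are assumptions; the real
# dataset_id column type is supplied by the registry bridge at runtime.
from sqlalchemy import BigInteger, Column, Integer, MetaData, String, Table

metadata = MetaData()
example_records_table = Table(
    "file_datastore_records",  # hypothetical name; the real name comes from config
    metadata,
    Column("dataset_id", Integer, primary_key=True),
    Column("path", String(256), nullable=False, index=True),
    Column("formatter", String(128), nullable=False),
    Column("storage_class", String(64), nullable=False),
    Column("component", String(32), primary_key=True),  # "" means no component
    Column("checksum", String(128), nullable=True),
    Column("file_size", BigInteger, nullable=True),
)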
238 def __init__(
239 self,
240 config: Union[DatastoreConfig, str],
241 bridgeManager: DatastoreRegistryBridgeManager,
242 butlerRoot: str = None,
243 ):
244 super().__init__(config, bridgeManager)
245 if "root" not in self.config:
246 raise ValueError("No root directory specified in configuration")
248 # Name ourselves either using an explicit name or a name
249 # derived from the (unexpanded) root
250 if "name" in self.config:
251 self.name = self.config["name"]
252 else:
253 # We use the unexpanded root in the name to indicate that this
254 # datastore can be moved without having to update registry.
255 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
257 # Support repository relocation in config
258 # Existence of self.root is checked in subclass
259 self.root = ButlerURI(
260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
261 )
263 self.locationFactory = LocationFactory(self.root)
264 self.formatterFactory = FormatterFactory()
266 # Now associate formatters with storage classes
267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
269 # Read the file naming templates
270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
272 # See if composites should be disassembled
273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
275 tableName = self.config["records", "table"]
276 try:
277 # Storage of paths and formatters, keyed by dataset_id
278 self._table = bridgeManager.opaque.register(
279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
280 )
281 # Interface to Registry.
282 self._bridge = bridgeManager.register(self.name)
283 except ReadOnlyDatabaseError:
284 # If the database is read only and we just tried and failed to
285 # create a table, it means someone is trying to create a read-only
286 # butler client for an empty repo. That should be okay, as long
287 # as they then try to get any datasets before some other client
288 # creates the table. Chances are they're just validating
289 # configuration.
290 pass
292 # Determine whether checksums should be used - default to False
293 self.useChecksum = self.config.get("checksum", False)
295 # Determine whether we can fall back to configuration if a
296 # requested dataset is not known to registry
297 self.trustGetRequest = self.config.get("trust_get_request", False)
299 # Create a cache manager
300 self.cacheManager: AbstractDatastoreCacheManager
301 if "cached" in self.config:
302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
303 else:
304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
306 # Check existence and create directory structure if necessary
307 if not self.root.exists():
308 if "create" not in self.config or not self.config["create"]:
309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
310 try:
311 self.root.mkdir()
312 except Exception as e:
313 raise ValueError(
314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
315 ) from e
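# Illustrative sketch (not part of the original module): the configuration
# keys consulted by __init__ above, written out as a plain dict. The key
# names come from the code; the values and the "<butlerRoot>" placeholder are
# examples only, and the real configuration is a DatastoreConfig normally
# loaded from YAML.
example_datastore_config = {
    "root": "<butlerRoot>/datastore",          # expanded through replaceRoot()
    "create": True,                            # allow the root directory to be created
    # "name": "myDatastore",                   # optional explicit datastore name
    "checksum": False,                         # compute checksums on put/ingest
    "trust_get_request": False,                # guess locations if registry has no record
    "records": {"table": "file_datastore_records"},
    "formatters": {},                          # storage class -> formatter mappings
    "templates": {},                           # file naming templates
    "composites": {},                          # disassembly rules
    "cached": {},                              # cache manager settings (optional)
}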
317 def __str__(self) -> str:
318 return str(self.root)
320 @property
321 def bridge(self) -> DatastoreRegistryBridge:
322 return self._bridge
324 def _artifact_exists(self, location: Location) -> bool:
325 """Check that an artifact exists in this datastore at the specified
326 location.
328 Parameters
329 ----------
330 location : `Location`
331 Expected location of the artifact associated with this datastore.
333 Returns
334 -------
335 exists : `bool`
336 `True` if the location can be found, `False` otherwise.
337 """
338 log.debug("Checking if resource exists: %s", location.uri)
339 return location.uri.exists()
341 def _delete_artifact(self, location: Location) -> None:
342 """Delete the artifact from the datastore.
344 Parameters
345 ----------
346 location : `Location`
347 Location of the artifact associated with this datastore.
348 """
349 if location.pathInStore.isabs():
350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
352 try:
353 location.uri.remove()
354 except FileNotFoundError:
355 log.debug("File %s did not exist and so could not be deleted.", location.uri)
356 raise
357 except Exception as e:
358 log.critical("Failed to delete file: %s (%s)", location.uri, e)
359 raise
360 log.debug("Successfully deleted file: %s", location.uri)
362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
363 # Docstring inherited from GenericBaseDatastore
364 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
365 self._table.insert(*records)
367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
368 # Docstring inherited from GenericBaseDatastore
370 # Look for the dataset_id -- there might be multiple matches
371 # if we have disassembled the dataset.
372 records = self._table.fetch(dataset_id=ref.id)
373 return [StoredFileInfo.from_record(record) for record in records]
375 def _get_stored_records_associated_with_refs(
376 self, refs: Iterable[DatasetIdRef]
377 ) -> Dict[DatasetId, List[StoredFileInfo]]:
378 """Retrieve all records associated with the provided refs.
380 Parameters
381 ----------
382 refs : iterable of `DatasetIdRef`
383 The refs for which records are to be retrieved.
385 Returns
386 -------
387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
388 The matching records indexed by the ref ID. The number of entries
389 in the dict can be smaller than the number of requested refs.
390 """
391 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
393 # Uniqueness is dataset_id + component so can have multiple records
394 # per ref.
395 records_by_ref = defaultdict(list)
396 for record in records:
397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
398 return records_by_ref
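# Illustrative sketch (not part of the original module) of the grouping
# pattern used above, with plain dicts standing in for opaque-table rows and
# for StoredFileInfo.
from collections import defaultdict

example_rows = [
    {"dataset_id": 1, "component": "", "path": "a.fits"},
    {"dataset_id": 2, "component": "image", "path": "b_image.fits"},
    {"dataset_id": 2, "component": "mask", "path": "b_mask.fits"},
]
records_by_ref = defaultdict(list)
for row in example_rows:
    records_by_ref[row["dataset_id"]].append(row)
assert len(records_by_ref[2]) == 2  # a disassembled dataset has multiple records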
400 def _refs_associated_with_artifacts(
401 self, paths: List[Union[str, ButlerURI]]
402 ) -> Dict[str, Set[DatasetId]]:
403 """Return paths and associated dataset refs.
405 Parameters
406 ----------
407 paths : `list` of `str` or `ButlerURI`
408 All the paths to include in search.
410 Returns
411 -------
412 mapping : `dict` of [`str`, `set` [`DatasetId`]]
413 Mapping of each path to a set of associated database IDs.
414 """
415 records = self._table.fetch(path=[str(path) for path in paths])
416 result = defaultdict(set)
417 for row in records:
418 result[row["path"]].add(row["dataset_id"])
419 return result
421 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]:
422 """Return all dataset refs associated with the supplied path.
424 Parameters
425 ----------
426 pathInStore : `ButlerURI`
427 Path of interest in the data store.
429 Returns
430 -------
431 ids : `set` of `int`
432 All `DatasetRef` IDs associated with this path.
433 """
434 records = list(self._table.fetch(path=str(pathInStore)))
435 ids = {r["dataset_id"] for r in records}
436 return ids
438 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
439 # Docstring inherited from GenericBaseDatastore
440 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
442 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
443 r"""Find all the `Location`\ s of the requested dataset in the
444 `Datastore` and the associated stored file information.
446 Parameters
447 ----------
448 ref : `DatasetRef`
449 Reference to the required `Dataset`.
451 Returns
452 -------
453 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
454 Location of the dataset within the datastore and
455 stored information about each file and its formatter.
456 """
457 # Get the file information (this will fail if no file)
458 records = self.getStoredItemsInfo(ref)
460 # Use the path to determine the location -- we need to take
461 # into account absolute URIs in the datastore record
462 return [(r.file_location(self.locationFactory), r) for r in records]
464 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
465 """Check that there is only one dataset associated with the
466 specified artifact.
468 Parameters
469 ----------
470 ref : `DatasetRef` or `FakeDatasetRef`
471 Dataset to be removed.
472 location : `Location`
473 The location of the artifact to be removed.
475 Returns
476 -------
477 can_remove : `bool`
478 True if the artifact can be safely removed.
479 """
480 # Can't ever delete absolute URIs.
481 if location.pathInStore.isabs():
482 return False
484 # Get all entries associated with this path
485 allRefs = self._registered_refs_per_artifact(location.pathInStore)
486 if not allRefs:
487 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
489 # Remove these refs from all the refs and if there is nothing left
490 # then we can delete
491 remainingRefs = allRefs - {ref.id}
493 if remainingRefs:
494 return False
495 return True
497 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]:
498 """Predict the location and related file information of the requested
499 dataset in this datastore.
501 Parameters
502 ----------
503 ref : `DatasetRef`
504 Reference to the required `Dataset`.
506 Returns
507 -------
508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
509 Expected Location of the dataset within the datastore and
510 placeholder information about each file and its formatter.
512 Notes
513 -----
514 Uses the current configuration to determine how we would expect the
515 datastore files to have been written if we couldn't ask registry.
516 This is safe so long as there has been no change to datastore
517 configuration between writing the dataset and wanting to read it.
518 Will not work for files that have been ingested without using the
519 standard file template or default formatter.
520 """
522 # If we have a component ref we always need to ask the questions
523 # of the composite. If the composite is disassembled this routine
524 # should return all components. If the composite was not
525 # disassembled the composite is what is stored regardless of
526 # component request. Note that if the caller has disassembled
527 # a composite there is no way for this guess to know that
528 # without trying both the composite and component ref and seeing
529 # if there is something at the component Location even without
530 # disassembly being enabled.
531 if ref.datasetType.isComponent():
532 ref = ref.makeCompositeRef()
534 # See if the ref is a composite that should be disassembled
535 doDisassembly = self.composites.shouldBeDisassembled(ref)
537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
539 if doDisassembly:
540 for component, componentStorage in ref.datasetType.storageClass.components.items():
541 compRef = ref.makeComponentRef(component)
542 location, formatter = self._determine_put_formatter_location(compRef)
543 all_info.append((location, formatter, componentStorage, component))
545 else:
546 # Always use the composite ref if no disassembly
547 location, formatter = self._determine_put_formatter_location(ref)
548 all_info.append((location, formatter, ref.datasetType.storageClass, None))
550 # Convert the list of tuples to have StoredFileInfo as second element
551 return [
552 (
553 location,
554 StoredFileInfo(
555 formatter=formatter,
556 path=location.pathInStore.path,
557 storageClass=storageClass,
558 component=component,
559 checksum=None,
560 file_size=-1,
561 ),
562 )
563 for location, formatter, storageClass, component in all_info
564 ]
566 def _prepare_for_get(
567 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
568 ) -> List[DatastoreFileGetInformation]:
569 """Check parameters for ``get`` and obtain formatter and
570 location.
572 Parameters
573 ----------
574 ref : `DatasetRef`
575 Reference to the required Dataset.
576 parameters : `dict`
577 `StorageClass`-specific parameters that specify, for example,
578 a slice of the dataset to be loaded.
580 Returns
581 -------
582 getInfo : `list` [`DatastoreFileGetInformation`]
583 Parameters needed to retrieve each file.
584 """
585 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
587 # Get file metadata and internal metadata
588 fileLocations = self._get_dataset_locations_info(ref)
589 if not fileLocations:
590 if not self.trustGetRequest:
591 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
592 # Assume the dataset is where we think it should be
593 fileLocations = self._get_expected_dataset_locations_info(ref)
595 # The storage class we want to use eventually
596 refStorageClass = ref.datasetType.storageClass
598 if len(fileLocations) > 1:
599 disassembled = True
601 # If trust is involved it is possible that there will be
602 # components listed here that do not exist in the datastore.
603 # Explicitly check for file artifact existence and filter out any
604 # that are missing.
605 if self.trustGetRequest:
606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
608 # For now complain only if we have no components at all. One
609 # component is probably a problem but we can punt that to the
610 # assembler.
611 if not fileLocations:
612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
614 else:
615 disassembled = False
617 # Is this a component request?
618 refComponent = ref.datasetType.component()
620 fileGetInfo = []
621 for location, storedFileInfo in fileLocations:
623 # The storage class used to write the file
624 writeStorageClass = storedFileInfo.storageClass
626 # If this has been disassembled we need read to match the write
627 if disassembled:
628 readStorageClass = writeStorageClass
629 else:
630 readStorageClass = refStorageClass
632 formatter = get_instance_of(
633 storedFileInfo.formatter,
634 FileDescriptor(
635 location,
636 readStorageClass=readStorageClass,
637 storageClass=writeStorageClass,
638 parameters=parameters,
639 ),
640 ref.dataId,
641 )
643 formatterParams, notFormatterParams = formatter.segregateParameters()
645 # Of the remaining parameters, extract the ones supported by
646 # this StorageClass (for components not all will be handled)
647 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
649 # The ref itself could be a component if the dataset was
650 # disassembled by butler, or we disassembled in datastore and
651 # components came from the datastore records
652 component = storedFileInfo.component if storedFileInfo.component else refComponent
654 fileGetInfo.append(
655 DatastoreFileGetInformation(
656 location,
657 formatter,
658 storedFileInfo,
659 assemblerParams,
660 formatterParams,
661 component,
662 readStorageClass,
663 )
664 )
666 return fileGetInfo
668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
669 """Check the arguments for ``put`` and obtain formatter and
670 location.
672 Parameters
673 ----------
674 inMemoryDataset : `object`
675 The dataset to store.
676 ref : `DatasetRef`
677 Reference to the associated Dataset.
679 Returns
680 -------
681 location : `Location`
682 The location to write the dataset.
683 formatter : `Formatter`
684 The `Formatter` to use to write the dataset.
686 Raises
687 ------
688 TypeError
689 Supplied object and storage class are inconsistent.
690 DatasetTypeNotSupportedError
691 The associated `DatasetType` is not handled by this datastore.
692 """
693 self._validate_put_parameters(inMemoryDataset, ref)
694 return self._determine_put_formatter_location(ref)
696 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
697 """Calculate the formatter and output location to use for put.
699 Parameters
700 ----------
701 ref : `DatasetRef`
702 Reference to the associated Dataset.
704 Returns
705 -------
706 location : `Location`
707 The location to write the dataset.
708 formatter : `Formatter`
709 The `Formatter` to use to write the dataset.
710 """
711 # Work out output file name
712 try:
713 template = self.templates.getTemplate(ref)
714 except KeyError as e:
715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
717 # Validate the template to protect against filenames from different
718 # dataIds returning the same and causing overwrite confusion.
719 template.validateTemplate(ref)
721 location = self.locationFactory.fromPath(template.format(ref))
723 # Get the formatter based on the storage class
724 storageClass = ref.datasetType.storageClass
725 try:
726 formatter = self.formatterFactory.getFormatter(
727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
728 )
729 except KeyError as e:
730 raise DatasetTypeNotSupportedError(
731 f"Unable to find formatter for {ref} in datastore {self.name}"
732 ) from e
734 # Now that we know the formatter, update the location
735 location = formatter.makeUpdatedLocation(location)
737 return location, formatter
739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
740 # Docstring inherited from base class
741 if transfer != "auto":
742 return transfer
744 # See if the paths are within the datastore or not
745 inside = [self._pathInStore(d.path) is not None for d in datasets]
747 if all(inside):
748 transfer = None
749 elif not any(inside):
750 # Allow ButlerURI to use its own knowledge
751 transfer = "auto"
752 else:
753 # This can happen when importing from a datastore that
754 # has had some datasets ingested using "direct" mode.
755 # Also allow ButlerURI to sort it out but warn about it.
756 # This can happen if you are importing from a datastore
757 # that had some direct transfer datasets.
758 log.warning(
759 "Some datasets are inside the datastore and some are outside. Using 'split' "
760 "transfer mode. This assumes that the files outside the datastore are "
761 "still accessible to the new butler since they will not be copied into "
762 "the target datastore."
763 )
764 transfer = "split"
766 return transfer
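# Illustrative sketch (not part of the original module): the "auto" override
# above reduces to an all()/any() test over whether each file already lies
# inside the datastore root. Booleans stand in for
# self._pathInStore(d.path) is not None.
from typing import List, Optional

def choose_transfer(inside: List[bool]) -> Optional[str]:
    if all(inside):
        return None      # everything already inside: no transfer needed
    if not any(inside):
        return "auto"    # everything outside: let ButlerURI decide
    return "split"       # a mixture: ingest outside files by reference

assert choose_transfer([True, True]) is None
assert choose_transfer([False, False]) == "auto"
assert choose_transfer([True, False]) == "split"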
768 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]:
769 """Return path relative to datastore root
771 Parameters
772 ----------
773 path : `str` or `ButlerURI`
774 Path to dataset. Can be absolute URI. If relative assumed to
775 be relative to the datastore. The return value is the path within
776 the datastore, or `None` if the path is outside the root.
778 Returns
779 -------
780 inStore : `str` or `None`
781 Path relative to datastore root. Returns `None` if the file is
782 outside the root.
783 """
784 # Relative path will always be relative to datastore
785 pathUri = ButlerURI(path, forceAbsolute=False)
786 return pathUri.relative_to(self.root)
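# Illustrative sketch (not part of the original module) of the
# "is this path inside the root?" test for the local-filesystem case only;
# ButlerURI.relative_to() additionally understands remote URI schemes.
from pathlib import PurePosixPath
from typing import Optional

def path_in_store(path: str, root: str) -> Optional[str]:
    p = PurePosixPath(path)
    if not p.is_absolute():
        return str(p)             # relative paths are relative to the root
    try:
        return str(p.relative_to(root))
    except ValueError:
        return None               # outside the datastore root

assert path_in_store("/repo/data/a.fits", "/repo/data") == "a.fits"
assert path_in_store("/elsewhere/a.fits", "/repo/data") is None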
788 def _standardizeIngestPath(
789 self, path: Union[str, ButlerURI], *, transfer: Optional[str] = None
790 ) -> Union[str, ButlerURI]:
791 """Standardize the path of a to-be-ingested file.
793 Parameters
794 ----------
795 path : `str` or `ButlerURI`
796 Path of a file to be ingested.
797 transfer : `str`, optional
798 How (and whether) the dataset should be added to the datastore.
799 See `ingest` for details of transfer modes.
800 This implementation is provided only so
801 `NotImplementedError` can be raised if the mode is not supported;
802 actual transfers are deferred to `_extractIngestInfo`.
804 Returns
805 -------
806 path : `str` or `ButlerURI`
807 New path in what the datastore considers standard form. If an
808 absolute URI was given that will be returned unchanged.
810 Notes
811 -----
812 Subclasses of `FileDatastore` can implement this method instead
813 of `_prepIngest`. It should not modify the data repository or given
814 file in any way.
816 Raises
817 ------
818 NotImplementedError
819 Raised if the datastore does not support the given transfer mode
820 (including the case where ingest is not supported at all).
821 FileNotFoundError
822 Raised if one of the given files does not exist.
823 """
824 if transfer not in (None, "direct", "split") + self.root.transferModes:
825 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
827 # A relative URI indicates relative to datastore root
828 srcUri = ButlerURI(path, forceAbsolute=False)
829 if not srcUri.isabs():
830 srcUri = self.root.join(path)
832 if not srcUri.exists():
833 raise FileNotFoundError(
834 f"Resource at {srcUri} does not exist; note that paths to ingest "
835 f"are assumed to be relative to {self.root} unless they are absolute."
836 )
838 if transfer is None:
839 relpath = srcUri.relative_to(self.root)
840 if not relpath:
841 raise RuntimeError(
842 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
843 )
845 # Return the relative path within the datastore for internal
846 # transfer
847 path = relpath
849 return path
851 def _extractIngestInfo(
852 self,
853 path: Union[str, ButlerURI],
854 ref: DatasetRef,
855 *,
856 formatter: Union[Formatter, Type[Formatter]],
857 transfer: Optional[str] = None,
858 ) -> StoredFileInfo:
859 """Relocate (if necessary) and extract `StoredFileInfo` from a
860 to-be-ingested file.
862 Parameters
863 ----------
864 path : `str` or `ButlerURI`
865 URI or path of a file to be ingested.
866 ref : `DatasetRef`
867 Reference for the dataset being ingested. Guaranteed to have
868 ``dataset_id is not None``.
869 formatter : `type` or `Formatter`
870 `Formatter` subclass to use for this dataset or an instance.
871 transfer : `str`, optional
872 How (and whether) the dataset should be added to the datastore.
873 See `ingest` for details of transfer modes.
875 Returns
876 -------
877 info : `StoredFileInfo`
878 Internal datastore record for this file. This will be inserted by
879 the caller; the `_extractIngestInfo` is only responsible for
880 creating and populating the struct.
882 Raises
883 ------
884 FileNotFoundError
885 Raised if one of the given files does not exist.
886 FileExistsError
887 Raised if transfer is not `None` but the (internal) location the
888 file would be moved to is already occupied.
889 """
890 if self._transaction is None:
891 raise RuntimeError("Ingest called without transaction enabled")
893 # Create URI of the source path, do not need to force a relative
894 # path to absolute.
895 srcUri = ButlerURI(path, forceAbsolute=False)
897 # Track whether we have read the size of the source yet
898 have_sized = False
900 tgtLocation: Optional[Location]
901 if transfer is None or transfer == "split":
902 # A relative path is assumed to be relative to the datastore
903 # in this context
904 if not srcUri.isabs():
905 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
906 else:
907 # Work out the path in the datastore from an absolute URI
908 # This is required to be within the datastore.
909 pathInStore = srcUri.relative_to(self.root)
910 if pathInStore is None and transfer is None:
911 raise RuntimeError(
912 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
913 )
914 if pathInStore:
915 tgtLocation = self.locationFactory.fromPath(pathInStore)
916 elif transfer == "split":
917 # Outside the datastore but treat that as a direct ingest
918 # instead.
919 tgtLocation = None
920 else:
921 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
922 elif transfer == "direct":
923 # Want to store the full URI to the resource directly in
924 # datastore. This is useful for referring to permanent archive
925 # storage for raw data.
926 # Trust that people know what they are doing.
927 tgtLocation = None
928 else:
929 # Work out the name we want this ingested file to have
930 # inside the datastore
931 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
932 if not tgtLocation.uri.dirname().exists():
933 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
934 tgtLocation.uri.dirname().mkdir()
936 # if we are transferring from a local file to a remote location
937 # it may be more efficient to get the size and checksum of the
938 # local file rather than the transferred one
939 if not srcUri.scheme or srcUri.scheme == "file":
940 size = srcUri.size()
941 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
942 have_sized = True
944 # Transfer the resource to the destination.
945 # Allow overwrite of an existing file. This matches the behavior
946 # of datastore.put() in that it trusts that registry would not
947 # be asking to overwrite unless registry thought that the
948 # overwrite was allowed.
949 tgtLocation.uri.transfer_from(
950 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
951 )
953 if tgtLocation is None:
954 # This means we are using direct mode
955 targetUri = srcUri
956 targetPath = str(srcUri)
957 else:
958 targetUri = tgtLocation.uri
959 targetPath = tgtLocation.pathInStore.path
961 # the file should exist in the datastore now
962 if not have_sized:
963 size = targetUri.size()
964 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
966 return StoredFileInfo(
967 formatter=formatter,
968 path=targetPath,
969 storageClass=ref.datasetType.storageClass,
970 component=ref.datasetType.component(),
971 file_size=size,
972 checksum=checksum,
973 )
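# Illustrative sketch (not part of the original module): computeChecksum()
# itself is outside this excerpt, but a chunked hashlib digest is the
# standard pattern; the algorithm and block size here are assumptions.
import hashlib

def compute_checksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    hasher = hashlib.new(algorithm)
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()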
975 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
976 # Docstring inherited from Datastore._prepIngest.
977 filtered = []
978 for dataset in datasets:
979 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
980 if not acceptable:
981 continue
982 else:
983 dataset.refs = acceptable
984 if dataset.formatter is None:
985 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
986 else:
987 assert isinstance(dataset.formatter, (type, str))
988 formatter_class = get_class_of(dataset.formatter)
989 if not issubclass(formatter_class, Formatter):
990 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
991 dataset.formatter = formatter_class
992 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
993 filtered.append(dataset)
994 return _IngestPrepData(filtered)
996 @transactional
997 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
998 # Docstring inherited from Datastore._finishIngest.
999 refsAndInfos = []
1000 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1001 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1002 # Do ingest as if the first dataset ref is associated with the file
1003 info = self._extractIngestInfo(
1004 dataset.path, dataset.refs[0], formatter=dataset.formatter, transfer=transfer
1005 )
1006 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1007 self._register_datasets(refsAndInfos)
1009 def _calculate_ingested_datastore_name(
1010 self, srcUri: ButlerURI, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]]
1011 ) -> Location:
1012 """Given a source URI and a DatasetRef, determine the name the
1013 dataset will have inside datastore.
1015 Parameters
1016 ----------
1017 srcUri : `ButlerURI`
1018 URI to the source dataset file.
1019 ref : `DatasetRef`
1020 Ref associated with the newly-ingested dataset artifact. This
1021 is used to determine the name within the datastore.
1022 formatter : `Formatter` or `Formatter` class
1023 Formatter to use for validation. Can be a class or an instance.
1025 Returns
1026 -------
1027 location : `Location`
1028 Target location for the newly-ingested dataset.
1029 """
1030 # Ingesting a file from outside the datastore.
1031 # This involves a new name.
1032 template = self.templates.getTemplate(ref)
1033 location = self.locationFactory.fromPath(template.format(ref))
1035 # Get the extension
1036 ext = srcUri.getExtension()
1038 # Update the destination to include that extension
1039 location.updateExtension(ext)
1041 # Ask the formatter to validate this extension
1042 formatter.validateExtension(location)
1044 return location
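# Illustrative sketch (not part of the original module): the essential step
# above is carrying the source file's extension over to the templated
# destination name. os.path stands in for ButlerURI/Location here and does
# not handle double extensions such as ".fits.gz".
import os

def ingested_name(template_path: str, src_path: str) -> str:
    _, ext = os.path.splitext(src_path)       # e.g. ".fits"
    base, _ = os.path.splitext(template_path)
    return base + ext

assert ingested_name("raw/r/exp_001", "/incoming/file.fits") == "raw/r/exp_001.fits"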
1046 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1047 """Write out in memory dataset to datastore.
1049 Parameters
1050 ----------
1051 inMemoryDataset : `object`
1052 Dataset to write to datastore.
1053 ref : `DatasetRef`
1054 Registry information associated with this dataset.
1056 Returns
1057 -------
1058 info : `StoredFileInfo`
1059 Information describing the artifact written to the datastore.
1060 """
1061 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1062 uri = location.uri
1064 if not uri.dirname().exists():
1065 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1066 uri.dirname().mkdir()
1068 if self._transaction is None:
1069 raise RuntimeError("Attempting to write artifact without transaction enabled")
1071 def _removeFileExists(uri: ButlerURI) -> None:
1072 """Remove a file and do not complain if it is not there.
1074 This is important since a formatter might fail before the file
1075 is written and we should not confuse people by writing spurious
1076 error messages to the log.
1077 """
1078 try:
1079 uri.remove()
1080 except FileNotFoundError:
1081 pass
1083 # Register a callback to try to delete the uploaded data if
1084 # something fails below
1085 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1087 # For a local file, simply use the formatter directly
1088 if uri.isLocal:
1089 try:
1090 formatter.write(inMemoryDataset)
1091 except Exception as e:
1092 raise RuntimeError(
1093 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}"
1094 ) from e
1095 log.debug("Successfully wrote python object to local file at %s", uri)
1096 else:
1097 # This is a remote URI. Some datasets can be serialized directly
1098 # to bytes and sent to the remote datastore without writing a
1099 # file. If the dataset is intended to be saved to the cache
1100 # a file is always written and direct write to the remote
1101 # datastore is bypassed.
1102 data_written = False
1103 if not self.cacheManager.should_be_cached(ref):
1104 try:
1105 serializedDataset = formatter.toBytes(inMemoryDataset)
1106 except NotImplementedError:
1107 # Fallback to the file writing option.
1108 pass
1109 except Exception as e:
1110 raise RuntimeError(
1111 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1112 ) from e
1113 else:
1114 log.debug("Writing bytes directly to %s", uri)
1115 uri.write(serializedDataset, overwrite=True)
1116 log.debug("Successfully wrote bytes directly to %s", uri)
1117 data_written = True
1119 if not data_written:
1120 # Did not write the bytes directly to object store so instead
1121 # write to temporary file.
1122 with ButlerURI.temporary_uri(suffix=uri.getExtension()) as temporary_uri:
1123 # Need to configure the formatter to write to a different
1124 # location and that needs us to overwrite internals
1125 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1126 with formatter._updateLocation(Location(None, temporary_uri)):
1127 try:
1128 formatter.write(inMemoryDataset)
1129 except Exception as e:
1130 raise RuntimeError(
1131 f"Failed to serialize dataset {ref} of type"
1132 f" {type(inMemoryDataset)} to "
1133 f"temporary location {temporary_uri}"
1134 ) from e
1135 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True)
1137 # Cache if required
1138 self.cacheManager.move_to_cache(temporary_uri, ref)
1140 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1142 # URI is needed to resolve what ingest case are we dealing with
1143 return self._extractIngestInfo(uri, ref, formatter=formatter)
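# Illustrative sketch (not part of the original module) of the control flow
# above for remote URIs: try a direct byte upload, then fall back to a
# temporary local file that is transferred afterwards. serialize_to_bytes,
# write_to_file and upload are hypothetical callables, not daf_butler APIs.
import os
import tempfile

def write_remote(dataset, serialize_to_bytes, write_to_file, upload) -> None:
    try:
        payload = serialize_to_bytes(dataset)
    except NotImplementedError:
        payload = None                 # serializer cannot produce bytes directly
    if payload is not None:
        upload(payload)                # upload the serialized bytes directly
        return
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpfile = os.path.join(tmpdir, "dataset.tmp")
        write_to_file(dataset, tmpfile)
        with open(tmpfile, "rb") as f:
            upload(f.read())           # fall back to transferring the temporary file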
1145 def _read_artifact_into_memory(
1146 self,
1147 getInfo: DatastoreFileGetInformation,
1148 ref: DatasetRef,
1149 isComponent: bool = False,
1150 cache_ref: Optional[DatasetRef] = None,
1151 ) -> Any:
1152 """Read the artifact from the datastore into an in-memory object.
1154 Parameters
1155 ----------
1156 getInfo : `DatastoreFileGetInformation`
1157 Information about the artifact within the datastore.
1158 ref : `DatasetRef`
1159 The registry information associated with this artifact.
1160 isComponent : `bool`
1161 Flag to indicate if a component is being read from this artifact.
1162 cache_ref : `DatasetRef`, optional
1163 The DatasetRef to use when looking up the file in the cache.
1164 This ref must have the same ID as the supplied ref but can
1165 be a parent ref or component ref to indicate to the cache whether
1166 a composite file is being requested from the cache or a component
1167 file. Without this the cache will default to the supplied ref but
1168 it can get confused with read-only derived components for
1169 disassembled composites.
1171 Returns
1172 -------
1173 inMemoryDataset : `object`
1174 The artifact as a python object.
1175 """
1176 location = getInfo.location
1177 uri = location.uri
1178 log.debug("Accessing data from %s", uri)
1180 if cache_ref is None:
1181 cache_ref = ref
1182 if cache_ref.id != ref.id:
1183 raise ValueError(
1184 "The supplied cache dataset ref refers to a different dataset than expected:"
1185 f" {ref.id} != {cache_ref.id}"
1186 )
1188 # Cannot recalculate checksum but can compare size as a quick check
1189 # Do not do this if the size is negative since that indicates
1190 # we do not know.
1191 recorded_size = getInfo.info.file_size
1192 resource_size = uri.size()
1193 if recorded_size >= 0 and resource_size != recorded_size:
1194 raise RuntimeError(
1195 "Integrity failure in Datastore. "
1196 f"Size of file {uri} ({resource_size}) "
1197 f"does not match size recorded in registry of {recorded_size}"
1198 )
1200 # For the general case we have choices for how to proceed.
1201 # 1. Always use a local file (downloading the remote resource to a
1202 # temporary file if needed).
1203 # 2. Use a threshold size and read into memory and use bytes.
1204 # Use both for now with an arbitrary hand off size.
1205 # This allows small datasets to be downloaded from remote object
1206 # stores without requiring a temporary file.
1208 formatter = getInfo.formatter
1209 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1210 if resource_size <= nbytes_max and formatter.can_read_bytes():
1211 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1212 if cached_file is not None:
1213 desired_uri = cached_file
1214 msg = f" (cached version of {uri})"
1215 else:
1216 desired_uri = uri
1217 msg = ""
1218 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1219 serializedDataset = desired_uri.read()
1220 log.debug(
1221 "Deserializing %s from %d bytes from location %s with formatter %s",
1222 f"component {getInfo.component}" if isComponent else "",
1223 len(serializedDataset),
1224 uri,
1225 formatter.name(),
1226 )
1227 try:
1228 result = formatter.fromBytes(
1229 serializedDataset, component=getInfo.component if isComponent else None
1230 )
1231 except Exception as e:
1232 raise ValueError(
1233 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1234 f" ({ref.datasetType.name} from {uri}): {e}"
1235 ) from e
1236 else:
1237 # Read from file.
1239 # Have to update the Location associated with the formatter
1240 # because formatter.read does not allow an override.
1241 # This could be improved.
1242 location_updated = False
1243 msg = ""
1245 # First check in cache for local version.
1246 # The cache will only be relevant for remote resources but
1247 # no harm in always asking. Context manager ensures that cache
1248 # file is not deleted during cache expiration.
1249 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1250 if cached_file is not None:
1251 msg = f"(via cache read of remote file {uri})"
1252 uri = cached_file
1253 location_updated = True
1255 with uri.as_local() as local_uri:
1257 can_be_cached = False
1258 if uri != local_uri:
1259 # URI was remote and file was downloaded
1260 cache_msg = ""
1261 location_updated = True
1263 if self.cacheManager.should_be_cached(cache_ref):
1264 # In this scenario we want to ask if the downloaded
1265 # file should be cached but we should not cache
1266 # it until after we've used it (to ensure it can't
1267 # be expired whilst we are using it).
1268 can_be_cached = True
1270 # Say that it is "likely" to be cached because
1271 # if the formatter read fails we will not be
1272 # caching this file.
1273 cache_msg = " and likely cached"
1275 msg = f"(via download to local file{cache_msg})"
1277 # Calculate the (possibly) new location for the formatter
1278 # to use.
1279 newLocation = Location(*local_uri.split()) if location_updated else None
1281 log.debug(
1282 "Reading%s from location %s %s with formatter %s",
1283 f" component {getInfo.component}" if isComponent else "",
1284 uri,
1285 msg,
1286 formatter.name(),
1287 )
1288 try:
1289 with formatter._updateLocation(newLocation):
1290 with time_this(
1291 log,
1292 msg="Reading%s from location %s %s with formatter %s",
1293 args=(
1294 f" component {getInfo.component}" if isComponent else "",
1295 uri,
1296 msg,
1297 formatter.name(),
1298 ),
1299 ):
1300 result = formatter.read(component=getInfo.component if isComponent else None)
1301 except Exception as e:
1302 raise ValueError(
1303 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1304 f" ({ref.datasetType.name} from {uri}): {e}"
1305 ) from e
1307 # File was read successfully so can move to cache
1308 if can_be_cached:
1309 self.cacheManager.move_to_cache(local_uri, cache_ref)
1311 return self._post_process_get(
1312 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent
1313 )
1315 def knows(self, ref: DatasetRef) -> bool:
1316 """Check if the dataset is known to the datastore.
1318 Does not check for existence of any artifact.
1320 Parameters
1321 ----------
1322 ref : `DatasetRef`
1323 Reference to the required dataset.
1325 Returns
1326 -------
1327 exists : `bool`
1328 `True` if the dataset is known to the datastore.
1329 """
1330 fileLocations = self._get_dataset_locations_info(ref)
1331 if fileLocations:
1332 return True
1333 return False
1335 def _process_mexists_records(
1336 self,
1337 id_to_ref: Dict[DatasetId, DatasetRef],
1338 records: Dict[DatasetId, List[StoredFileInfo]],
1339 all_required: bool,
1340 artifact_existence: Optional[Dict[ButlerURI, bool]] = None,
1341 ) -> Dict[DatasetRef, bool]:
1342 """Helper function for mexists that checks the given records.
1344 Parameters
1345 ----------
1346 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1347 Mapping of the dataset ID to the dataset ref itself.
1348 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1349 Records as generally returned by
1350 ``_get_stored_records_associated_with_refs``.
1351 all_required : `bool`
1352 If `True`, every artifact associated with a dataset ID must exist
1353 for that dataset to be reported as existing; if `False`, any one suffices.
1354 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional
1355 Mapping of datastore artifact to existence. Updated by this
1356 method with details of all artifacts tested. Can be `None`
1357 if the caller is not interested.
1359 Returns
1360 -------
1361 existence : `dict` of [`DatasetRef`, `bool`]
1362 Mapping from dataset to boolean indicating existence.
1363 """
1364 # The URIs to be checked and a mapping of those URIs to
1365 # the dataset ID.
1366 uris_to_check: List[ButlerURI] = []
1367 location_map: Dict[ButlerURI, DatasetId] = {}
1369 location_factory = self.locationFactory
1371 for ref_id, info in records.items():
1372 # Key is the dataset ID, value is a list of StoredFileInfo
1373 uris = [info.file_location(location_factory).uri for info in info]
1374 uris_to_check.extend(uris)
1375 location_map.update({uri: ref_id for uri in uris})
1377 uri_existence: Dict[ButlerURI, bool] = {}
1378 if artifact_existence is not None:
1379 # If a URI has already been checked remove it from the list
1380 # and immediately add the status to the output dict.
1381 filtered_uris_to_check = []
1382 for uri in uris_to_check:
1383 if uri in artifact_existence:
1384 uri_existence[uri] = artifact_existence[uri]
1385 else:
1386 filtered_uris_to_check.append(uri)
1387 uris_to_check = filtered_uris_to_check
1389 # Results.
1390 dataset_existence: Dict[DatasetRef, bool] = {}
1392 uri_existence.update(ButlerURI.mexists(uris_to_check))
1393 for uri, exists in uri_existence.items():
1394 dataset_id = location_map[uri]
1395 ref = id_to_ref[dataset_id]
1397 # Disassembled composite needs to check all locations.
1398 # all_required indicates whether all need to exist or not.
1399 if ref in dataset_existence:
1400 if all_required:
1401 exists = dataset_existence[ref] and exists
1402 else:
1403 exists = dataset_existence[ref] or exists
1404 dataset_existence[ref] = exists
1406 if artifact_existence is not None:
1407 artifact_existence.update(uri_existence)
1409 return dataset_existence
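# Illustrative sketch (not part of the original module): per-artifact
# existence is folded into per-dataset existence with all() or any()
# depending on all_required, matching the and/or accumulation above.
from typing import Dict, List

def fold_existence(
    uri_exists: Dict[str, bool],
    uris_for_dataset: Dict[int, List[str]],
    all_required: bool,
) -> Dict[int, bool]:
    combine = all if all_required else any
    return {
        dataset_id: combine(uri_exists[uri] for uri in uris)
        for dataset_id, uris in uris_for_dataset.items()
    }

existence = {"a.fits": True, "b.fits": False}
assert fold_existence(existence, {1: ["a.fits", "b.fits"]}, all_required=True) == {1: False}
assert fold_existence(existence, {1: ["a.fits", "b.fits"]}, all_required=False) == {1: True}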
1411 def mexists(
1412 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ButlerURI, bool]] = None
1413 ) -> Dict[DatasetRef, bool]:
1414 """Check the existence of multiple datasets at once.
1416 Parameters
1417 ----------
1418 refs : iterable of `DatasetRef`
1419 The datasets to be checked.
1420 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional
1421 Mapping of datastore artifact to existence. Updated by this
1422 method with details of all artifacts tested. Can be `None`
1423 if the caller is not interested.
1425 Returns
1426 -------
1427 existence : `dict` of [`DatasetRef`, `bool`]
1428 Mapping from dataset to boolean indicating existence.
1429 """
1430 chunk_size = 10_000
1431 dataset_existence: Dict[DatasetRef, bool] = {}
1432 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1433 n_found_total = 0
1434 n_checked = 0
1435 n_chunks = 0
1436 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1437 chunk_result = self._mexists(chunk, artifact_existence)
1438 if log.isEnabledFor(VERBOSE):
1439 n_results = len(chunk_result)
1440 n_checked += n_results
1441 # Can treat the booleans as 0, 1 integers and sum them.
1442 n_found = sum(chunk_result.values())
1443 n_found_total += n_found
1444 log.verbose(
1445 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)",
1446 n_chunks,
1447 n_found,
1448 n_results,
1449 n_found_total,
1450 n_checked,
1451 )
1452 dataset_existence.update(chunk_result)
1453 n_chunks += 1
1455 return dataset_existence
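# Illustrative sketch (not part of the original module): chunk_iterable comes
# from lsst.utils.iteration; this stdlib equivalent only makes the batching
# used by mexists() explicit.
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunked(items: Iterable[T], chunk_size: int) -> Iterator[List[T]]:
    iterator = iter(items)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk

assert list(chunked(range(5), 2)) == [[0, 1], [2, 3], [4]]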
1457 def _mexists(
1458 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ButlerURI, bool]] = None
1459 ) -> Dict[DatasetRef, bool]:
1460 """Check the existence of multiple datasets at once.
1462 Parameters
1463 ----------
1464 refs : iterable of `DatasetRef`
1465 The datasets to be checked.
1467 Returns
1468 -------
1469 existence : `dict` of [`DatasetRef`, `bool`]
1470 Mapping from dataset to boolean indicating existence.
1471 """
1472 # Need a mapping of dataset_id to dataset ref since the API
1473 # works with dataset_id
1474 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1476 # Set of all IDs we are checking for.
1477 requested_ids = set(id_to_ref.keys())
1479 # The records themselves. Could be missing some entries.
1480 records = self._get_stored_records_associated_with_refs(refs)
1482 dataset_existence = self._process_mexists_records(
1483 id_to_ref, records, True, artifact_existence=artifact_existence
1484 )
1486 # Set of IDs that have been handled.
1487 handled_ids = {ref.id for ref in dataset_existence.keys()}
1489 missing_ids = requested_ids - handled_ids
1490 if missing_ids:
1491 if not self.trustGetRequest:
1492 # Must assume these do not exist
1493 for missing in missing_ids:
1494 dataset_existence[id_to_ref[missing]] = False
1495 else:
1496 log.debug(
1497 "%d out of %d datasets were not known to datastore during initial existence check.",
1498 len(missing_ids),
1499 len(requested_ids),
1500 )
1502 # Construct data structure identical to that returned
1503 # by _get_stored_records_associated_with_refs() but using
1504 # guessed names.
1505 records = {}
1506 for missing in missing_ids:
1507 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1508 records[missing] = [info for _, info in expected]
1510 dataset_existence.update(
1511 self._process_mexists_records(
1512 id_to_ref, records, False, artifact_existence=artifact_existence
1513 )
1514 )
1516 return dataset_existence
1518 def exists(self, ref: DatasetRef) -> bool:
1519 """Check if the dataset exists in the datastore.
1521 Parameters
1522 ----------
1523 ref : `DatasetRef`
1524 Reference to the required dataset.
1526 Returns
1527 -------
1528 exists : `bool`
1529 `True` if the entity exists in the `Datastore`.
1530 """
1531 fileLocations = self._get_dataset_locations_info(ref)
1533 # if we are being asked to trust that registry might not be correct
1534 # we ask for the expected locations and check them explicitly
1535 if not fileLocations:
1536 if not self.trustGetRequest:
1537 return False
1539 # When we are guessing a dataset location we can not check
1540 # for the existence of every component since we can not
1541 # know if every component was written. Instead we check
1542 # for the existence of any of the expected locations.
1543 for location, _ in self._get_expected_dataset_locations_info(ref):
1544 if self._artifact_exists(location):
1545 return True
1546 return False
1548 # All listed artifacts must exist.
1549 for location, _ in fileLocations:
1550 if not self._artifact_exists(location):
1551 return False
1553 return True
1555 def getURIs(
1556 self, ref: DatasetRef, predict: bool = False
1557 ) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1558 """Return URIs associated with dataset.
1560 Parameters
1561 ----------
1562 ref : `DatasetRef`
1563 Reference to the required dataset.
1564 predict : `bool`, optional
1565 If the datastore does not know about the dataset, should it
1566 return a predicted URI or not?
1568 Returns
1569 -------
1570 primary : `ButlerURI`
1571 The URI to the primary artifact associated with this dataset.
1572 If the dataset was disassembled within the datastore this
1573 may be `None`.
1574 components : `dict`
1575 URIs to any components associated with the dataset artifact.
1576 Can be empty if there are no components.
1577 """
1579 primary: Optional[ButlerURI] = None
1580 components: Dict[str, ButlerURI] = {}
1582 # if this has never been written then we have to guess
1583 if not self.exists(ref):
1584 if not predict:
1585 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1587 doDisassembly = self.composites.shouldBeDisassembled(ref)
1589 if doDisassembly:
1591 for component, componentStorage in ref.datasetType.storageClass.components.items():
1592 compRef = ref.makeComponentRef(component)
1593 compLocation, _ = self._determine_put_formatter_location(compRef)
1595 # Add a URI fragment to indicate this is a guess
1596 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted")
1598 else:
1600 location, _ = self._determine_put_formatter_location(ref)
1602 # Add a URI fragment to indicate this is a guess
1603 primary = ButlerURI(location.uri.geturl() + "#predicted")
1605 return primary, components
1607 # If this is a ref that we have written we can get the path.
1608 # Get file metadata and internal metadata
1609 fileLocations = self._get_dataset_locations_info(ref)
1611 guessing = False
1612 if not fileLocations:
1613 if not self.trustGetRequest:
1614 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1615 fileLocations = self._get_expected_dataset_locations_info(ref)
1616 guessing = True
1618 if len(fileLocations) == 1:
1619 # No disassembly so this is the primary URI
1620 uri = fileLocations[0][0].uri
1621 if guessing and not uri.exists():
1622 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1623 primary = uri
1625 else:
1626 for location, storedFileInfo in fileLocations:
1627 if storedFileInfo.component is None:
1628 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1629 uri = location.uri
1630 if guessing and not uri.exists():
1631 # If we are trusting then it is entirely possible for
1632 # some components to be missing. In that case we skip
1633 # to the next component.
1634 if self.trustGetRequest:
1635 continue
1636 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1637 components[storedFileInfo.component] = uri
1639 return primary, components
1641 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
1642 """URI to the Dataset.
1644 Parameters
1645 ----------
1646 ref : `DatasetRef`
1647 Reference to the required Dataset.
1648 predict : `bool`
1649 If `True`, allow URIs to be returned for datasets that have not
1650 been written.
1652 Returns
1653 -------
1654 uri : `ButlerURI`
1655 URI pointing to the dataset within the datastore. If the
1656 dataset does not exist in the datastore, and if ``predict`` is
1657 `True`, the URI will be a prediction and will include a URI
1658 fragment "#predicted".
1659 If the datastore does not have entities that relate well
1660 to the concept of a URI the returned URI will be
1661 descriptive. The returned URI is not guaranteed to be obtainable.
1663 Raises
1664 ------
1665 FileNotFoundError
1666 Raised if a URI has been requested for a dataset that does not
1667 exist and guessing is not allowed.
1668 RuntimeError
1669 Raised if a request is made for a single URI but multiple URIs
1670 are associated with this dataset.
1672 Notes
1673 -----
1674 When a predicted URI is requested an attempt will be made to form
1675 a reasonable URI based on file templates and the expected formatter.
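Examples
--------
A sketch assuming a hypothetical ``datastore`` and a resolved ``ref``
that maps to a single file artifact; the `ButlerURI` accessors shown
are those already used elsewhere in this module:
>>> uri = datastore.getURI(ref, predict=True)  # may end in "#predicted"
>>> local_path = uri.ospath if uri.isLocal else None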
1676 """
1677 primary, components = self.getURIs(ref, predict)
1678 if primary is None or components:
1679 raise RuntimeError(
1680 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1681 )
1682 return primary
1684 def retrieveArtifacts(
1685 self,
1686 refs: Iterable[DatasetRef],
1687 destination: ButlerURI,
1688 transfer: str = "auto",
1689 preserve_path: bool = True,
1690 overwrite: bool = False,
1691 ) -> List[ButlerURI]:
1692 """Retrieve the file artifacts associated with the supplied refs.
1694 Parameters
1695 ----------
1696 refs : iterable of `DatasetRef`
1697 The datasets for which file artifacts are to be retrieved.
1698 A single ref can result in multiple files. The refs must
1699 be resolved.
1700 destination : `ButlerURI`
1701 Location to write the file artifacts.
1702 transfer : `str`, optional
1703 Method to use to transfer the artifacts. Must be one of the options
1704 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1705 preserve_path : `bool`, optional
1706 If `True` the full path of the file artifact within the datastore
1707 is preserved. If `False` the final file component of the path
1708 is used.
1709 overwrite : `bool`, optional
1710 If `True` allow transfers to overwrite existing files at the
1711 destination.
1713 Returns
1714 -------
1715 targets : `list` of `ButlerURI`
1716 URIs of file artifacts in destination location. Order is not
1717 preserved.
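Examples
--------
A minimal sketch; ``datastore`` and ``refs`` are assumed to exist and
the destination directory is hypothetical:
>>> destination = ButlerURI("/tmp/artifacts/", forceDirectory=True)
>>> targets = datastore.retrieveArtifacts(refs, destination, transfer="copy")
>>> n_files = len(targets)  # one URI per transferred file artifact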
1718 """
1719 if not destination.isdir():
1720 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1722 if transfer == "move":
1723 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1725 # Source -> Destination
1726 # This also helps filter out duplicate DatasetRef in the request
1727 # that will map to the same underlying file transfer.
1728 to_transfer: Dict[ButlerURI, ButlerURI] = {}
1730 for ref in refs:
1731 locations = self._get_dataset_locations_info(ref)
1732 for location, _ in locations:
1733 source_uri = location.uri
1734 target_path: Union[str, ButlerURI]
1735 if preserve_path:
1736 target_path = location.pathInStore
1737 if target_path.isabs():
1738 # This is an absolute path to an external file.
1739 # Use the full path.
1740 target_path = target_path.relativeToPathRoot
1741 else:
1742 target_path = source_uri.basename()
1743 target_uri = destination.join(target_path)
1744 to_transfer[source_uri] = target_uri
1746 # In theory can now parallelize the transfer
1747 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1748 for source_uri, target_uri in to_transfer.items():
1749 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1751 return list(to_transfer.values())
1753 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1754 """Load an InMemoryDataset from the store.
1756 Parameters
1757 ----------
1758 ref : `DatasetRef`
1759 Reference to the required Dataset.
1760 parameters : `dict`
1761 `StorageClass`-specific parameters that specify, for example,
1762 a slice of the dataset to be loaded.
1764 Returns
1765 -------
1766 inMemoryDataset : `object`
1767 Requested dataset or slice thereof as an InMemoryDataset.
1769 Raises
1770 ------
1771 FileNotFoundError
1772 Requested dataset can not be retrieved.
1773 TypeError
1774 Return value from formatter has unexpected type.
1775 ValueError
1776 Formatter failed to process the dataset.
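Examples
--------
Illustrative only; ``datastore`` and ``ref`` are assumed, and the
``"subset"`` parameter name is hypothetical (valid parameters depend
on the storage class):
>>> inMemoryDataset = datastore.get(ref)
>>> part = datastore.get(ref, parameters={"subset": slice(0, 10)})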
1777 """
1778 allGetInfo = self._prepare_for_get(ref, parameters)
1779 refComponent = ref.datasetType.component()
1781 # Supplied storage class for the component being read
1782 refStorageClass = ref.datasetType.storageClass
1784 # Create mapping from component name to related info
1785 allComponents = {i.component: i for i in allGetInfo}
1787 # By definition the dataset is disassembled if we have more
1788 # than one record for it.
1789 isDisassembled = len(allGetInfo) > 1
1791 # Look for the special case where we are disassembled but the
1792 # component is a derived component that was not written during
1793 # disassembly. For this scenario we need to check that the
1794 # component requested is listed as a derived component for the
1795 # composite storage class
1796 isDisassembledReadOnlyComponent = False
1797 if isDisassembled and refComponent:
1798 # The composite storage class should be accessible through
1799 # the component dataset type
1800 compositeStorageClass = ref.datasetType.parentStorageClass
1802 # In the unlikely scenario where the composite storage
1803 # class is not known, we can only assume that this is a
1804 # normal component. If that assumption is wrong then the
1805 # branch below that reads a persisted component will fail
1806 # so there is no need to complain here.
1807 if compositeStorageClass is not None:
1808 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1810 if isDisassembled and not refComponent:
1811 # This was a disassembled dataset spread over multiple files
1812 # and we need to put them all back together again.
1813 # Read into memory and then assemble
1815 # Check that the supplied parameters are suitable for the type read
1816 refStorageClass.validateParameters(parameters)
1818 # We want to keep track of all the parameters that were not used
1819 # by formatters. We assume that if any of the component formatters
1820 # use a parameter that we do not need to apply it again in the
1821 # assembler.
1822 usedParams = set()
1824 components: Dict[str, Any] = {}
1825 for getInfo in allGetInfo:
1826 # assemblerParams are parameters not understood by the
1827 # associated formatter.
1828 usedParams.update(set(getInfo.formatterParams))
1830 component = getInfo.component
1832 if component is None:
1833 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1835 # We do not want the formatter to think it's reading
1836 # a component though because it is really reading a
1837 # standalone dataset -- always tell reader it is not a
1838 # component.
1839 components[component] = self._read_artifact_into_memory(
1840 getInfo, ref.makeComponentRef(component), isComponent=False
1841 )
1843 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1845 # Any unused parameters will have to be passed to the assembler
1846 if parameters:
1847 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1848 else:
1849 unusedParams = {}
1851 # Process parameters
1852 return ref.datasetType.storageClass.delegate().handleParameters(
1853 inMemoryDataset, parameters=unusedParams
1854 )
1856 elif isDisassembledReadOnlyComponent:
1858 compositeStorageClass = ref.datasetType.parentStorageClass
1859 if compositeStorageClass is None:
1860 raise RuntimeError(
1861 f"Unable to retrieve derived component '{refComponent}' since"
1862 "no composite storage class is available."
1863 )
1865 if refComponent is None:
1866 # Mainly for mypy
1867 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1869 # Assume that every derived component can be calculated by
1870 # forwarding the request to a single read/write component.
1871 # Rather than guessing which rw component is the right one by
1872 # scanning each for a derived component of the same name,
1873 # we ask the storage class delegate directly which one is best to
1874 # use.
1875 compositeDelegate = compositeStorageClass.delegate()
1876 forwardedComponent = compositeDelegate.selectResponsibleComponent(
1877 refComponent, set(allComponents)
1878 )
1880 # Select the relevant component
1881 rwInfo = allComponents[forwardedComponent]
1883 # For now assume that read parameters are validated against
1884 # the real component and not the requested component
1885 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1886 forwardedStorageClass.validateParameters(parameters)
1888 # The reference to use for the caching must refer to the forwarded
1889 # component and not the derived component.
1890 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
1892 # Unfortunately the FileDescriptor inside the formatter will have
1893 # the wrong write storage class so we need to create a new one
1894 # given the immutability constraint.
1895 writeStorageClass = rwInfo.info.storageClass
1897 # We may need to put some thought into parameters for read
1898 # components but for now forward them on as is
1899 readFormatter = type(rwInfo.formatter)(
1900 FileDescriptor(
1901 rwInfo.location,
1902 readStorageClass=refStorageClass,
1903 storageClass=writeStorageClass,
1904 parameters=parameters,
1905 ),
1906 ref.dataId,
1907 )
1909 # The assembler can not receive any parameter requests for a
1910 # derived component at this time since the assembler will
1911 # see the storage class of the derived component and those
1912 # parameters will have to be handled by the formatter on the
1913 # forwarded storage class.
1914 assemblerParams: Dict[str, Any] = {}
1916 # Need to create a new info that specifies the derived
1917 # component and associated storage class
1918 readInfo = DatastoreFileGetInformation(
1919 rwInfo.location,
1920 readFormatter,
1921 rwInfo.info,
1922 assemblerParams,
1923 {},
1924 refComponent,
1925 refStorageClass,
1926 )
1928 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
1930 else:
1931 # Single file request or component from that composite file
1932 for lookup in (refComponent, None):
1933 if lookup in allComponents:
1934 getInfo = allComponents[lookup]
1935 break
1936 else:
1937 raise FileNotFoundError(
1938 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
1939 )
1941 # Do not need the component itself if already disassembled
1942 if isDisassembled:
1943 isComponent = False
1944 else:
1945 isComponent = getInfo.component is not None
1947 # For a component read of a composite we want the cache to
1948 # be looking at the composite ref itself.
1949 cache_ref = ref.makeCompositeRef() if isComponent else ref
1951 # For a disassembled component we can validate parameters against
1952 # the component storage class directly
1953 if isDisassembled:
1954 refStorageClass.validateParameters(parameters)
1955 else:
1956 # For an assembled composite this could be a derived
1957 # component derived from a real component. The validity
1958 # of the parameters is not clear. For now validate against
1959 # the composite storage class
1960 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1962 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
1964 @transactional
1965 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1966 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1968 Parameters
1969 ----------
1970 inMemoryDataset : `object`
1971 The dataset to store.
1972 ref : `DatasetRef`
1973 Reference to the associated Dataset.
1975 Raises
1976 ------
1977 TypeError
1978 Supplied object and storage class are inconsistent.
1979 DatasetTypeNotSupportedError
1980 The associated `DatasetType` is not handled by this datastore.
1982 Notes
1983 -----
1984 If the datastore is configured to reject certain dataset types it
1985 is possible that the put will fail and raise a
1986 `DatasetTypeNotSupportedError`. The main use case for this is to
1987 allow `ChainedDatastore` to put to multiple datastores without
1988 requiring that every datastore accepts the dataset.
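Examples
--------
A sketch assuming a hypothetical ``datastore`` and an object
``inMemoryDataset`` compatible with the storage class of a resolved
``ref``:
>>> datastore.put(inMemoryDataset, ref)
>>> datastore.exists(ref)
True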
1989 """
1991 doDisassembly = self.composites.shouldBeDisassembled(ref)
1992 # doDisassembly = True
1994 artifacts = []
1995 if doDisassembly:
1996 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1997 for component, componentInfo in components.items():
1998 # Don't recurse because we want to take advantage of
1999 # bulk insert -- need a new DatasetRef that refers to the
2000 # same dataset_id but has the component DatasetType
2001 # DatasetType does not refer to the types of components
2002 # So we construct one ourselves.
2003 compRef = ref.makeComponentRef(component)
2004 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2005 artifacts.append((compRef, storedInfo))
2006 else:
2007 # Write the entire thing out
2008 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2009 artifacts.append((ref, storedInfo))
2011 self._register_datasets(artifacts)
2013 @transactional
2014 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
2015 # At this point can safely remove these datasets from the cache
2016 # to avoid confusion later on. If they are not trashed later
2017 # the cache will simply be refilled.
2018 self.cacheManager.remove_from_cache(ref)
2020 # If we are in trust mode there will be nothing to move to
2021 # the trash table and we will have to try to delete the file
2022 # immediately.
2023 if self.trustGetRequest:
2024 # Try to keep the logic below for a single file trash.
2025 if isinstance(ref, DatasetRef):
2026 refs = {ref}
2027 else:
2028 # Will recreate ref at the end of this branch.
2029 refs = set(ref)
2031 # Determine which datasets are known to datastore directly.
2032 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
2033 existing_ids = self._get_stored_records_associated_with_refs(refs)
2034 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2036 missing = refs - existing_refs
2037 if missing:
2038 # Do an explicit existence check on these refs.
2039 # We only care about the artifacts at this point and not
2040 # the dataset existence.
2041 artifact_existence: Dict[ButlerURI, bool] = {}
2042 _ = self.mexists(missing, artifact_existence)
2043 uris = [uri for uri, exists in artifact_existence.items() if exists]
2045 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2046 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2047 for uri in uris:
2048 try:
2049 uri.remove()
2050 except Exception as e:
2051 if ignore_errors:
2052 log.debug("Artifact %s could not be removed: %s", uri, e)
2053 continue
2054 raise
2056 # There is no point asking the code below to remove refs we
2057 # know are missing so update it with the list of existing
2058 # records. Try to retain one vs many logic.
2059 if not existing_refs:
2060 # Nothing more to do since none of the datasets were
2061 # known to the datastore record table.
2062 return
2063 ref = list(existing_refs)
2064 if len(ref) == 1:
2065 ref = ref[0]
2067 # Get file metadata and internal metadata
2068 if not isinstance(ref, DatasetRef):
2069 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2070 # Assumed to be an iterable of refs so bulk mode enabled.
2071 try:
2072 self.bridge.moveToTrash(ref)
2073 except Exception as e:
2074 if ignore_errors:
2075 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2076 else:
2077 raise
2078 return
2080 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2082 fileLocations = self._get_dataset_locations_info(ref)
2084 if not fileLocations:
2085 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2086 if ignore_errors:
2087 log.warning(err_msg)
2088 return
2089 else:
2090 raise FileNotFoundError(err_msg)
2092 for location, storedFileInfo in fileLocations:
2093 if not self._artifact_exists(location):
2094 err_msg = (
2095 f"Dataset is known to datastore {self.name} but "
2096 f"associated artifact ({location.uri}) is missing"
2097 )
2098 if ignore_errors:
2099 log.warning(err_msg)
2100 return
2101 else:
2102 raise FileNotFoundError(err_msg)
2104 # Mark dataset as trashed
2105 try:
2106 self.bridge.moveToTrash([ref])
2107 except Exception as e:
2108 if ignore_errors:
2109 log.warning(
2110 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2111 "but encountered an error: %s",
2112 ref,
2113 self.name,
2114 e,
2115 )
2116 pass
2117 else:
2118 raise
2120 @transactional
2121 def emptyTrash(self, ignore_errors: bool = True) -> None:
2122 """Remove all datasets from the trash.
2124 Parameters
2125 ----------
2126 ignore_errors : `bool`
2127 If `True` return without error even if something went wrong.
2128 Problems could occur if another process is simultaneously trying
2129 to delete.
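Examples
--------
A sketch of the usual two-step removal; ``datastore`` and ``refs``
are hypothetical and assumed to already exist:
>>> datastore.trash(refs)   # move the datasets to the trash table
>>> datastore.emptyTrash()  # delete the underlying file artifacts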
2130 """
2131 log.debug("Emptying trash in datastore %s", self.name)
2133 # Context manager will empty trash iff we finish it without raising.
2134 # It will also automatically delete the relevant rows from the
2135 # trash table and the records table.
2136 with self.bridge.emptyTrash(
2137 self._table, record_class=StoredFileInfo, record_column="path"
2138 ) as trash_data:
2139 # Removing the artifacts themselves requires that the files are
2140 # not also associated with refs that are not to be trashed.
2141 # Therefore need to do a query with the file paths themselves
2142 # and return all the refs associated with them. Can only delete
2143 # a file if the refs to be trashed are the only refs associated
2144 # with the file.
2145 # This requires multiple copies of the trashed items
2146 trashed, artifacts_to_keep = trash_data
2148 if artifacts_to_keep is None:
2149 # The bridge is not helping us so have to work it out
2150 # ourselves. This is not going to be as efficient.
2151 trashed = list(trashed)
2153 # The instance check is for mypy since up to this point it
2154 # does not know the type of info.
2155 path_map = self._refs_associated_with_artifacts(
2156 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2157 )
2159 for ref, info in trashed:
2161 # Mypy needs to know this is not the base class
2162 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2164 # Check for mypy
2165 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2167 path_map[info.path].remove(ref.id)
2168 if not path_map[info.path]:
2169 del path_map[info.path]
2171 artifacts_to_keep = set(path_map)
2173 for ref, info in trashed:
2175 # Should not happen for this implementation but need
2176 # to keep mypy happy.
2177 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2179 # Mypy needs to know this is not the base class
2180 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2182 # Check for mypy
2183 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2185 if info.path in artifacts_to_keep:
2186 # This is a multi-dataset artifact and we are not
2187 # removing all associated refs.
2188 continue
2190 # Only trashed refs still known to datastore will be returned.
2191 location = info.file_location(self.locationFactory)
2193 # Point of no return for this artifact
2194 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2195 try:
2196 self._delete_artifact(location)
2197 except FileNotFoundError:
2198 # If the file itself has been deleted there is nothing
2199 # we can do about it. It is possible that trash has
2200 # been run in parallel in another process or someone
2201 # decided to delete the file. It is unlikely to come
2202 # back and so we should still continue with the removal
2203 # of the entry from the trash table. It is also possible
2204 # we removed it in a previous iteration if it was
2205 # a multi-dataset artifact. The delete artifact method
2206 # will log a debug message in this scenario.
2207 # Distinguishing file missing before trash started and
2208 # file already removed previously as part of this trash
2209 # is not worth the distinction with regards to potential
2210 # memory cost.
2211 pass
2212 except Exception as e:
2213 if ignore_errors:
2214 # Use a debug message here even though it's not
2215 # a good situation. In some cases this can be
2216 # caused by a race between user A and user B
2217 # and neither of them has permissions for the
2218 # other's files. Butler does not know about users
2219 # and trash has no idea what collections these
2220 # files were in (without guessing from a path).
2221 log.debug(
2222 "Encountered error removing artifact %s from datastore %s: %s",
2223 location.uri,
2224 self.name,
2225 e,
2226 )
2227 else:
2228 raise
2230 @transactional
2231 def transfer_from(
2232 self,
2233 source_datastore: Datastore,
2234 refs: Iterable[DatasetRef],
2235 local_refs: Optional[Iterable[DatasetRef]] = None,
2236 transfer: str = "auto",
2237 artifact_existence: Optional[Dict[ButlerURI, bool]] = None,
2238 ) -> None:
2239 # Docstring inherited
2240 if type(self) is not type(source_datastore):
2241 raise TypeError(
2242 f"Datastore mismatch between this datastore ({type(self)}) and the "
2243 f"source datastore ({type(source_datastore)})."
2244 )
2246 # Be explicit for mypy
2247 if not isinstance(source_datastore, FileDatastore):
2248 raise TypeError(
2249 "Can only transfer to a FileDatastore from another FileDatastore, not"
2250 f" {type(source_datastore)}"
2251 )
2253 # Stop early if "direct" transfer mode is requested. That would
2254 # require that the URI inside the source datastore should be stored
2255 # directly in the target datastore, which seems unlikely to be useful
2256 # since at any moment the source datastore could delete the file.
2257 if transfer in ("direct", "split"):
2258 raise ValueError(
2259 f"Can not transfer from a source datastore using {transfer} mode since"
2260 " those files are controlled by the other datastore."
2261 )
2263 # Empty existence lookup if none given.
2264 if artifact_existence is None:
2265 artifact_existence = {}
2267 # We will go through the list multiple times so must convert
2268 # generators to lists.
2269 refs = list(refs)
2271 if local_refs is None:
2272 local_refs = refs
2273 else:
2274 local_refs = list(local_refs)
2276 # In order to handle disassembled composites the code works
2277 # at the records level since it can assume that internal APIs
2278 # can be used.
2279 # - If the record already exists in the destination this is assumed
2280 # to be okay.
2281 # - If there is no record but the source and destination URIs are
2282 # identical no transfer is done but the record is added.
2283 # - If the source record refers to an absolute URI currently assume
2284 # that that URI should remain absolute and will be visible to the
2285 # destination butler. May need to have a flag to indicate whether
2286 # the dataset should be transferred. This will only happen if
2287 # the detached Butler has had a local ingest.
2289 # What we really want is all the records in the source datastore
2290 # associated with these refs. Or derived ones if they don't exist
2291 # in the source.
2292 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2294 # The source dataset_ids are the keys in these records
2295 source_ids = set(source_records)
2296 log.debug("Number of datastore records found in source: %d", len(source_ids))
2298 # The not None check is to appease mypy
2299 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2300 missing_ids = requested_ids - source_ids
2302 # Missing IDs can be okay if that datastore has allowed
2303 # gets based on file existence. Should we transfer what we can
2304 # or complain about it and warn?
2305 if missing_ids and not source_datastore.trustGetRequest:
2306 raise ValueError(
2307 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2308 )
2310 # Need to map these missing IDs to a DatasetRef so we can guess
2311 # the details.
2312 if missing_ids:
2313 log.info(
2314 "Number of expected datasets missing from source datastore records: %d out of %d",
2315 len(missing_ids),
2316 len(requested_ids),
2317 )
2318 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2320 # This should be chunked in case we end up having to check
2321 # the file store since we need some log output to show
2322 # progress.
2323 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2324 records = {}
2325 for missing in missing_ids_chunk:
2326 # Ask the source datastore where the missing artifacts
2327 # should be. An execution butler might not know about the
2328 # artifacts even if they are there.
2329 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2330 records[missing] = [info for _, info in expected]
2332 # Call the mexist helper method in case we have not already
2333 # checked these artifacts such that artifact_existence is
2334 # empty. This allows us to benefit from parallelism.
2335 # datastore.mexists() itself does not give us access to the
2336 # derived datastore record.
2337 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2338 ref_exists = source_datastore._process_mexists_records(
2339 id_to_ref, records, False, artifact_existence=artifact_existence
2340 )
2342 # Now go through the records and propagate the ones that exist.
2343 location_factory = source_datastore.locationFactory
2344 for missing, record_list in records.items():
2345 # Skip completely if the ref does not exist.
2346 ref = id_to_ref[missing]
2347 if not ref_exists[ref]:
2348 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2349 continue
2350 # Check for file artifact to decide which parts of a
2351 # disassembled composite do exist. If there is only a
2352 # single record we don't even need to look because it can't
2353 # be a composite and must exist.
2354 if len(record_list) == 1:
2355 dataset_records = record_list
2356 else:
2357 dataset_records = [
2358 record
2359 for record in record_list
2360 if artifact_existence[record.file_location(location_factory).uri]
2361 ]
2362 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2364 # Rely on source_records being a defaultdict.
2365 source_records[missing].extend(dataset_records)
2367 # See if we already have these records
2368 target_records = self._get_stored_records_associated_with_refs(local_refs)
2370 # The artifacts to register
2371 artifacts = []
2373 # Refs that already exist
2374 already_present = []
2376 # Now can transfer the artifacts
2377 for source_ref, target_ref in zip(refs, local_refs):
2378 if target_ref.id in target_records:
2379 # Already have an artifact for this.
2380 already_present.append(target_ref)
2381 continue
2383 # mypy needs to know these are always resolved refs
2384 for info in source_records[source_ref.getCheckedId()]:
2385 source_location = info.file_location(source_datastore.locationFactory)
2386 target_location = info.file_location(self.locationFactory)
2387 if source_location == target_location:
2388 # Either the dataset is already in the target datastore
2389 # (which is how execution butler currently runs) or
2390 # it is an absolute URI.
2391 if source_location.pathInStore.isabs():
2392 # Just because we can see the artifact when running
2393 # the transfer doesn't mean it will be generally
2394 # accessible to a user of this butler. For now warn
2395 # but assume it will be accessible.
2396 log.warning(
2397 "Transfer request for an outside-datastore artifact has been found at %s",
2398 source_location,
2399 )
2400 else:
2401 # Need to transfer it to the new location.
2402 # Assume we should always overwrite. If the artifact
2403 # is there this might indicate that a previous transfer
2404 # was interrupted but was not able to be rolled back
2405 # completely (eg pre-emption) so follow Datastore default
2406 # and overwrite.
2407 target_location.uri.transfer_from(
2408 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2409 )
2411 artifacts.append((target_ref, info))
2413 self._register_datasets(artifacts)
2415 if already_present:
2416 n_skipped = len(already_present)
2417 log.info(
2418 "Skipped transfer of %d dataset%s already present in datastore",
2419 n_skipped,
2420 "" if n_skipped == 1 else "s",
2421 )
2423 @transactional
2424 def forget(self, refs: Iterable[DatasetRef]) -> None:
2425 # Docstring inherited.
2426 refs = list(refs)
2427 self.bridge.forget(refs)
2428 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2430 def validateConfiguration(
2431 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
2432 ) -> None:
2433 """Validate some of the configuration for this datastore.
2435 Parameters
2436 ----------
2437 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2438 Entities to test against this configuration. Can be differing
2439 types.
2440 logFailures : `bool`, optional
2441 If `True`, output a log message for every validation error
2442 detected.
2444 Raises
2445 ------
2446 DatastoreValidationError
2447 Raised if there is a validation problem with a configuration.
2448 All the problems are reported in a single exception.
2450 Notes
2451 -----
2452 This method checks that all the supplied entities have valid file
2453 templates and also have formatters defined.
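Examples
--------
Illustrative only; ``datastore`` and the iterable ``datasetTypes``
are hypothetical and would normally come from an existing repository:
>>> try:
...     datastore.validateConfiguration(datasetTypes, logFailures=True)
... except DatastoreValidationError as err:
...     print(f"Datastore configuration problems: {err}")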
2454 """
2456 templateFailed = None
2457 try:
2458 self.templates.validateTemplates(entities, logFailures=logFailures)
2459 except FileTemplateValidationError as e:
2460 templateFailed = str(e)
2462 formatterFailed = []
2463 for entity in entities:
2464 try:
2465 self.formatterFactory.getFormatterClass(entity)
2466 except KeyError as e:
2467 formatterFailed.append(str(e))
2468 if logFailures:
2469 log.critical("Formatter failure: %s", e)
2471 if templateFailed or formatterFailed:
2472 messages = []
2473 if templateFailed:
2474 messages.append(templateFailed)
2475 if formatterFailed:
2476 messages.append(",".join(formatterFailed))
2477 msg = ";\n".join(messages)
2478 raise DatastoreValidationError(msg)
2480 def getLookupKeys(self) -> Set[LookupKey]:
2481 # Docstring is inherited from base class
2482 return (
2483 self.templates.getLookupKeys()
2484 | self.formatterFactory.getLookupKeys()
2485 | self.constraints.getLookupKeys()
2486 )
2488 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2489 # Docstring is inherited from base class
2490 # The key can be valid in either formatters or templates so we can
2491 # only check the template if it exists
2492 if lookupKey in self.templates:
2493 try:
2494 self.templates[lookupKey].validateTemplate(entity)
2495 except FileTemplateValidationError as e:
2496 raise DatastoreValidationError(e) from e
2498 def export(
2499 self,
2500 refs: Iterable[DatasetRef],
2501 *,
2502 directory: Optional[Union[ButlerURI, str]] = None,
2503 transfer: Optional[str] = "auto",
2504 ) -> Iterable[FileDataset]:
2505 # Docstring inherited from Datastore.export.
2506 if transfer is not None and directory is None:
2507 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2509 # Force the directory to be a URI object
2510 directoryUri: Optional[ButlerURI] = None
2511 if directory is not None:
2512 directoryUri = ButlerURI(directory, forceDirectory=True)
2514 if transfer is not None and directoryUri is not None:
2515 # mypy needs the second test
2516 if not directoryUri.exists():
2517 raise FileNotFoundError(f"Export location {directory} does not exist")
2519 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2520 for ref in progress.wrap(refs, "Exporting dataset files"):
2521 fileLocations = self._get_dataset_locations_info(ref)
2522 if not fileLocations:
2523 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2524 # For now we can not export disassembled datasets
2525 if len(fileLocations) > 1:
2526 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2527 location, storedFileInfo = fileLocations[0]
2529 pathInStore = location.pathInStore.path
2530 if transfer is None:
2531 # TODO: do we also need to return the readStorageClass somehow?
2532 # We will use the path in store directly. If this is an
2533 # absolute URI, preserve it.
2534 if location.pathInStore.isabs():
2535 pathInStore = str(location.uri)
2536 elif transfer == "direct": 2536 ↛ 2538line 2536 didn't jump to line 2538, because the condition on line 2536 was never true
2537 # Use full URIs to the remote store in the export
2538 pathInStore = str(location.uri)
2539 else:
2540 # mypy needs help
2541 assert directoryUri is not None, "directoryUri must be defined to get here"
2542 storeUri = ButlerURI(location.uri)
2544 # if the datastore has an absolute URI to a resource, we
2545 # have two options:
2546 # 1. Keep the absolute URI in the exported YAML
2547 # 2. Allocate a new name in the local datastore and transfer
2548 # it.
2549 # For now go with option 2
2550 if location.pathInStore.isabs():
2551 template = self.templates.getTemplate(ref)
2552 newURI = ButlerURI(template.format(ref), forceAbsolute=False)
2553 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2555 exportUri = directoryUri.join(pathInStore)
2556 exportUri.transfer_from(storeUri, transfer=transfer)
2558 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2560 @staticmethod
2561 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]:
2562 """Compute the checksum of the supplied file.
2564 Parameters
2565 ----------
2566 uri : `ButlerURI`
2567 Name of resource to calculate checksum from.
2568 algorithm : `str`, optional
2569 Name of algorithm to use. Must be one of the algorithms supported
2570 by :py:mod:`hashlib`.
2571 block_size : `int`
2572 Number of bytes to read from file at one time.
2574 Returns
2575 -------
2576 hexdigest : `str`
2577 Hex digest of the file.
2579 Notes
2580 -----
2581 Currently returns None if the URI is for a remote resource.
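Examples
--------
A minimal sketch; the local file path is hypothetical and "md5" is
one of the `hashlib` guaranteed algorithms:
>>> uri = ButlerURI("/tmp/example_dataset.fits")
>>> digest = FileDatastore.computeChecksum(uri, algorithm="md5")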
2582 """
2583 if algorithm not in hashlib.algorithms_guaranteed:
2584 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2586 if not uri.isLocal:
2587 return None
2589 hasher = hashlib.new(algorithm)
2591 with uri.as_local() as local_uri:
2592 with open(local_uri.ospath, "rb") as f:
2593 for chunk in iter(lambda: f.read(block_size), b""):
2594 hasher.update(chunk)
2596 return hasher.hexdigest()
2598 def needs_expanded_data_ids(
2599 self,
2600 transfer: Optional[str],
2601 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2602 ) -> bool:
2603 # Docstring inherited.
2604 # This _could_ also use entity to inspect whether the filename template
2605 # involves placeholders other than the required dimensions for its
2606 # dataset type, but that's not necessary for correctness; it just
2607 # enables more optimizations (perhaps only in theory).
2608 return transfer not in ("direct", None)