Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 84%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 Any,
34 ClassVar,
35 Dict,
36 Iterable,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Tuple,
42 Type,
43 Union,
44)
46from lsst.daf.butler import (
47 CompositesMap,
48 Config,
49 DatasetId,
50 DatasetRef,
51 DatasetType,
52 DatasetTypeNotSupportedError,
53 Datastore,
54 DatastoreCacheManager,
55 DatastoreConfig,
56 DatastoreDisabledCacheManager,
57 DatastoreValidationError,
58 FileDataset,
59 FileDescriptor,
60 FileTemplates,
61 FileTemplateValidationError,
62 Formatter,
63 FormatterFactory,
64 Location,
65 LocationFactory,
66 Progress,
67 StorageClass,
68 StoredFileInfo,
69 ddl,
70)
71from lsst.daf.butler.core.repoRelocation import replaceRoot
72from lsst.daf.butler.core.utils import transactional
73from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
74from lsst.resources import ResourcePath, ResourcePathExpression
75from lsst.utils.introspection import get_class_of, get_instance_of
76from lsst.utils.iteration import chunk_iterable
78# For VERBOSE logging usage.
79from lsst.utils.logging import VERBOSE, getLogger
80from lsst.utils.timer import time_this
81from sqlalchemy import BigInteger, String
83from .genericDatastore import GenericBaseDatastore
85if TYPE_CHECKING:    85 ↛ 86 (line 85 didn't jump to line 86, because the condition on line 85 was never true)
86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
89log = getLogger(__name__)
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
101 def __init__(self, datasets: List[FileDataset]):
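        # Flatten the refs from every FileDataset so the base class sees all of them.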
102 super().__init__(ref for dataset in datasets for ref in dataset.refs)
103 self.datasets = datasets
106@dataclass(frozen=True)
107class DatastoreFileGetInformation:
108 """Collection of useful parameters needed to retrieve a file from
109 a Datastore.
110 """
112 location: Location
113 """The location from which to read the dataset."""
115 formatter: Formatter
116 """The `Formatter` to use to deserialize the dataset."""
118 info: StoredFileInfo
119 """Stored information about this file and its formatter."""
121 assemblerParams: Dict[str, Any]
122 """Parameters to use for post-processing the retrieved dataset."""
124 formatterParams: Dict[str, Any]
125 """Parameters that were understood by the associated formatter."""
127 component: Optional[str]
128 """The component to be retrieved (can be `None`)."""
130 readStorageClass: StorageClass
131 """The `StorageClass` of the dataset being read."""
134class FileDatastore(GenericBaseDatastore):
135 """Generic Datastore for file-based implementations.
137 Should always be sub-classed since key abstract methods are missing.
139 Parameters
140 ----------
141 config : `DatastoreConfig` or `str`
142 Configuration as either a `Config` object or URI to file.
143 bridgeManager : `DatastoreRegistryBridgeManager`
144 Object that manages the interface between `Registry` and datastores.
145 butlerRoot : `str`, optional
146 New datastore root to use to override the configuration value.
148 Raises
149 ------
150 ValueError
151 If root location does not exist and ``create`` is `False` in the
152 configuration.
153 """
155 defaultConfigFile: ClassVar[Optional[str]] = None
156 """Path to configuration defaults. Accessed within the ``config`` resource
157 or relative to a search path. Can be None if no defaults specified.
158 """
160 root: ResourcePath
161 """Root directory URI of this `Datastore`."""
163 locationFactory: LocationFactory
164 """Factory for creating locations relative to the datastore root."""
166 formatterFactory: FormatterFactory
167 """Factory for creating instances of formatters."""
169 templates: FileTemplates
170 """File templates that can be used by this `Datastore`."""
172 composites: CompositesMap
173 """Determines whether a dataset should be disassembled on put."""
175 defaultConfigFile = "datastores/fileDatastore.yaml"
176 """Path to configuration defaults. Accessed within the ``config`` resource
177 or relative to a search path. Can be None if no defaults specified.
178 """
180 @classmethod
181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
182 """Set any filesystem-dependent config options for this Datastore to
183 be appropriate for a new empty repository with the given root.
185 Parameters
186 ----------
187 root : `str`
188 URI to the root of the data repository.
189 config : `Config`
190 A `Config` to update. Only the subset understood by
191 this component will be updated. Will not expand
192 defaults.
193 full : `Config`
194 A complete config with all defaults expanded that can be
195 converted to a `DatastoreConfig`. Read-only and will not be
196 modified by this method.
197 Repository-specific options that should not be obtained
198 from defaults when Butler instances are constructed
199 should be copied from ``full`` to ``config``.
200 overwrite : `bool`, optional
201 If `False`, do not modify a value in ``config`` if the value
202 already exists. Default is always to overwrite with the provided
203 ``root``.
205 Notes
206 -----
207 If a keyword is explicitly defined in the supplied ``config`` it
208 will not be overridden by this method if ``overwrite`` is `False`.
209 This allows explicit values set in external configs to be retained.
210 """
211 Config.updateParameters(
212 DatastoreConfig,
213 config,
214 full,
215 toUpdate={"root": root},
216 toCopy=("cls", ("records", "table")),
217 overwrite=overwrite,
218 )
220 @classmethod
221 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
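        """Build the specification of the opaque table used to record file
        metadata (path, formatter, storage class, component, checksum and
        file size), keyed by dataset ID and component.

        Parameters
        ----------
        datasetIdColumnType : `type`
            Column type to use for the ``dataset_id`` primary key field.

        Returns
        -------
        spec : `ddl.TableSpec`
            Specification describing one row per stored file artifact.
        """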
222 return ddl.TableSpec(
223 fields=[
224 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
225 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
226 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
227 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
228 # Use empty string to indicate no component
229 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
230 # TODO: should checksum be Base64Bytes instead?
231 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
232 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
233 ],
234 unique=frozenset(),
235 indexes=[tuple(["path"])],
236 )
238 def __init__(
239 self,
240 config: Union[DatastoreConfig, str],
241 bridgeManager: DatastoreRegistryBridgeManager,
 240 butlerRoot: Optional[str] = None,
243 ):
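        # Configuration keys consulted below: "root", optional "name",
        # "formatters", "templates", "composites", ("records", "table"),
        # "checksum", "trust_get_request", "cached" and "create".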
244 super().__init__(config, bridgeManager)
245 if "root" not in self.config: 245 ↛ 246line 245 didn't jump to line 246, because the condition on line 245 was never true
246 raise ValueError("No root directory specified in configuration")
248 # Name ourselves either using an explicit name or a name
249 # derived from the (unexpanded) root
250 if "name" in self.config:
251 self.name = self.config["name"]
252 else:
253 # We use the unexpanded root in the name to indicate that this
254 # datastore can be moved without having to update registry.
255 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
257 # Support repository relocation in config
258 # Existence of self.root is checked in subclass
259 self.root = ResourcePath(
260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
261 )
263 self.locationFactory = LocationFactory(self.root)
264 self.formatterFactory = FormatterFactory()
266 # Now associate formatters with storage classes
267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
269 # Read the file naming templates
270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
272 # See if composites should be disassembled
273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
275 tableName = self.config["records", "table"]
276 try:
277 # Storage of paths and formatters, keyed by dataset_id
278 self._table = bridgeManager.opaque.register(
279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
280 )
281 # Interface to Registry.
282 self._bridge = bridgeManager.register(self.name)
283 except ReadOnlyDatabaseError:
284 # If the database is read only and we just tried and failed to
285 # create a table, it means someone is trying to create a read-only
286 # butler client for an empty repo. That should be okay, as long
287 # as they then try to get any datasets before some other client
 288 # creates the table. Chances are they're just validating
289 # configuration.
290 pass
292 # Determine whether checksums should be used - default to False
293 self.useChecksum = self.config.get("checksum", False)
295 # Determine whether we can fall back to configuration if a
296 # requested dataset is not known to registry
297 self.trustGetRequest = self.config.get("trust_get_request", False)
299 # Create a cache manager
300 self.cacheManager: AbstractDatastoreCacheManager
301 if "cached" in self.config: 301 ↛ 304line 301 didn't jump to line 304, because the condition on line 301 was never false
302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
303 else:
304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
306 # Check existence and create directory structure if necessary
307 if not self.root.exists():
308 if "create" not in self.config or not self.config["create"]: 308 ↛ 309line 308 didn't jump to line 309, because the condition on line 308 was never true
309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
310 try:
311 self.root.mkdir()
312 except Exception as e:
313 raise ValueError(
314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
315 ) from e
317 def __str__(self) -> str:
318 return str(self.root)
320 @property
321 def bridge(self) -> DatastoreRegistryBridge:
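        """Object that manages the interface between this `Datastore` and the
        `Registry` (`DatastoreRegistryBridge`)."""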
322 return self._bridge
324 def _artifact_exists(self, location: Location) -> bool:
325 """Check that an artifact exists in this datastore at the specified
326 location.
328 Parameters
329 ----------
330 location : `Location`
331 Expected location of the artifact associated with this datastore.
333 Returns
334 -------
335 exists : `bool`
 336 `True` if the location can be found, `False` otherwise.
337 """
338 log.debug("Checking if resource exists: %s", location.uri)
339 return location.uri.exists()
341 def _delete_artifact(self, location: Location) -> None:
342 """Delete the artifact from the datastore.
344 Parameters
345 ----------
346 location : `Location`
347 Location of the artifact associated with this datastore.
348 """
 349 if location.pathInStore.isabs():    349 ↛ 350 (line 349 didn't jump to line 350, because the condition on line 349 was never true)
350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
352 try:
353 location.uri.remove()
354 except FileNotFoundError:
355 log.debug("File %s did not exist and so could not be deleted.", location.uri)
356 raise
357 except Exception as e:
358 log.critical("Failed to delete file: %s (%s)", location.uri, e)
359 raise
360 log.debug("Successfully deleted file: %s", location.uri)
362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
363 # Docstring inherited from GenericBaseDatastore
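        # Pair each ref with its file info and insert the resulting records
        # into the opaque table in a single call.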
364 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
365 self._table.insert(*records)
367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
368 # Docstring inherited from GenericBaseDatastore
370 # Look for the dataset_id -- there might be multiple matches
371 # if we have disassembled the dataset.
372 records = self._table.fetch(dataset_id=ref.id)
373 return [StoredFileInfo.from_record(record) for record in records]
375 def _get_stored_records_associated_with_refs(
376 self, refs: Iterable[DatasetIdRef]
377 ) -> Dict[DatasetId, List[StoredFileInfo]]:
378 """Retrieve all records associated with the provided refs.
380 Parameters
381 ----------
382 refs : iterable of `DatasetIdRef`
383 The refs for which records are to be retrieved.
385 Returns
386 -------
387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
388 The matching records indexed by the ref ID. The number of entries
389 in the dict can be smaller than the number of requested refs.
390 """
391 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
393 # Uniqueness is dataset_id + component so can have multiple records
394 # per ref.
395 records_by_ref = defaultdict(list)
396 for record in records:
397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
398 return records_by_ref
400 def _refs_associated_with_artifacts(
401 self, paths: List[Union[str, ResourcePath]]
402 ) -> Dict[str, Set[DatasetId]]:
403 """Return paths and associated dataset refs.
405 Parameters
406 ----------
407 paths : `list` of `str` or `lsst.resources.ResourcePath`
408 All the paths to include in search.
410 Returns
411 -------
412 mapping : `dict` of [`str`, `set` [`DatasetId`]]
413 Mapping of each path to a set of associated database IDs.
414 """
415 records = self._table.fetch(path=[str(path) for path in paths])
416 result = defaultdict(set)
417 for row in records:
418 result[row["path"]].add(row["dataset_id"])
419 return result
421 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]:
422 """Return all dataset refs associated with the supplied path.
424 Parameters
425 ----------
426 pathInStore : `lsst.resources.ResourcePath`
427 Path of interest in the data store.
429 Returns
430 -------
 431 ids : `set` of `DatasetId`
432 All `DatasetRef` IDs associated with this path.
433 """
434 records = list(self._table.fetch(path=str(pathInStore)))
435 ids = {r["dataset_id"] for r in records}
436 return ids
438 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
439 # Docstring inherited from GenericBaseDatastore
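        # Deleting by dataset_id removes the records for every component of
        # this dataset.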
440 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
442 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
443 r"""Find all the `Location`\ s of the requested dataset in the
444 `Datastore` and the associated stored file information.
446 Parameters
447 ----------
448 ref : `DatasetRef`
449 Reference to the required `Dataset`.
451 Returns
452 -------
453 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
454 Location of the dataset within the datastore and
455 stored information about each file and its formatter.
456 """
457 # Get the file information (this will fail if no file)
458 records = self.getStoredItemsInfo(ref)
460 # Use the path to determine the location -- we need to take
461 # into account absolute URIs in the datastore record
462 return [(r.file_location(self.locationFactory), r) for r in records]
464 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
465 """Check that there is only one dataset associated with the
466 specified artifact.
468 Parameters
469 ----------
470 ref : `DatasetRef` or `FakeDatasetRef`
471 Dataset to be removed.
472 location : `Location`
473 The location of the artifact to be removed.
475 Returns
476 -------
 477 can_remove : `bool`
 478 `True` if the artifact can be safely removed.
479 """
480 # Can't ever delete absolute URIs.
481 if location.pathInStore.isabs():
482 return False
484 # Get all entries associated with this path
485 allRefs = self._registered_refs_per_artifact(location.pathInStore)
486 if not allRefs:
487 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
489 # Remove these refs from all the refs and if there is nothing left
490 # then we can delete
491 remainingRefs = allRefs - {ref.id}
493 if remainingRefs:
494 return False
495 return True
497 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]:
498 """Predict the location and related file information of the requested
499 dataset in this datastore.
501 Parameters
502 ----------
503 ref : `DatasetRef`
504 Reference to the required `Dataset`.
506 Returns
507 -------
508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
509 Expected Location of the dataset within the datastore and
510 placeholder information about each file and its formatter.
512 Notes
513 -----
514 Uses the current configuration to determine how we would expect the
515 datastore files to have been written if we couldn't ask registry.
516 This is safe so long as there has been no change to datastore
517 configuration between writing the dataset and wanting to read it.
518 Will not work for files that have been ingested without using the
519 standard file template or default formatter.
520 """
522 # If we have a component ref we always need to ask the questions
523 # of the composite. If the composite is disassembled this routine
524 # should return all components. If the composite was not
525 # disassembled the composite is what is stored regardless of
526 # component request. Note that if the caller has disassembled
527 # a composite there is no way for this guess to know that
528 # without trying both the composite and component ref and seeing
529 # if there is something at the component Location even without
530 # disassembly being enabled.
531 if ref.datasetType.isComponent():
532 ref = ref.makeCompositeRef()
534 # See if the ref is a composite that should be disassembled
535 doDisassembly = self.composites.shouldBeDisassembled(ref)
537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
539 if doDisassembly:
540 for component, componentStorage in ref.datasetType.storageClass.components.items():
541 compRef = ref.makeComponentRef(component)
542 location, formatter = self._determine_put_formatter_location(compRef)
543 all_info.append((location, formatter, componentStorage, component))
545 else:
546 # Always use the composite ref if no disassembly
547 location, formatter = self._determine_put_formatter_location(ref)
548 all_info.append((location, formatter, ref.datasetType.storageClass, None))
550 # Convert the list of tuples to have StoredFileInfo as second element
551 return [
552 (
553 location,
554 StoredFileInfo(
555 formatter=formatter,
556 path=location.pathInStore.path,
557 storageClass=storageClass,
558 component=component,
559 checksum=None,
560 file_size=-1,
561 ),
562 )
563 for location, formatter, storageClass, component in all_info
564 ]
566 def _prepare_for_get(
567 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
568 ) -> List[DatastoreFileGetInformation]:
569 """Check parameters for ``get`` and obtain formatter and
570 location.
572 Parameters
573 ----------
574 ref : `DatasetRef`
575 Reference to the required Dataset.
576 parameters : `dict`
577 `StorageClass`-specific parameters that specify, for example,
578 a slice of the dataset to be loaded.
580 Returns
581 -------
582 getInfo : `list` [`DatastoreFileGetInformation`]
583 Parameters needed to retrieve each file.
584 """
585 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
587 # Get file metadata and internal metadata
588 fileLocations = self._get_dataset_locations_info(ref)
589 if not fileLocations:
590 if not self.trustGetRequest:
591 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
592 # Assume the dataset is where we think it should be
593 fileLocations = self._get_expected_dataset_locations_info(ref)
595 # The storage class we want to use eventually
596 refStorageClass = ref.datasetType.storageClass
598 if len(fileLocations) > 1:
599 disassembled = True
601 # If trust is involved it is possible that there will be
602 # components listed here that do not exist in the datastore.
603 # Explicitly check for file artifact existence and filter out any
604 # that are missing.
605 if self.trustGetRequest:
606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
608 # For now complain only if we have no components at all. One
609 # component is probably a problem but we can punt that to the
610 # assembler.
 611 if not fileLocations:    611 ↛ 612 (line 611 didn't jump to line 612, because the condition on line 611 was never true)
612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
614 else:
615 disassembled = False
617 # Is this a component request?
618 refComponent = ref.datasetType.component()
620 fileGetInfo = []
621 for location, storedFileInfo in fileLocations:
623 # The storage class used to write the file
624 writeStorageClass = storedFileInfo.storageClass
626 # If this has been disassembled we need read to match the write
627 if disassembled:
628 readStorageClass = writeStorageClass
629 else:
630 readStorageClass = refStorageClass
632 formatter = get_instance_of(
633 storedFileInfo.formatter,
634 FileDescriptor(
635 location,
636 readStorageClass=readStorageClass,
637 storageClass=writeStorageClass,
638 parameters=parameters,
639 ),
640 ref.dataId,
641 )
643 formatterParams, notFormatterParams = formatter.segregateParameters()
645 # Of the remaining parameters, extract the ones supported by
646 # this StorageClass (for components not all will be handled)
647 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
649 # The ref itself could be a component if the dataset was
650 # disassembled by butler, or we disassembled in datastore and
651 # components came from the datastore records
652 component = storedFileInfo.component if storedFileInfo.component else refComponent
654 fileGetInfo.append(
655 DatastoreFileGetInformation(
656 location,
657 formatter,
658 storedFileInfo,
659 assemblerParams,
660 formatterParams,
661 component,
662 readStorageClass,
663 )
664 )
666 return fileGetInfo
668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
669 """Check the arguments for ``put`` and obtain formatter and
670 location.
672 Parameters
673 ----------
674 inMemoryDataset : `object`
675 The dataset to store.
676 ref : `DatasetRef`
677 Reference to the associated Dataset.
679 Returns
680 -------
681 location : `Location`
682 The location to write the dataset.
683 formatter : `Formatter`
684 The `Formatter` to use to write the dataset.
686 Raises
687 ------
688 TypeError
689 Supplied object and storage class are inconsistent.
690 DatasetTypeNotSupportedError
691 The associated `DatasetType` is not handled by this datastore.
692 """
693 self._validate_put_parameters(inMemoryDataset, ref)
694 return self._determine_put_formatter_location(ref)
696 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
697 """Calculate the formatter and output location to use for put.
699 Parameters
700 ----------
701 ref : `DatasetRef`
702 Reference to the associated Dataset.
704 Returns
705 -------
706 location : `Location`
707 The location to write the dataset.
708 formatter : `Formatter`
709 The `Formatter` to use to write the dataset.
710 """
711 # Work out output file name
712 try:
713 template = self.templates.getTemplate(ref)
714 except KeyError as e:
715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
717 # Validate the template to protect against filenames from different
718 # dataIds returning the same and causing overwrite confusion.
719 template.validateTemplate(ref)
721 location = self.locationFactory.fromPath(template.format(ref))
723 # Get the formatter based on the storage class
724 storageClass = ref.datasetType.storageClass
725 try:
726 formatter = self.formatterFactory.getFormatter(
727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
728 )
729 except KeyError as e:
730 raise DatasetTypeNotSupportedError(
731 f"Unable to find formatter for {ref} in datastore {self.name}"
732 ) from e
734 # Now that we know the formatter, update the location
735 location = formatter.makeUpdatedLocation(location)
737 return location, formatter
739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
740 # Docstring inherited from base class
741 if transfer != "auto":
742 return transfer
744 # See if the paths are within the datastore or not
745 inside = [self._pathInStore(d.path) is not None for d in datasets]
747 if all(inside):
748 transfer = None
 749 elif not any(inside):    749 ↛ 758 (line 749 didn't jump to line 758, because the condition on line 749 was never false)
750 # Allow ResourcePath to use its own knowledge
751 transfer = "auto"
752 else:
 753 # This can happen when importing from a datastore that
 754 # has had some datasets ingested using "direct" mode.
 755 # Also allow ResourcePath to sort it out but warn about it.
758 log.warning(
759 "Some datasets are inside the datastore and some are outside. Using 'split' "
760 "transfer mode. This assumes that the files outside the datastore are "
761 "still accessible to the new butler since they will not be copied into "
762 "the target datastore."
763 )
764 transfer = "split"
766 return transfer
768 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]:
769 """Return path relative to datastore root
771 Parameters
772 ----------
773 path : `lsst.resources.ResourcePathExpression`
 774 Path to dataset. Can be an absolute URI. If relative, it is
 775 assumed to be relative to the datastore. Returns the path within
 776 the datastore, or `None` if the path is outside it.
778 Returns
779 -------
 780 inStore : `str` or `None`
781 Path relative to datastore root. Returns `None` if the file is
782 outside the root.
783 """
784 # Relative path will always be relative to datastore
785 pathUri = ResourcePath(path, forceAbsolute=False)
786 return pathUri.relative_to(self.root)
788 def _standardizeIngestPath(
789 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None
790 ) -> Union[str, ResourcePath]:
791 """Standardize the path of a to-be-ingested file.
793 Parameters
794 ----------
795 path : `str` or `lsst.resources.ResourcePath`
796 Path of a file to be ingested. This parameter is not expected
797 to be all the types that can be used to construct a
798 `~lsst.resources.ResourcePath`.
799 transfer : `str`, optional
800 How (and whether) the dataset should be added to the datastore.
801 See `ingest` for details of transfer modes.
802 This implementation is provided only so
803 `NotImplementedError` can be raised if the mode is not supported;
804 actual transfers are deferred to `_extractIngestInfo`.
806 Returns
807 -------
808 path : `str` or `lsst.resources.ResourcePath`
809 New path in what the datastore considers standard form. If an
810 absolute URI was given that will be returned unchanged.
812 Notes
813 -----
814 Subclasses of `FileDatastore` can implement this method instead
815 of `_prepIngest`. It should not modify the data repository or given
816 file in any way.
818 Raises
819 ------
820 NotImplementedError
821 Raised if the datastore does not support the given transfer mode
822 (including the case where ingest is not supported at all).
823 FileNotFoundError
824 Raised if one of the given files does not exist.
825 """
 826 if transfer not in (None, "direct", "split") + self.root.transferModes:    826 ↛ 827 (line 826 didn't jump to line 827, because the condition on line 826 was never true)
827 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
829 # A relative URI indicates relative to datastore root
830 srcUri = ResourcePath(path, forceAbsolute=False)
831 if not srcUri.isabs():
832 srcUri = self.root.join(path)
834 if not srcUri.exists():
835 raise FileNotFoundError(
836 f"Resource at {srcUri} does not exist; note that paths to ingest "
837 f"are assumed to be relative to {self.root} unless they are absolute."
838 )
840 if transfer is None:
841 relpath = srcUri.relative_to(self.root)
842 if not relpath:
843 raise RuntimeError(
844 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
845 )
847 # Return the relative path within the datastore for internal
848 # transfer
849 path = relpath
851 return path
853 def _extractIngestInfo(
854 self,
855 path: ResourcePathExpression,
856 ref: DatasetRef,
857 *,
858 formatter: Union[Formatter, Type[Formatter]],
859 transfer: Optional[str] = None,
860 record_validation_info: bool = True,
861 ) -> StoredFileInfo:
862 """Relocate (if necessary) and extract `StoredFileInfo` from a
863 to-be-ingested file.
865 Parameters
866 ----------
867 path : `lsst.resources.ResourcePathExpression`
868 URI or path of a file to be ingested.
869 ref : `DatasetRef`
870 Reference for the dataset being ingested. Guaranteed to have
 871 ``dataset_id not None``.
872 formatter : `type` or `Formatter`
873 `Formatter` subclass to use for this dataset or an instance.
874 transfer : `str`, optional
875 How (and whether) the dataset should be added to the datastore.
876 See `ingest` for details of transfer modes.
877 record_validation_info : `bool`, optional
878 If `True`, the default, the datastore can record validation
879 information associated with the file. If `False` the datastore
880 will not attempt to track any information such as checksums
881 or file sizes. This can be useful if such information is tracked
882 in an external system or if the file is to be compressed in place.
883 It is up to the datastore whether this parameter is relevant.
885 Returns
886 -------
887 info : `StoredFileInfo`
888 Internal datastore record for this file. This will be inserted by
 889 the caller; `_extractIngestInfo` is only responsible for
890 creating and populating the struct.
892 Raises
893 ------
894 FileNotFoundError
895 Raised if one of the given files does not exist.
896 FileExistsError
897 Raised if transfer is not `None` but the (internal) location the
898 file would be moved to is already occupied.
899 """
 900 if self._transaction is None:    900 ↛ 901 (line 900 didn't jump to line 901, because the condition on line 900 was never true)
901 raise RuntimeError("Ingest called without transaction enabled")
903 # Create URI of the source path, do not need to force a relative
904 # path to absolute.
905 srcUri = ResourcePath(path, forceAbsolute=False)
907 # Track whether we have read the size of the source yet
908 have_sized = False
910 tgtLocation: Optional[Location]
911 if transfer is None or transfer == "split":
912 # A relative path is assumed to be relative to the datastore
913 # in this context
914 if not srcUri.isabs():
915 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
916 else:
917 # Work out the path in the datastore from an absolute URI
918 # This is required to be within the datastore.
919 pathInStore = srcUri.relative_to(self.root)
 920 if pathInStore is None and transfer is None:    920 ↛ 921 (line 920 didn't jump to line 921, because the condition on line 920 was never true)
921 raise RuntimeError(
922 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
923 )
 924 if pathInStore:    924 ↛ 926 (line 924 didn't jump to line 926, because the condition on line 924 was never false)
925 tgtLocation = self.locationFactory.fromPath(pathInStore)
926 elif transfer == "split":
927 # Outside the datastore but treat that as a direct ingest
928 # instead.
929 tgtLocation = None
930 else:
931 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
 932 elif transfer == "direct":    932 ↛ 937 (line 932 didn't jump to line 937, because the condition on line 932 was never true)
933 # Want to store the full URI to the resource directly in
934 # datastore. This is useful for referring to permanent archive
935 # storage for raw data.
936 # Trust that people know what they are doing.
937 tgtLocation = None
938 else:
939 # Work out the name we want this ingested file to have
940 # inside the datastore
941 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
942 if not tgtLocation.uri.dirname().exists():
943 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
944 tgtLocation.uri.dirname().mkdir()
946 # if we are transferring from a local file to a remote location
947 # it may be more efficient to get the size and checksum of the
948 # local file rather than the transferred one
949 if record_validation_info and srcUri.isLocal:
950 size = srcUri.size()
951 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
952 have_sized = True
954 # Transfer the resource to the destination.
955 # Allow overwrite of an existing file. This matches the behavior
956 # of datastore.put() in that it trusts that registry would not
957 # be asking to overwrite unless registry thought that the
958 # overwrite was allowed.
959 tgtLocation.uri.transfer_from(
960 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
961 )
 963 if tgtLocation is None:    963 ↛ 965 (line 963 didn't jump to line 965, because the condition on line 963 was never true)
964 # This means we are using direct mode
965 targetUri = srcUri
966 targetPath = str(srcUri)
967 else:
968 targetUri = tgtLocation.uri
969 targetPath = tgtLocation.pathInStore.path
971 # the file should exist in the datastore now
972 if record_validation_info:
973 if not have_sized:
974 size = targetUri.size()
975 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
976 else:
977 # Not recording any file information.
978 size = -1
979 checksum = None
981 return StoredFileInfo(
982 formatter=formatter,
983 path=targetPath,
984 storageClass=ref.datasetType.storageClass,
985 component=ref.datasetType.component(),
986 file_size=size,
987 checksum=checksum,
988 )
990 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
991 # Docstring inherited from Datastore._prepIngest.
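        # Keep only datasets with at least one ref accepted by the constraints,
        # resolve each dataset's formatter to a Formatter class and standardize
        # its path for the requested transfer mode.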
992 filtered = []
993 for dataset in datasets:
994 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
995 if not acceptable:
996 continue
997 else:
998 dataset.refs = acceptable
999 if dataset.formatter is None:
1000 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1001 else:
1002 assert isinstance(dataset.formatter, (type, str))
1003 formatter_class = get_class_of(dataset.formatter)
 1004 if not issubclass(formatter_class, Formatter):    1004 ↛ 1005 (line 1004 didn't jump to line 1005, because the condition on line 1004 was never true)
1005 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1006 dataset.formatter = formatter_class
1007 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1008 filtered.append(dataset)
1009 return _IngestPrepData(filtered)
1011 @transactional
1012 def _finishIngest(
1013 self,
1014 prepData: Datastore.IngestPrepData,
1015 *,
1016 transfer: Optional[str] = None,
1017 record_validation_info: bool = True,
1018 ) -> None:
1019 # Docstring inherited from Datastore._finishIngest.
1020 refsAndInfos = []
1021 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1022 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1023 # Do ingest as if the first dataset ref is associated with the file
1024 info = self._extractIngestInfo(
1025 dataset.path,
1026 dataset.refs[0],
1027 formatter=dataset.formatter,
1028 transfer=transfer,
1029 record_validation_info=record_validation_info,
1030 )
1031 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1032 self._register_datasets(refsAndInfos)
1034 def _calculate_ingested_datastore_name(
1035 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]]
1036 ) -> Location:
1037 """Given a source URI and a DatasetRef, determine the name the
1038 dataset will have inside datastore.
1040 Parameters
1041 ----------
1042 srcUri : `lsst.resources.ResourcePath`
1043 URI to the source dataset file.
1044 ref : `DatasetRef`
1045 Ref associated with the newly-ingested dataset artifact. This
1046 is used to determine the name within the datastore.
 1047 formatter : `Formatter` or `Formatter` class.
1048 Formatter to use for validation. Can be a class or an instance.
1050 Returns
1051 -------
1052 location : `Location`
1053 Target location for the newly-ingested dataset.
1054 """
1055 # Ingesting a file from outside the datastore.
1056 # This involves a new name.
1057 template = self.templates.getTemplate(ref)
1058 location = self.locationFactory.fromPath(template.format(ref))
1060 # Get the extension
1061 ext = srcUri.getExtension()
1063 # Update the destination to include that extension
1064 location.updateExtension(ext)
1066 # Ask the formatter to validate this extension
1067 formatter.validateExtension(location)
1069 return location
1071 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1072 """Write out in memory dataset to datastore.
1074 Parameters
1075 ----------
1076 inMemoryDataset : `object`
1077 Dataset to write to datastore.
1078 ref : `DatasetRef`
1079 Registry information associated with this dataset.
1081 Returns
1082 -------
1083 info : `StoredFileInfo`
1084 Information describing the artifact written to the datastore.
1085 """
1086 # May need to coerce the in memory dataset to the correct
1087 # python type.
1088 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1090 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1091 uri = location.uri
1093 if not uri.dirname().exists():
1094 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1095 uri.dirname().mkdir()
 1097 if self._transaction is None:    1097 ↛ 1098 (line 1097 didn't jump to line 1098, because the condition on line 1097 was never true)
1098 raise RuntimeError("Attempting to write artifact without transaction enabled")
1100 def _removeFileExists(uri: ResourcePath) -> None:
1101 """Remove a file and do not complain if it is not there.
1103 This is important since a formatter might fail before the file
1104 is written and we should not confuse people by writing spurious
1105 error messages to the log.
1106 """
1107 try:
1108 uri.remove()
1109 except FileNotFoundError:
1110 pass
1112 # Register a callback to try to delete the uploaded data if
1113 # something fails below
1114 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1116 # For a local file, simply use the formatter directly
1117 if uri.isLocal:
1118 try:
1119 formatter.write(inMemoryDataset)
1120 except Exception as e:
1121 raise RuntimeError(
1122 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}"
1123 ) from e
1124 log.debug("Successfully wrote python object to local file at %s", uri)
1125 else:
1126 # This is a remote URI. Some datasets can be serialized directly
1127 # to bytes and sent to the remote datastore without writing a
1128 # file. If the dataset is intended to be saved to the cache
1129 # a file is always written and direct write to the remote
1130 # datastore is bypassed.
1131 data_written = False
1132 if not self.cacheManager.should_be_cached(ref):
1133 try:
1134 serializedDataset = formatter.toBytes(inMemoryDataset)
1135 except NotImplementedError:
1136 # Fallback to the file writing option.
1137 pass
1138 except Exception as e:
1139 raise RuntimeError(
1140 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1141 ) from e
1142 else:
1143 log.debug("Writing bytes directly to %s", uri)
1144 uri.write(serializedDataset, overwrite=True)
1145 log.debug("Successfully wrote bytes directly to %s", uri)
1146 data_written = True
1148 if not data_written:
1149 # Did not write the bytes directly to object store so instead
1150 # write to temporary file.
1151 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri:
1152 # Need to configure the formatter to write to a different
1153 # location and that needs us to overwrite internals
1154 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1155 with formatter._updateLocation(Location(None, temporary_uri)):
1156 try:
1157 formatter.write(inMemoryDataset)
1158 except Exception as e:
1159 raise RuntimeError(
1160 f"Failed to serialize dataset {ref} of type"
1161 f" {type(inMemoryDataset)} to "
1162 f"temporary location {temporary_uri}"
1163 ) from e
1164 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True)
1166 # Cache if required
1167 self.cacheManager.move_to_cache(temporary_uri, ref)
1169 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
 1171 # URI is needed to resolve which ingest case we are dealing with
1172 return self._extractIngestInfo(uri, ref, formatter=formatter)
1174 def _read_artifact_into_memory(
1175 self,
1176 getInfo: DatastoreFileGetInformation,
1177 ref: DatasetRef,
1178 isComponent: bool = False,
1179 cache_ref: Optional[DatasetRef] = None,
1180 ) -> Any:
1181 """Read the artifact from datastore into in memory object.
1183 Parameters
1184 ----------
1185 getInfo : `DatastoreFileGetInformation`
1186 Information about the artifact within the datastore.
1187 ref : `DatasetRef`
1188 The registry information associated with this artifact.
1189 isComponent : `bool`
1190 Flag to indicate if a component is being read from this artifact.
1191 cache_ref : `DatasetRef`, optional
1192 The DatasetRef to use when looking up the file in the cache.
1193 This ref must have the same ID as the supplied ref but can
1194 be a parent ref or component ref to indicate to the cache whether
1195 a composite file is being requested from the cache or a component
1196 file. Without this the cache will default to the supplied ref but
1197 it can get confused with read-only derived components for
1198 disassembled composites.
1200 Returns
1201 -------
1202 inMemoryDataset : `object`
1203 The artifact as a python object.
1204 """
1205 location = getInfo.location
1206 uri = location.uri
1207 log.debug("Accessing data from %s", uri)
1209 if cache_ref is None:
1210 cache_ref = ref
 1211 if cache_ref.id != ref.id:    1211 ↛ 1212 (line 1211 didn't jump to line 1212, because the condition on line 1211 was never true)
1212 raise ValueError(
1213 "The supplied cache dataset ref refers to a different dataset than expected:"
1214 f" {ref.id} != {cache_ref.id}"
1215 )
1217 # Cannot recalculate checksum but can compare size as a quick check
1218 # Do not do this if the size is negative since that indicates
1219 # we do not know.
1220 recorded_size = getInfo.info.file_size
1221 resource_size = uri.size()
 1222 if recorded_size >= 0 and resource_size != recorded_size:    1222 ↛ 1223 (line 1222 didn't jump to line 1223, because the condition on line 1222 was never true)
1223 raise RuntimeError(
1224 "Integrity failure in Datastore. "
1225 f"Size of file {uri} ({resource_size}) "
1226 f"does not match size recorded in registry of {recorded_size}"
1227 )
1229 # For the general case we have choices for how to proceed.
1230 # 1. Always use a local file (downloading the remote resource to a
1231 # temporary file if needed).
1232 # 2. Use a threshold size and read into memory and use bytes.
1233 # Use both for now with an arbitrary hand off size.
1234 # This allows small datasets to be downloaded from remote object
1235 # stores without requiring a temporary file.
1237 formatter = getInfo.formatter
1238 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1239 if resource_size <= nbytes_max and formatter.can_read_bytes():
1240 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1241 if cached_file is not None:
1242 desired_uri = cached_file
1243 msg = f" (cached version of {uri})"
1244 else:
1245 desired_uri = uri
1246 msg = ""
1247 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1248 serializedDataset = desired_uri.read()
1249 log.debug(
1250 "Deserializing %s from %d bytes from location %s with formatter %s",
1251 f"component {getInfo.component}" if isComponent else "",
1252 len(serializedDataset),
1253 uri,
1254 formatter.name(),
1255 )
1256 try:
1257 result = formatter.fromBytes(
1258 serializedDataset, component=getInfo.component if isComponent else None
1259 )
1260 except Exception as e:
1261 raise ValueError(
1262 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1263 f" ({ref.datasetType.name} from {uri}): {e}"
1264 ) from e
1265 else:
1266 # Read from file.
1268 # Have to update the Location associated with the formatter
1269 # because formatter.read does not allow an override.
1270 # This could be improved.
1271 location_updated = False
1272 msg = ""
1274 # First check in cache for local version.
1275 # The cache will only be relevant for remote resources but
1276 # no harm in always asking. Context manager ensures that cache
1277 # file is not deleted during cache expiration.
1278 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1279 if cached_file is not None:
1280 msg = f"(via cache read of remote file {uri})"
1281 uri = cached_file
1282 location_updated = True
1284 with uri.as_local() as local_uri:
1286 can_be_cached = False
 1287 if uri != local_uri:    1287 ↛ 1289 (line 1287 didn't jump to line 1289, because the condition on line 1287 was never true)
1288 # URI was remote and file was downloaded
1289 cache_msg = ""
1290 location_updated = True
1292 if self.cacheManager.should_be_cached(cache_ref):
1293 # In this scenario we want to ask if the downloaded
1294 # file should be cached but we should not cache
1295 # it until after we've used it (to ensure it can't
1296 # be expired whilst we are using it).
1297 can_be_cached = True
1299 # Say that it is "likely" to be cached because
1300 # if the formatter read fails we will not be
1301 # caching this file.
1302 cache_msg = " and likely cached"
1304 msg = f"(via download to local file{cache_msg})"
1306 # Calculate the (possibly) new location for the formatter
1307 # to use.
1308 newLocation = Location(*local_uri.split()) if location_updated else None
1310 log.debug(
1311 "Reading%s from location %s %s with formatter %s",
1312 f" component {getInfo.component}" if isComponent else "",
1313 uri,
1314 msg,
1315 formatter.name(),
1316 )
1317 try:
1318 with formatter._updateLocation(newLocation):
1319 with time_this(
1320 log,
1321 msg="Reading%s from location %s %s with formatter %s",
1322 args=(
1323 f" component {getInfo.component}" if isComponent else "",
1324 uri,
1325 msg,
1326 formatter.name(),
1327 ),
1328 ):
1329 result = formatter.read(component=getInfo.component if isComponent else None)
1330 except Exception as e:
1331 raise ValueError(
1332 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1333 f" ({ref.datasetType.name} from {uri}): {e}"
1334 ) from e
1336 # File was read successfully so can move to cache
 1337 if can_be_cached:    1337 ↛ 1338 (line 1337 didn't jump to line 1338, because the condition on line 1337 was never true)
1338 self.cacheManager.move_to_cache(local_uri, cache_ref)
1340 return self._post_process_get(
1341 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent
1342 )
1344 def knows(self, ref: DatasetRef) -> bool:
1345 """Check if the dataset is known to the datastore.
1347 Does not check for existence of any artifact.
1349 Parameters
1350 ----------
1351 ref : `DatasetRef`
1352 Reference to the required dataset.
1354 Returns
1355 -------
1356 exists : `bool`
1357 `True` if the dataset is known to the datastore.
1358 """
1359 fileLocations = self._get_dataset_locations_info(ref)
1360 if fileLocations:
1361 return True
1362 return False
1364 def _process_mexists_records(
1365 self,
1366 id_to_ref: Dict[DatasetId, DatasetRef],
1367 records: Dict[DatasetId, List[StoredFileInfo]],
1368 all_required: bool,
1369 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
1370 ) -> Dict[DatasetRef, bool]:
1371 """Helper function for mexists that checks the given records.
1373 Parameters
1374 ----------
1375 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1376 Mapping of the dataset ID to the dataset ref itself.
1377 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1378 Records as generally returned by
1379 ``_get_stored_records_associated_with_refs``.
1380 all_required : `bool`
 1381 If `True`, every artifact associated with a dataset ID must exist
 1382 for the dataset to be considered to exist; otherwise any one artifact suffices.
1383 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1384 Optional mapping of datastore artifact to existence. Updated by
1385 this method with details of all artifacts tested. Can be `None`
1386 if the caller is not interested.
1388 Returns
1389 -------
1390 existence : `dict` of [`DatasetRef`, `bool`]
1391 Mapping from dataset to boolean indicating existence.
1392 """
1393 # The URIs to be checked and a mapping of those URIs to
1394 # the dataset ID.
1395 uris_to_check: List[ResourcePath] = []
1396 location_map: Dict[ResourcePath, DatasetId] = {}
1398 location_factory = self.locationFactory
 1400 for ref_id, infos in records.items():
 1401 # Key is the dataset ID, value is a list of StoredFileInfo.
 1402 uris = [info.file_location(location_factory).uri for info in infos]
1403 uris_to_check.extend(uris)
1404 location_map.update({uri: ref_id for uri in uris})
1406 uri_existence: Dict[ResourcePath, bool] = {}
1407 if artifact_existence is not None:
1408 # If a URI has already been checked remove it from the list
1409 # and immediately add the status to the output dict.
1410 filtered_uris_to_check = []
1411 for uri in uris_to_check:
1412 if uri in artifact_existence:
1413 uri_existence[uri] = artifact_existence[uri]
1414 else:
1415 filtered_uris_to_check.append(uri)
1416 uris_to_check = filtered_uris_to_check
1418 # Results.
1419 dataset_existence: Dict[DatasetRef, bool] = {}
1421 uri_existence.update(ResourcePath.mexists(uris_to_check))
1422 for uri, exists in uri_existence.items():
1423 dataset_id = location_map[uri]
1424 ref = id_to_ref[dataset_id]
1426 # Disassembled composite needs to check all locations.
1427 # all_required indicates whether all need to exist or not.
1428 if ref in dataset_existence:
1429 if all_required:
1430 exists = dataset_existence[ref] and exists
1431 else:
1432 exists = dataset_existence[ref] or exists
1433 dataset_existence[ref] = exists
1435 if artifact_existence is not None:
1436 artifact_existence.update(uri_existence)
1438 return dataset_existence
1440 def mexists(
1441 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1442 ) -> Dict[DatasetRef, bool]:
1443 """Check the existence of multiple datasets at once.
1445 Parameters
1446 ----------
1447 refs : iterable of `DatasetRef`
1448 The datasets to be checked.
1449 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1450 Optional mapping of datastore artifact to existence. Updated by
1451 this method with details of all artifacts tested. Can be `None`
1452 if the caller is not interested.
1454 Returns
1455 -------
1456 existence : `dict` of [`DatasetRef`, `bool`]
1457 Mapping from dataset to boolean indicating existence.
1458 """
1459 chunk_size = 10_000
1460 dataset_existence: Dict[DatasetRef, bool] = {}
1461 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1462 n_found_total = 0
1463 n_checked = 0
1464 n_chunks = 0
1465 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1466 chunk_result = self._mexists(chunk, artifact_existence)
1467 if log.isEnabledFor(VERBOSE):
1468 n_results = len(chunk_result)
1469 n_checked += n_results
1470 # Can treat the booleans as 0, 1 integers and sum them.
1471 n_found = sum(chunk_result.values())
1472 n_found_total += n_found
1473 log.verbose(
1474 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)",
1475 n_chunks,
1476 n_found,
1477 n_results,
1478 n_found_total,
1479 n_checked,
1480 )
1481 dataset_existence.update(chunk_result)
1482 n_chunks += 1
1484 return dataset_existence
1486 def _mexists(
1487 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1488 ) -> Dict[DatasetRef, bool]:
1489 """Check the existence of multiple datasets at once.
1491 Parameters
1492 ----------
1493 refs : iterable of `DatasetRef`
1494 The datasets to be checked.
1496 Returns
1497 -------
1498 existence : `dict` of [`DatasetRef`, `bool`]
1499 Mapping from dataset to boolean indicating existence.
1500 """
1501 # Need a mapping of dataset_id to dataset ref since the API
1502 # works with dataset_id
1503 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1505 # Set of all IDs we are checking for.
1506 requested_ids = set(id_to_ref.keys())
1508 # The records themselves. Could be missing some entries.
1509 records = self._get_stored_records_associated_with_refs(refs)
1511 dataset_existence = self._process_mexists_records(
1512 id_to_ref, records, True, artifact_existence=artifact_existence
1513 )
1515 # Set of IDs that have been handled.
1516 handled_ids = {ref.id for ref in dataset_existence.keys()}
1518 missing_ids = requested_ids - handled_ids
1519 if missing_ids:
1520 if not self.trustGetRequest:
1521 # Must assume these do not exist
1522 for missing in missing_ids:
1523 dataset_existence[id_to_ref[missing]] = False
1524 else:
1525 log.debug(
1526 "%d out of %d datasets were not known to datastore during initial existence check.",
1527 len(missing_ids),
1528 len(requested_ids),
1529 )
1531 # Construct data structure identical to that returned
1532 # by _get_stored_records_associated_with_refs() but using
1533 # guessed names.
1534 records = {}
1535 for missing in missing_ids:
1536 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1537 records[missing] = [info for _, info in expected]
1539 dataset_existence.update(
1540 self._process_mexists_records(
1541 id_to_ref, records, False, artifact_existence=artifact_existence
1542 )
1543 )
1545 return dataset_existence
1547 def exists(self, ref: DatasetRef) -> bool:
1548 """Check if the dataset exists in the datastore.
1550 Parameters
1551 ----------
1552 ref : `DatasetRef`
1553 Reference to the required dataset.
1555 Returns
1556 -------
1557 exists : `bool`
1558 `True` if the entity exists in the `Datastore`.
1559 """
1560 fileLocations = self._get_dataset_locations_info(ref)
1562 # if we are being asked to trust that registry might not be correct
1563 # we ask for the expected locations and check them explicitly
1564 if not fileLocations:
1565 if not self.trustGetRequest:
1566 return False
1568 # When we are guessing a dataset location we can not check
1569 # for the existence of every component since we can not
1570 # know if every component was written. Instead we check
1571 # for the existence of any of the expected locations.
 1572 for location, _ in self._get_expected_dataset_locations_info(ref):    1572 ↛ 1575 (line 1572 didn't jump to line 1575, because the loop on line 1572 didn't complete)
 1573 if self._artifact_exists(location):    1573 ↛ 1572 (line 1573 didn't jump to line 1572, because the condition on line 1573 was never false)
1574 return True
1575 return False
1577 # All listed artifacts must exist.
1578 for location, _ in fileLocations:
1579 if not self._artifact_exists(location):
1580 return False
1582 return True
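The two existence rules differ: guessed locations only need a single hit, because some components of a disassembled dataset may legitimately be absent, whereas recorded locations must all be present. A small sketch of the same rules using plain filesystem paths instead of datastore locations (not part of the measured source):

# Illustrative sketch of the two existence rules.
from pathlib import Path
from typing import List

def exists_sketch(recorded: List[Path], expected: List[Path], trust: bool) -> bool:
    if not recorded:
        if not trust:
            return False
        # When guessing, any one expected artifact is enough.
        return any(p.exists() for p in expected)
    # When the registry lists artifacts, all of them must be present.
    return all(p.exists() for p in recorded)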
1584 def getURIs(
1585 self, ref: DatasetRef, predict: bool = False
1586 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
1587 """Return URIs associated with dataset.
1589 Parameters
1590 ----------
1591 ref : `DatasetRef`
1592 Reference to the required dataset.
1593 predict : `bool`, optional
1594 If the datastore does not know about the dataset, should it
1595 return a predicted URI or not?
1597 Returns
1598 -------
1599 primary : `lsst.resources.ResourcePath`
1600 The URI to the primary artifact associated with this dataset.
1601 If the dataset was disassembled within the datastore this
1602 may be `None`.
1603 components : `dict`
1604 URIs to any components associated with the dataset artifact.
1605 Can be empty if there are no components.
1606 """
1608 primary: Optional[ResourcePath] = None
1609 components: Dict[str, ResourcePath] = {}
1611 # if this has never been written then we have to guess
1612 if not self.exists(ref):
1613 if not predict:
1614 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1616 doDisassembly = self.composites.shouldBeDisassembled(ref)
1618 if doDisassembly:
1620 for component, componentStorage in ref.datasetType.storageClass.components.items():
1621 compRef = ref.makeComponentRef(component)
1622 compLocation, _ = self._determine_put_formatter_location(compRef)
1624 # Add a URI fragment to indicate this is a guess
1625 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted")
1627 else:
1629 location, _ = self._determine_put_formatter_location(ref)
1631 # Add a URI fragment to indicate this is a guess
1632 primary = ResourcePath(location.uri.geturl() + "#predicted")
1634 return primary, components
1636 # If this is a ref that we have written we can get the path.
1637 # Get file metadata and internal metadata
1638 fileLocations = self._get_dataset_locations_info(ref)
1640 guessing = False
1641 if not fileLocations:
1642 if not self.trustGetRequest: 1642 ↛ 1643line 1642 didn't jump to line 1643, because the condition on line 1642 was never true
1643 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1644 fileLocations = self._get_expected_dataset_locations_info(ref)
1645 guessing = True
1647 if len(fileLocations) == 1:
1648 # No disassembly so this is the primary URI
1649 uri = fileLocations[0][0].uri
1650 if guessing and not uri.exists(): 1650 ↛ 1651line 1650 didn't jump to line 1651, because the condition on line 1650 was never true
1651 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1652 primary = uri
1654 else:
1655 for location, storedFileInfo in fileLocations:
1656 if storedFileInfo.component is None: 1656 ↛ 1657line 1656 didn't jump to line 1657, because the condition on line 1656 was never true
1657 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1658 uri = location.uri
1659 if guessing and not uri.exists(): 1659 ↛ 1663line 1659 didn't jump to line 1663, because the condition on line 1659 was never true
1660 # If we are trusting then it is entirely possible for
1661 # some components to be missing. In that case we skip
1662 # to the next component.
1663 if self.trustGetRequest:
1664 continue
1665 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1666 components[storedFileInfo.component] = uri
1668 return primary, components
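Predicted locations are flagged by appending a "#predicted" fragment to the URI. A tiny sketch of that convention using only the standard library, with urllib standing in for lsst.resources (not part of the measured source):

# Illustrative sketch: marking a guessed location with a "#predicted" fragment.
from urllib.parse import urlsplit, urlunsplit

def mark_predicted(uri: str) -> str:
    scheme, netloc, path, query, _ = urlsplit(uri)
    return urlunsplit((scheme, netloc, path, query, "predicted"))

assert mark_predicted("file:///repo/ds.fits") == "file:///repo/ds.fits#predicted"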
1670 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1671 """URI to the Dataset.
1673 Parameters
1674 ----------
1675 ref : `DatasetRef`
1676 Reference to the required Dataset.
1677 predict : `bool`
1678 If `True`, allow URIs to be returned for datasets that have not
1679 yet been written.
1681 Returns
1682 -------
1683 uri : `lsst.resources.ResourcePath`
1684 URI pointing to the dataset within the datastore. If the
1685 dataset does not exist in the datastore, and if ``predict`` is
1686 `True`, the URI will be a prediction and will include a URI
1687 fragment "#predicted".
1688 If the datastore does not have entities that relate well
1689 to the concept of a URI the returned URI will be
1690 descriptive. The returned URI is not guaranteed to be obtainable.
1692 Raises
1693 ------
1694 FileNotFoundError
1695 Raised if a URI has been requested for a dataset that does not
1696 exist and guessing is not allowed.
1697 RuntimeError
1698 Raised if a request is made for a single URI but multiple URIs
1699 are associated with this dataset.
1701 Notes
1702 -----
1703 When a predicted URI is requested an attempt will be made to form
1704 a reasonable URI based on file templates and the expected formatter.
1705 """
1706 primary, components = self.getURIs(ref, predict)
1707 if primary is None or components: 1707 ↛ 1708line 1707 didn't jump to line 1708, because the condition on line 1707 was never true
1708 raise RuntimeError(
1709 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1710 )
1711 return primary
1713 def retrieveArtifacts(
1714 self,
1715 refs: Iterable[DatasetRef],
1716 destination: ResourcePath,
1717 transfer: str = "auto",
1718 preserve_path: bool = True,
1719 overwrite: bool = False,
1720 ) -> List[ResourcePath]:
1721 """Retrieve the file artifacts associated with the supplied refs.
1723 Parameters
1724 ----------
1725 refs : iterable of `DatasetRef`
1726 The datasets for which file artifacts are to be retrieved.
1727 A single ref can result in multiple files. The refs must
1728 be resolved.
1729 destination : `lsst.resources.ResourcePath`
1730 Location to write the file artifacts.
1731 transfer : `str`, optional
1732 Method to use to transfer the artifacts. Must be one of the options
1733 supported by `lsst.resources.ResourcePath.transfer_from()`.
1734 "move" is not allowed.
1735 preserve_path : `bool`, optional
1736 If `True` the full path of the file artifact within the datastore
1737 is preserved. If `False` the final file component of the path
1738 is used.
1739 overwrite : `bool`, optional
1740 If `True` allow transfers to overwrite existing files at the
1741 destination.
1743 Returns
1744 -------
1745 targets : `list` of `lsst.resources.ResourcePath`
1746 URIs of file artifacts in destination location. Order is not
1747 preserved.
1748 """
1749 if not destination.isdir(): 1749 ↛ 1750line 1749 didn't jump to line 1750, because the condition on line 1749 was never true
1750 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1752 if transfer == "move":
1753 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1755 # Source -> Destination
1756 # This also helps filter out duplicate DatasetRef in the request
1757 # that will map to the same underlying file transfer.
1758 to_transfer: Dict[ResourcePath, ResourcePath] = {}
1760 for ref in refs:
1761 locations = self._get_dataset_locations_info(ref)
1762 for location, _ in locations:
1763 source_uri = location.uri
1764 target_path: ResourcePathExpression
1765 if preserve_path:
1766 target_path = location.pathInStore
1767 if target_path.isabs(): 1767 ↛ 1770line 1767 didn't jump to line 1770, because the condition on line 1767 was never true
1768 # This is an absolute path to an external file.
1769 # Use the full path.
1770 target_path = target_path.relativeToPathRoot
1771 else:
1772 target_path = source_uri.basename()
1773 target_uri = destination.join(target_path)
1774 to_transfer[source_uri] = target_uri
1776 # In theory can now parallelize the transfer
1777 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1778 for source_uri, target_uri in to_transfer.items():
1779 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1781 return list(to_transfer.values())
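The transfer map is keyed by source URI, which both collapses duplicate refs that resolve to the same file and defers the copies until the full set is known. A sketch of the same pattern with pathlib paths and shutil.copy2 standing in for ResourcePath.transfer_from() (not part of the measured source):

# Illustrative sketch: deduplicated source -> target transfer map.
import shutil
from pathlib import Path
from typing import Dict, Iterable, List

def retrieve_sketch(sources: Iterable[Path], store_root: Path, destination: Path,
                    preserve_path: bool = True) -> List[Path]:
    to_transfer: Dict[Path, Path] = {}
    for source in sources:
        rel = source.relative_to(store_root) if preserve_path else Path(source.name)
        to_transfer[source] = destination / rel  # duplicate sources collapse here
    for source, target in to_transfer.items():
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)
    return list(to_transfer.values())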
1783 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1784 """Load an InMemoryDataset from the store.
1786 Parameters
1787 ----------
1788 ref : `DatasetRef`
1789 Reference to the required Dataset.
1790 parameters : `dict`
1791 `StorageClass`-specific parameters that specify, for example,
1792 a slice of the dataset to be loaded.
1794 Returns
1795 -------
1796 inMemoryDataset : `object`
1797 Requested dataset or slice thereof as an InMemoryDataset.
1799 Raises
1800 ------
1801 FileNotFoundError
1802 Requested dataset can not be retrieved.
1803 TypeError
1804 Return value from formatter has unexpected type.
1805 ValueError
1806 Formatter failed to process the dataset.
1807 """
1808 allGetInfo = self._prepare_for_get(ref, parameters)
1809 refComponent = ref.datasetType.component()
1811 # Supplied storage class for the component being read
1812 refStorageClass = ref.datasetType.storageClass
1814 # Create mapping from component name to related info
1815 allComponents = {i.component: i for i in allGetInfo}
1817 # By definition the dataset is disassembled if we have more
1818 # than one record for it.
1819 isDisassembled = len(allGetInfo) > 1
1821 # Look for the special case where we are disassembled but the
1822 # component is a derived component that was not written during
1823 # disassembly. For this scenario we need to check that the
1824 # component requested is listed as a derived component for the
1825 # composite storage class
1826 isDisassembledReadOnlyComponent = False
1827 if isDisassembled and refComponent:
1828 # The composite storage class should be accessible through
1829 # the component dataset type
1830 compositeStorageClass = ref.datasetType.parentStorageClass
1832 # In the unlikely scenario where the composite storage
1833 # class is not known, we can only assume that this is a
1834 # normal component. If that assumption is wrong then the
1835 # branch below that reads a persisted component will fail
1836 # so there is no need to complain here.
1837 if compositeStorageClass is not None: 1837 ↛ 1840line 1837 didn't jump to line 1840, because the condition on line 1837 was never false
1838 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1840 if isDisassembled and not refComponent:
1841 # This was a disassembled dataset spread over multiple files
1842 # and we need to put them all back together again.
1843 # Read into memory and then assemble
1845 # Check that the supplied parameters are suitable for the type read
1846 refStorageClass.validateParameters(parameters)
1848 # We want to keep track of all the parameters that were not used
1849 # by formatters. We assume that if any of the component formatters
1850 # use a parameter that we do not need to apply it again in the
1851 # assembler.
1852 usedParams = set()
1854 components: Dict[str, Any] = {}
1855 for getInfo in allGetInfo:
1856 # assemblerParams are parameters not understood by the
1857 # associated formatter.
1858 usedParams.update(set(getInfo.formatterParams))
1860 component = getInfo.component
1862 if component is None: 1862 ↛ 1863line 1862 didn't jump to line 1863, because the condition on line 1862 was never true
1863 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1865 # We do not want the formatter to think it's reading
1866 # a component though because it is really reading a
1867 # standalone dataset -- always tell reader it is not a
1868 # component.
1869 components[component] = self._read_artifact_into_memory(
1870 getInfo, ref.makeComponentRef(component), isComponent=False
1871 )
1873 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1875 # Any unused parameters will have to be passed to the assembler
1876 if parameters:
1877 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1878 else:
1879 unusedParams = {}
1881 # Process parameters
1882 return ref.datasetType.storageClass.delegate().handleParameters(
1883 inMemoryDataset, parameters=unusedParams
1884 )
1886 elif isDisassembledReadOnlyComponent:
1888 compositeStorageClass = ref.datasetType.parentStorageClass
1889 if compositeStorageClass is None: 1889 ↛ 1890line 1889 didn't jump to line 1890, because the condition on line 1889 was never true
1890 raise RuntimeError(
1891 f"Unable to retrieve derived component '{refComponent}' since "
1892 "no composite storage class is available."
1893 )
1895 if refComponent is None: 1895 ↛ 1897line 1895 didn't jump to line 1897, because the condition on line 1895 was never true
1896 # Mainly for mypy
1897 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1899 # Assume that every derived component can be calculated by
1900 # forwarding the request to a single read/write component.
1901 # Rather than guessing which rw component is the right one by
1902 # scanning each for a derived component of the same name,
1903 # we ask the storage class delegate directly which one is best to
1904 # use.
1905 compositeDelegate = compositeStorageClass.delegate()
1906 forwardedComponent = compositeDelegate.selectResponsibleComponent(
1907 refComponent, set(allComponents)
1908 )
1910 # Select the relevant component
1911 rwInfo = allComponents[forwardedComponent]
1913 # For now assume that read parameters are validated against
1914 # the real component and not the requested component
1915 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1916 forwardedStorageClass.validateParameters(parameters)
1918 # The reference to use for the caching must refer to the forwarded
1919 # component and not the derived component.
1920 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
1922 # Unfortunately the FileDescriptor inside the formatter will have
1923 # the wrong write storage class so we need to create a new one
1924 # given the immutability constraint.
1925 writeStorageClass = rwInfo.info.storageClass
1927 # We may need to put some thought into parameters for read
1928 # components but for now forward them on as is
1929 readFormatter = type(rwInfo.formatter)(
1930 FileDescriptor(
1931 rwInfo.location,
1932 readStorageClass=refStorageClass,
1933 storageClass=writeStorageClass,
1934 parameters=parameters,
1935 ),
1936 ref.dataId,
1937 )
1939 # The assembler can not receive any parameter requests for a
1940 # derived component at this time since the assembler will
1941 # see the storage class of the derived component and those
1942 # parameters will have to be handled by the formatter on the
1943 # forwarded storage class.
1944 assemblerParams: Dict[str, Any] = {}
1946 # Need to create a new info that specifies the derived
1947 # component and associated storage class
1948 readInfo = DatastoreFileGetInformation(
1949 rwInfo.location,
1950 readFormatter,
1951 rwInfo.info,
1952 assemblerParams,
1953 {},
1954 refComponent,
1955 refStorageClass,
1956 )
1958 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
1960 else:
1961 # Single file request or component from that composite file
1962 for lookup in (refComponent, None): 1962 ↛ 1967line 1962 didn't jump to line 1967, because the loop on line 1962 didn't complete
1963 if lookup in allComponents: 1963 ↛ 1962line 1963 didn't jump to line 1962, because the condition on line 1963 was never false
1964 getInfo = allComponents[lookup]
1965 break
1966 else:
1967 raise FileNotFoundError(
1968 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
1969 )
1971 # Do not need the component itself if already disassembled
1972 if isDisassembled:
1973 isComponent = False
1974 else:
1975 isComponent = getInfo.component is not None
1977 # For a component read of a composite we want the cache to
1978 # be looking at the composite ref itself.
1979 cache_ref = ref.makeCompositeRef() if isComponent else ref
1981 # For a disassembled component we can validate parameters against
1982 # the component storage class directly
1983 if isDisassembled:
1984 refStorageClass.validateParameters(parameters)
1985 else:
1986 # For an assembled composite this could be a derived
1987 # component derived from a real component. The validity
1988 # of the parameters is not clear. For now validate against
1989 # the composite storage class
1990 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1992 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
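For a disassembled composite, each component is read separately and the assembler only receives the parameters that no component formatter consumed. A compact sketch of that parameter bookkeeping (not part of the measured source; the reader callables are hypothetical and simply report which parameters they applied):

# Illustrative sketch: assemble components, forward only unused parameters.
from typing import Any, Callable, Dict, Set, Tuple

Reader = Callable[[Dict[str, Any]], Tuple[Any, Set[str]]]

def assemble_sketch(readers: Dict[str, Reader],
                    parameters: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    used: Set[str] = set()
    components: Dict[str, Any] = {}
    for name, reader in readers.items():
        value, applied = reader(parameters)  # hypothetical: reader reports applied params
        used.update(applied)
        components[name] = value
    # Parameters no component reader consumed are handed to the assembler.
    unused = {k: v for k, v in parameters.items() if k not in used}
    return components, unused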
1994 @transactional
1995 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1996 """Write an InMemoryDataset with a given `DatasetRef` to the store.
1998 Parameters
1999 ----------
2000 inMemoryDataset : `object`
2001 The dataset to store.
2002 ref : `DatasetRef`
2003 Reference to the associated Dataset.
2005 Raises
2006 ------
2007 TypeError
2008 Supplied object and storage class are inconsistent.
2009 DatasetTypeNotSupportedError
2010 The associated `DatasetType` is not handled by this datastore.
2012 Notes
2013 -----
2014 If the datastore is configured to reject certain dataset types it
2015 is possible that the put will fail and raise a
2016 `DatasetTypeNotSupportedError`. The main use case for this is to
2017 allow `ChainedDatastore` to put to multiple datastores without
2018 requiring that every datastore accepts the dataset.
2019 """
2021 doDisassembly = self.composites.shouldBeDisassembled(ref)
2022 # doDisassembly = True
2024 artifacts = []
2025 if doDisassembly:
2026 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2027 for component, componentInfo in components.items():
2028 # Don't recurse because we want to take advantage of
2029 # bulk insert -- need a new DatasetRef that refers to the
2030 # same dataset_id but has the component DatasetType
2031 # DatasetType does not refer to the types of components
2032 # So we construct one ourselves.
2033 compRef = ref.makeComponentRef(component)
2034 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2035 artifacts.append((compRef, storedInfo))
2036 else:
2037 # Write the entire thing out
2038 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2039 artifacts.append((ref, storedInfo))
2041 self._register_datasets(artifacts)
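put() either writes the object whole or disassembles it and writes one artifact per component, collecting (ref, info) pairs for a single bulk registration at the end. A rough sketch of that branch (not part of the measured source; disassemble() and write_artifact() are hypothetical stand-ins for the storage-class delegate and formatter):

# Illustrative sketch of the disassembly branch of put().
from typing import Any, Dict, List, Tuple

def disassemble(obj: Dict[str, Any]) -> Dict[str, Any]:
    return dict(obj)  # hypothetical: each key becomes a component

def write_artifact(name: str, value: Any) -> str:
    return f"/store/{name}.json"  # hypothetical stored-file record

def put_sketch(obj: Dict[str, Any], do_disassembly: bool) -> List[Tuple[str, str]]:
    artifacts: List[Tuple[str, str]] = []
    if do_disassembly:
        for component, value in disassemble(obj).items():
            artifacts.append((component, write_artifact(component, value)))
    else:
        artifacts.append(("composite", write_artifact("composite", obj)))
    # A real datastore would register all of these in one bulk insert.
    return artifacts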
2043 @transactional
2044 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
2045 # At this point can safely remove these datasets from the cache
2046 # to avoid confusion later on. If they are not trashed later
2047 # the cache will simply be refilled.
2048 self.cacheManager.remove_from_cache(ref)
2050 # If we are in trust mode there will be nothing to move to
2051 # the trash table and we will have to try to delete the file
2052 # immediately.
2053 if self.trustGetRequest:
2054 # Try to keep the logic below for a single file trash.
2055 if isinstance(ref, DatasetRef):
2056 refs = {ref}
2057 else:
2058 # Will recreate ref at the end of this branch.
2059 refs = set(ref)
2061 # Determine which datasets are known to datastore directly.
2062 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
2063 existing_ids = self._get_stored_records_associated_with_refs(refs)
2064 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2066 missing = refs - existing_refs
2067 if missing:
2068 # Do an explicit existence check on these refs.
2069 # We only care about the artifacts at this point and not
2070 # the dataset existence.
2071 artifact_existence: Dict[ResourcePath, bool] = {}
2072 _ = self.mexists(missing, artifact_existence)
2073 uris = [uri for uri, exists in artifact_existence.items() if exists]
2075 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2076 log.debug("Removing %d artifacts from datastore that are unknown to its record table", len(uris))
2077 for uri in uris:
2078 try:
2079 uri.remove()
2080 except Exception as e:
2081 if ignore_errors:
2082 log.debug("Artifact %s could not be removed: %s", uri, e)
2083 continue
2084 raise
2086 # There is no point asking the code below to remove refs we
2087 # know are missing so update it with the list of existing
2088 # records. Try to retain one vs many logic.
2089 if not existing_refs:
2090 # Nothing more to do since none of the datasets were
2091 # known to the datastore record table.
2092 return
2093 ref = list(existing_refs)
2094 if len(ref) == 1:
2095 ref = ref[0]
2097 # Get file metadata and internal metadata
2098 if not isinstance(ref, DatasetRef):
2099 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2100 # Assumed to be an iterable of refs so bulk mode enabled.
2101 try:
2102 self.bridge.moveToTrash(ref)
2103 except Exception as e:
2104 if ignore_errors:
2105 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2106 else:
2107 raise
2108 return
2110 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2112 fileLocations = self._get_dataset_locations_info(ref)
2114 if not fileLocations:
2115 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2116 if ignore_errors:
2117 log.warning(err_msg)
2118 return
2119 else:
2120 raise FileNotFoundError(err_msg)
2122 for location, storedFileInfo in fileLocations:
2123 if not self._artifact_exists(location): 2123 ↛ 2124line 2123 didn't jump to line 2124
2124 err_msg = (
2125 f"Dataset is known to datastore {self.name} but "
2126 f"associated artifact ({location.uri}) is missing"
2127 )
2128 if ignore_errors:
2129 log.warning(err_msg)
2130 return
2131 else:
2132 raise FileNotFoundError(err_msg)
2134 # Mark dataset as trashed
2135 try:
2136 self.bridge.moveToTrash([ref])
2137 except Exception as e:
2138 if ignore_errors:
2139 log.warning(
2140 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2141 "but encountered an error: %s",
2142 ref,
2143 self.name,
2144 e,
2145 )
2146 pass
2147 else:
2148 raise
2150 @transactional
2151 def emptyTrash(self, ignore_errors: bool = True) -> None:
2152 """Remove all datasets from the trash.
2154 Parameters
2155 ----------
2156 ignore_errors : `bool`
2157 If `True` return without error even if something went wrong.
2158 Problems could occur if another process is simultaneously trying
2159 to delete.
2160 """
2161 log.debug("Emptying trash in datastore %s", self.name)
2163 # Context manager will empty trash iff we finish it without raising.
2164 # It will also automatically delete the relevant rows from the
2165 # trash table and the records table.
2166 with self.bridge.emptyTrash(
2167 self._table, record_class=StoredFileInfo, record_column="path"
2168 ) as trash_data:
2169 # Removing the artifacts themselves requires that the files are
2170 # not also associated with refs that are not to be trashed.
2171 # Therefore need to do a query with the file paths themselves
2172 # and return all the refs associated with them. Can only delete
2173 # a file if the refs to be trashed are the only refs associated
2174 # with the file.
2175 # This requires multiple copies of the trashed items
2176 trashed, artifacts_to_keep = trash_data
2178 if artifacts_to_keep is None:
2179 # The bridge is not helping us so have to work it out
2180 # ourselves. This is not going to be as efficient.
2181 trashed = list(trashed)
2183 # The instance check is for mypy since up to this point it
2184 # does not know the type of info.
2185 path_map = self._refs_associated_with_artifacts(
2186 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2187 )
2189 for ref, info in trashed:
2191 # Mypy needs to know this is not the base class
2192 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2194 # Check for mypy
2195 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2197 path_map[info.path].remove(ref.id)
2198 if not path_map[info.path]: 2198 ↛ 2189line 2198 didn't jump to line 2189, because the condition on line 2198 was never false
2199 del path_map[info.path]
2201 artifacts_to_keep = set(path_map)
2203 for ref, info in trashed:
2205 # Should not happen for this implementation but need
2206 # to keep mypy happy.
2207 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2209 # Mypy needs to know this is not the base class
2210 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2212 # Check for mypy
2213 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2215 if info.path in artifacts_to_keep:
2216 # This is a multi-dataset artifact and we are not
2217 # removing all associated refs.
2218 continue
2220 # Only trashed refs still known to datastore will be returned.
2221 location = info.file_location(self.locationFactory)
2223 # Point of no return for this artifact
2224 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2225 try:
2226 self._delete_artifact(location)
2227 except FileNotFoundError:
2228 # If the file itself has been deleted there is nothing
2229 # we can do about it. It is possible that trash has
2230 # been run in parallel in another process or someone
2231 # decided to delete the file. It is unlikely to come
2232 # back and so we should still continue with the removal
2233 # of the entry from the trash table. It is also possible
2234 # we removed it in a previous iteration if it was
2235 # a multi-dataset artifact. The delete artifact method
2236 # will log a debug message in this scenario.
2237 # Distinguishing file missing before trash started and
2238 # file already removed previously as part of this trash
2239 # is not worth the distinction with regards to potential
2240 # memory cost.
2241 pass
2242 except Exception as e:
2243 if ignore_errors:
2244 # Use a debug message here even though it's not
2245 # a good situation. In some cases this can be
2246 # caused by a race between user A and user B
2247 # and neither of them has permissions for the
2248 # other's files. Butler does not know about users
2249 # and trash has no idea what collections these
2250 # files were in (without guessing from a path).
2251 log.debug(
2252 "Encountered error removing artifact %s from datastore %s: %s",
2253 location.uri,
2254 self.name,
2255 e,
2256 )
2257 else:
2258 raise
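An artifact shared by several datasets may only be deleted once no surviving ref still points at its path; that is what the path_map bookkeeping above establishes. A minimal sketch of the same reference-count style check (not part of the measured source):

# Illustrative sketch: which shared artifacts must be kept when emptying trash.
from typing import Dict, Iterable, Set, Tuple

def artifacts_to_keep(trashed: Iterable[Tuple[int, str]],
                      all_refs_for_path: Dict[str, Set[int]]) -> Set[str]:
    remaining = {path: set(ids) for path, ids in all_refs_for_path.items()}
    for ref_id, path in trashed:
        remaining[path].discard(ref_id)
    # Any path that still has refs after removing the trashed ones is shared
    # with live datasets and must not be deleted.
    return {path for path, ids in remaining.items() if ids}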
2260 @transactional
2261 def transfer_from(
2262 self,
2263 source_datastore: Datastore,
2264 refs: Iterable[DatasetRef],
2265 local_refs: Optional[Iterable[DatasetRef]] = None,
2266 transfer: str = "auto",
2267 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
2268 ) -> None:
2269 # Docstring inherited
2270 if type(self) is not type(source_datastore):
2271 raise TypeError(
2272 f"Datastore mismatch between this datastore ({type(self)}) and the "
2273 f"source datastore ({type(source_datastore)})."
2274 )
2276 # Be explicit for mypy
2277 if not isinstance(source_datastore, FileDatastore): 2277 ↛ 2278line 2277 didn't jump to line 2278, because the condition on line 2277 was never true
2278 raise TypeError(
2279 "Can only transfer to a FileDatastore from another FileDatastore, not"
2280 f" {type(source_datastore)}"
2281 )
2283 # Stop early if "direct" transfer mode is requested. That would
2284 # require that the URI inside the source datastore should be stored
2285 # directly in the target datastore, which seems unlikely to be useful
2286 # since at any moment the source datastore could delete the file.
2287 if transfer in ("direct", "split"):
2288 raise ValueError(
2289 f"Can not transfer from a source datastore using {transfer} mode since"
2290 " those files are controlled by the other datastore."
2291 )
2293 # Empty existence lookup if none given.
2294 if artifact_existence is None:
2295 artifact_existence = {}
2297 # We will go through the list multiple times so must convert
2298 # generators to lists.
2299 refs = list(refs)
2301 if local_refs is None:
2302 local_refs = refs
2303 else:
2304 local_refs = list(local_refs)
2306 # In order to handle disassembled composites the code works
2307 # at the records level since it can assume that internal APIs
2308 # can be used.
2309 # - If the record already exists in the destination this is assumed
2310 # to be okay.
2311 # - If there is no record but the source and destination URIs are
2312 # identical no transfer is done but the record is added.
2313 # - If the source record refers to an absolute URI currently assume
2314 # that that URI should remain absolute and will be visible to the
2315 # destination butler. May need to have a flag to indicate whether
2316 # the dataset should be transferred. This will only happen if
2317 # the detached Butler has had a local ingest.
2319 # What we really want is all the records in the source datastore
2320 # associated with these refs. Or derived ones if they don't exist
2321 # in the source.
2322 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2324 # The source dataset_ids are the keys in these records
2325 source_ids = set(source_records)
2326 log.debug("Number of datastore records found in source: %d", len(source_ids))
2328 # The not None check is to appease mypy
2329 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2330 missing_ids = requested_ids - source_ids
2332 # Missing IDs can be okay if that datastore has allowed
2333 # gets based on file existence. Should we transfer what we can
2334 # or complain about it and warn?
2335 if missing_ids and not source_datastore.trustGetRequest: 2335 ↛ 2336line 2335 didn't jump to line 2336, because the condition on line 2335 was never true
2336 raise ValueError(
2337 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2338 )
2340 # Need to map these missing IDs to a DatasetRef so we can guess
2341 # the details.
2342 if missing_ids:
2343 log.info(
2344 "Number of expected datasets missing from source datastore records: %d out of %d",
2345 len(missing_ids),
2346 len(requested_ids),
2347 )
2348 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2350 # This should be chunked in case we end up having to check
2351 # the file store since we need some log output to show
2352 # progress.
2353 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2354 records = {}
2355 for missing in missing_ids_chunk:
2356 # Ask the source datastore where the missing artifacts
2357 # should be. An execution butler might not know about the
2358 # artifacts even if they are there.
2359 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2360 records[missing] = [info for _, info in expected]
2362 # Call the mexists helper method in case we have not already
2363 # checked these artifacts such that artifact_existence is
2364 # empty. This allows us to benefit from parallelism.
2365 # datastore.mexists() itself does not give us access to the
2366 # derived datastore record.
2367 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2368 ref_exists = source_datastore._process_mexists_records(
2369 id_to_ref, records, False, artifact_existence=artifact_existence
2370 )
2372 # Now go through the records and propagate the ones that exist.
2373 location_factory = source_datastore.locationFactory
2374 for missing, record_list in records.items():
2375 # Skip completely if the ref does not exist.
2376 ref = id_to_ref[missing]
2377 if not ref_exists[ref]:
2378 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2379 continue
2380 # Check for file artifact to decide which parts of a
2381 # disassembled composite do exist. If there is only a
2382 # single record we don't even need to look because it can't
2383 # be a composite and must exist.
2384 if len(record_list) == 1:
2385 dataset_records = record_list
2386 else:
2387 dataset_records = [
2388 record
2389 for record in record_list
2390 if artifact_existence[record.file_location(location_factory).uri]
2391 ]
2392 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2394 # Rely on source_records being a defaultdict.
2395 source_records[missing].extend(dataset_records)
2397 # See if we already have these records
2398 target_records = self._get_stored_records_associated_with_refs(local_refs)
2400 # The artifacts to register
2401 artifacts = []
2403 # Refs that already exist
2404 already_present = []
2406 # Now can transfer the artifacts
2407 for source_ref, target_ref in zip(refs, local_refs):
2408 if target_ref.id in target_records:
2409 # Already have an artifact for this.
2410 already_present.append(target_ref)
2411 continue
2413 # mypy needs to know these are always resolved refs
2414 for info in source_records[source_ref.getCheckedId()]:
2415 source_location = info.file_location(source_datastore.locationFactory)
2416 target_location = info.file_location(self.locationFactory)
2417 if source_location == target_location: 2417 ↛ 2421line 2417 didn't jump to line 2421, because the condition on line 2417 was never true
2418 # Either the dataset is already in the target datastore
2419 # (which is how execution butler currently runs) or
2420 # it is an absolute URI.
2421 if source_location.pathInStore.isabs():
2422 # Just because we can see the artifact when running
2423 # the transfer doesn't mean it will be generally
2424 # accessible to a user of this butler. For now warn
2425 # but assume it will be accessible.
2426 log.warning(
2427 "Transfer request for an outside-datastore artifact has been found at %s",
2428 source_location,
2429 )
2430 else:
2431 # Need to transfer it to the new location.
2432 # Assume we should always overwrite. If the artifact
2433 # is there this might indicate that a previous transfer
2434 # was interrupted but was not able to be rolled back
2435 # completely (eg pre-emption) so follow Datastore default
2436 # and overwrite.
2437 target_location.uri.transfer_from(
2438 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2439 )
2441 artifacts.append((target_ref, info))
2443 self._register_datasets(artifacts)
2445 if already_present:
2446 n_skipped = len(already_present)
2447 log.info(
2448 "Skipped transfer of %d dataset%s already present in datastore",
2449 n_skipped,
2450 "" if n_skipped == 1 else "s",
2451 )
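The transfer loop skips refs whose records already exist in the target and copies everything else with overwrite enabled. A simplified sketch using pathlib copies instead of the real ResourcePath transfers (not part of the measured source; all inputs are hypothetical):

# Illustrative sketch: skip-or-transfer decision in a datastore transfer.
import shutil
from pathlib import Path
from typing import Dict, Iterable, List, Set, Tuple

def transfer_sketch(refs: Iterable[str], source_paths: Dict[str, Path],
                    target_root: Path, already_present: Set[str]) -> List[Tuple[str, Path]]:
    registered: List[Tuple[str, Path]] = []
    for ref in refs:
        if ref in already_present:
            continue  # record already exists in the target datastore
        source = source_paths[ref]
        target = target_root / source.name
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)  # always overwrite, as above
        registered.append((ref, target))
    return registered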
2453 @transactional
2454 def forget(self, refs: Iterable[DatasetRef]) -> None:
2455 # Docstring inherited.
2456 refs = list(refs)
2457 self.bridge.forget(refs)
2458 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2460 def validateConfiguration(
2461 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
2462 ) -> None:
2463 """Validate some of the configuration for this datastore.
2465 Parameters
2466 ----------
2467 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2468 Entities to test against this configuration. Can be differing
2469 types.
2470 logFailures : `bool`, optional
2471 If `True`, output a log message for every validation error
2472 detected.
2474 Raises
2475 ------
2476 DatastoreValidationError
2477 Raised if there is a validation problem with a configuration.
2478 All the problems are reported in a single exception.
2480 Notes
2481 -----
2482 This method checks that all the supplied entities have valid file
2483 templates and also have formatters defined.
2484 """
2486 templateFailed = None
2487 try:
2488 self.templates.validateTemplates(entities, logFailures=logFailures)
2489 except FileTemplateValidationError as e:
2490 templateFailed = str(e)
2492 formatterFailed = []
2493 for entity in entities:
2494 try:
2495 self.formatterFactory.getFormatterClass(entity)
2496 except KeyError as e:
2497 formatterFailed.append(str(e))
2498 if logFailures: 2498 ↛ 2493line 2498 didn't jump to line 2493, because the condition on line 2498 was never false
2499 log.critical("Formatter failure: %s", e)
2501 if templateFailed or formatterFailed:
2502 messages = []
2503 if templateFailed: 2503 ↛ 2504line 2503 didn't jump to line 2504, because the condition on line 2503 was never true
2504 messages.append(templateFailed)
2505 if formatterFailed: 2505 ↛ 2507line 2505 didn't jump to line 2507, because the condition on line 2505 was never false
2506 messages.append(",".join(formatterFailed))
2507 msg = ";\n".join(messages)
2508 raise DatastoreValidationError(msg)
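Validation failures are accumulated across all entities and reported in a single exception rather than aborting at the first problem. A small generic sketch of that pattern (not part of the measured source):

# Illustrative sketch: collect every validation failure, raise once.
from typing import Callable, Iterable, List

def validate_all(entities: Iterable[str], checks: Iterable[Callable[[str], None]]) -> None:
    failures: List[str] = []
    for entity in entities:
        for check in checks:
            try:
                check(entity)
            except (KeyError, ValueError) as e:
                failures.append(f"{entity}: {e}")
    if failures:
        raise ValueError(";\n".join(failures))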
2510 def getLookupKeys(self) -> Set[LookupKey]:
2511 # Docstring is inherited from base class
2512 return (
2513 self.templates.getLookupKeys()
2514 | self.formatterFactory.getLookupKeys()
2515 | self.constraints.getLookupKeys()
2516 )
2518 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2519 # Docstring is inherited from base class
2520 # The key can be valid in either formatters or templates so we can
2521 # only check the template if it exists
2522 if lookupKey in self.templates:
2523 try:
2524 self.templates[lookupKey].validateTemplate(entity)
2525 except FileTemplateValidationError as e:
2526 raise DatastoreValidationError(e) from e
2528 def export(
2529 self,
2530 refs: Iterable[DatasetRef],
2531 *,
2532 directory: Optional[ResourcePathExpression] = None,
2533 transfer: Optional[str] = "auto",
2534 ) -> Iterable[FileDataset]:
2535 # Docstring inherited from Datastore.export.
2536 if transfer is not None and directory is None: 2536 ↛ 2537line 2536 didn't jump to line 2537, because the condition on line 2536 was never true
2537 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2539 # Force the directory to be a URI object
2540 directoryUri: Optional[ResourcePath] = None
2541 if directory is not None: 2541 ↛ 2544line 2541 didn't jump to line 2544, because the condition on line 2541 was never false
2542 directoryUri = ResourcePath(directory, forceDirectory=True)
2544 if transfer is not None and directoryUri is not None: 2544 ↛ 2549line 2544 didn't jump to line 2549, because the condition on line 2544 was never false
2545 # mypy needs the second test
2546 if not directoryUri.exists(): 2546 ↛ 2547line 2546 didn't jump to line 2547, because the condition on line 2546 was never true
2547 raise FileNotFoundError(f"Export location {directory} does not exist")
2549 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2550 for ref in progress.wrap(refs, "Exporting dataset files"):
2551 fileLocations = self._get_dataset_locations_info(ref)
2552 if not fileLocations: 2552 ↛ 2553line 2552 didn't jump to line 2553, because the condition on line 2552 was never true
2553 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2554 # For now we can not export disassembled datasets
2555 if len(fileLocations) > 1:
2556 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2557 location, storedFileInfo = fileLocations[0]
2559 pathInStore = location.pathInStore.path
2560 if transfer is None: 2560 ↛ 2564line 2560 didn't jump to line 2564, because the condition on line 2560 was never true
2561 # TODO: do we also need to return the readStorageClass somehow?
2562 # We will use the path in store directly. If this is an
2563 # absolute URI, preserve it.
2564 if location.pathInStore.isabs():
2565 pathInStore = str(location.uri)
2566 elif transfer == "direct": 2566 ↛ 2568line 2566 didn't jump to line 2568, because the condition on line 2566 was never true
2567 # Use full URIs to the remote store in the export
2568 pathInStore = str(location.uri)
2569 else:
2570 # mypy needs help
2571 assert directoryUri is not None, "directoryUri must be defined to get here"
2572 storeUri = ResourcePath(location.uri)
2574 # if the datastore has an absolute URI to a resource, we
2575 # have two options:
2576 # 1. Keep the absolute URI in the exported YAML
2577 # 2. Allocate a new name in the local datastore and transfer
2578 # it.
2579 # For now go with option 2
2580 if location.pathInStore.isabs(): 2580 ↛ 2581line 2580 didn't jump to line 2581, because the condition on line 2580 was never true
2581 template = self.templates.getTemplate(ref)
2582 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2583 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2585 exportUri = directoryUri.join(pathInStore)
2586 exportUri.transfer_from(storeUri, transfer=transfer)
2588 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
2590 @staticmethod
2591 def computeChecksum(
2592 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192
2593 ) -> Optional[str]:
2594 """Compute the checksum of the supplied file.
2596 Parameters
2597 ----------
2598 uri : `lsst.resources.ResourcePath`
2599 Name of resource to calculate checksum from.
2600 algorithm : `str`, optional
2601 Name of algorithm to use. Must be one of the algorithms supported
2602 by :py:class:`hashlib`.
2603 block_size : `int`
2604 Number of bytes to read from file at one time.
2606 Returns
2607 -------
2608 hexdigest : `str`
2609 Hex digest of the file.
2611 Notes
2612 -----
2613 Currently returns None if the URI is for a remote resource.
2614 """
2615 if algorithm not in hashlib.algorithms_guaranteed: 2615 ↛ 2616line 2615 didn't jump to line 2616, because the condition on line 2615 was never true
2616 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2618 if not uri.isLocal: 2618 ↛ 2619line 2618 didn't jump to line 2619, because the condition on line 2618 was never true
2619 return None
2621 hasher = hashlib.new(algorithm)
2623 with uri.as_local() as local_uri:
2624 with open(local_uri.ospath, "rb") as f:
2625 for chunk in iter(lambda: f.read(block_size), b""):
2626 hasher.update(chunk)
2628 return hasher.hexdigest()
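The checksum is computed by streaming the file in fixed-size blocks so arbitrarily large artifacts never need to be held in memory. The same pattern with plain hashlib for a local path (not part of the measured source):

# Illustrative sketch: chunked checksum of a local file.
import hashlib
from pathlib import Path

def checksum(path: Path, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    if algorithm not in hashlib.algorithms_guaranteed:
        raise NameError(f"Algorithm {algorithm!r} is not guaranteed by hashlib")
    hasher = hashlib.new(algorithm)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()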
2630 def needs_expanded_data_ids(
2631 self,
2632 transfer: Optional[str],
2633 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2634 ) -> bool:
2635 # Docstring inherited.
2636 # This _could_ also use entity to inspect whether the filename template
2637 # involves placeholders other than the required dimensions for its
2638 # dataset type, but that's not necessary for correctness; it just
2639 # enables more optimizations (perhaps only in theory).
2640 return transfer not in ("direct", None)