Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 83%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 Any,
34 ClassVar,
35 Dict,
36 Iterable,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Tuple,
42 Type,
43 Union,
44)
46from lsst.daf.butler import (
47 CompositesMap,
48 Config,
49 DatasetId,
50 DatasetRef,
51 DatasetType,
52 DatasetTypeNotSupportedError,
53 Datastore,
54 DatastoreCacheManager,
55 DatastoreConfig,
56 DatastoreDisabledCacheManager,
57 DatastoreValidationError,
58 FileDataset,
59 FileDescriptor,
60 FileTemplates,
61 FileTemplateValidationError,
62 Formatter,
63 FormatterFactory,
64 Location,
65 LocationFactory,
66 Progress,
67 StorageClass,
68 StoredFileInfo,
69 ddl,
70)
71from lsst.daf.butler.core.repoRelocation import replaceRoot
72from lsst.daf.butler.core.utils import transactional
73from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
74from lsst.resources import ResourcePath, ResourcePathExpression
75from lsst.utils.introspection import get_class_of, get_instance_of
76from lsst.utils.iteration import chunk_iterable
78# For VERBOSE logging usage.
79from lsst.utils.logging import VERBOSE, getLogger
80from lsst.utils.timer import time_this
81from sqlalchemy import BigInteger, String
83from .genericDatastore import GenericBaseDatastore
85if TYPE_CHECKING: 85 ↛ 86 (line 85 didn't jump to line 86, because the condition on line 85 was never true)
86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
89log = getLogger(__name__)
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
101 def __init__(self, datasets: List[FileDataset]):
102 super().__init__(ref for dataset in datasets for ref in dataset.refs)
103 self.datasets = datasets
106@dataclass(frozen=True)
107class DatastoreFileGetInformation:
108 """Collection of useful parameters needed to retrieve a file from
109 a Datastore.
110 """
112 location: Location
113 """The location from which to read the dataset."""
115 formatter: Formatter
116 """The `Formatter` to use to deserialize the dataset."""
118 info: StoredFileInfo
119 """Stored information about this file and its formatter."""
121 assemblerParams: Dict[str, Any]
122 """Parameters to use for post-processing the retrieved dataset."""
124 formatterParams: Dict[str, Any]
125 """Parameters that were understood by the associated formatter."""
127 component: Optional[str]
128 """The component to be retrieved (can be `None`)."""
130 readStorageClass: StorageClass
131 """The `StorageClass` of the dataset being read."""
134class FileDatastore(GenericBaseDatastore):
135 """Generic Datastore for file-based implementations.
137 Should always be sub-classed since key abstract methods are missing.
139 Parameters
140 ----------
141 config : `DatastoreConfig` or `str`
142 Configuration as either a `Config` object or URI to file.
143 bridgeManager : `DatastoreRegistryBridgeManager`
144 Object that manages the interface between `Registry` and datastores.
145 butlerRoot : `str`, optional
146 New datastore root to use to override the configuration value.
148 Raises
149 ------
150 ValueError
151 If root location does not exist and ``create`` is `False` in the
152 configuration.
153 """
155 defaultConfigFile: ClassVar[Optional[str]] = None
156 """Path to configuration defaults. Accessed within the ``config`` resource
157 or relative to a search path. Can be None if no defaults specified.
158 """
160 root: ResourcePath
161 """Root directory URI of this `Datastore`."""
163 locationFactory: LocationFactory
164 """Factory for creating locations relative to the datastore root."""
166 formatterFactory: FormatterFactory
167 """Factory for creating instances of formatters."""
169 templates: FileTemplates
170 """File templates that can be used by this `Datastore`."""
172 composites: CompositesMap
173 """Determines whether a dataset should be disassembled on put."""
175 defaultConfigFile = "datastores/fileDatastore.yaml"
176 """Path to configuration defaults. Accessed within the ``config`` resource
177 or relative to a search path. Can be None if no defaults specified.
178 """
180 @classmethod
181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
182 """Set any filesystem-dependent config options for this Datastore to
183 be appropriate for a new empty repository with the given root.
185 Parameters
186 ----------
187 root : `str`
188 URI to the root of the data repository.
189 config : `Config`
190 A `Config` to update. Only the subset understood by
191 this component will be updated. Will not expand
192 defaults.
193 full : `Config`
194 A complete config with all defaults expanded that can be
195 converted to a `DatastoreConfig`. Read-only and will not be
196 modified by this method.
197 Repository-specific options that should not be obtained
198 from defaults when Butler instances are constructed
199 should be copied from ``full`` to ``config``.
200 overwrite : `bool`, optional
201 If `False`, do not modify a value in ``config`` if the value
202 already exists. Default is always to overwrite with the provided
203 ``root``.
205 Notes
206 -----
207 If a keyword is explicitly defined in the supplied ``config`` it
208 will not be overridden by this method if ``overwrite`` is `False`.
209 This allows explicit values set in external configs to be retained.
210 """
211 Config.updateParameters(
212 DatastoreConfig,
213 config,
214 full,
215 toUpdate={"root": root},
216 toCopy=("cls", ("records", "table")),
217 overwrite=overwrite,
218 )
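# Illustrative sketch (not part of this module): the ``overwrite`` semantics
# described in the Notes above, reduced to a plain-dict analogue. The helper
# name ``_set_root`` is hypothetical and used only for illustration.
def _set_root(config: dict, root: str, overwrite: bool = True) -> dict:
    """Set ``root`` unless it is already present and overwrite is False."""
    if overwrite or "root" not in config:
        config["root"] = root
    return config

assert _set_root({}, "/repo/new")["root"] == "/repo/new"
assert _set_root({"root": "/repo/old"}, "/repo/new", overwrite=False)["root"] == "/repo/old"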
220 @classmethod
221 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
222 return ddl.TableSpec(
223 fields=[
224 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
225 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
226 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
227 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
228 # Use empty string to indicate no component
229 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
230 # TODO: should checksum be Base64Bytes instead?
231 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
232 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
233 ],
234 unique=frozenset(),
235 indexes=[tuple(["path"])],
236 )
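# Illustrative sketch (not part of this module): one row of the records table
# defined above, shown as a plain dict. All values are hypothetical
# placeholders; the keys mirror the FieldSpec names.
_example_record = {
    "dataset_id": "00000000-0000-0000-0000-000000000000",  # primary key; type set by the registry
    "path": "sub/dir/some_dataset.fits",  # relative to the datastore root
    "formatter": "mypackage.formatters.ExampleFormatter",  # hypothetical formatter class name
    "storage_class": "ExampleStorageClass",
    "component": "",  # empty string means "no component"
    "checksum": None,  # only populated when checksums are enabled
    "file_size": 123456,
}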
238 def __init__(
239 self,
240 config: Union[DatastoreConfig, str],
241 bridgeManager: DatastoreRegistryBridgeManager,
242 butlerRoot: Optional[str] = None,
243 ):
244 super().__init__(config, bridgeManager)
245 if "root" not in self.config: 245 ↛ 246 (line 245 didn't jump to line 246, because the condition on line 245 was never true)
246 raise ValueError("No root directory specified in configuration")
248 # Name ourselves either using an explicit name or a name
249 # derived from the (unexpanded) root
250 if "name" in self.config:
251 self.name = self.config["name"]
252 else:
253 # We use the unexpanded root in the name to indicate that this
254 # datastore can be moved without having to update registry.
255 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
257 # Support repository relocation in config
258 # Existence of self.root is checked in subclass
259 self.root = ResourcePath(
260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
261 )
263 self.locationFactory = LocationFactory(self.root)
264 self.formatterFactory = FormatterFactory()
266 # Now associate formatters with storage classes
267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
269 # Read the file naming templates
270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
272 # See if composites should be disassembled
273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
275 tableName = self.config["records", "table"]
276 try:
277 # Storage of paths and formatters, keyed by dataset_id
278 self._table = bridgeManager.opaque.register(
279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
280 )
281 # Interface to Registry.
282 self._bridge = bridgeManager.register(self.name)
283 except ReadOnlyDatabaseError:
284 # If the database is read only and we just tried and failed to
285 # create a table, it means someone is trying to create a read-only
286 # butler client for an empty repo. That should be okay, as long
287 # as they then try to get any datasets before some other client
288 creates the table. Chances are they're just validating
289 # configuration.
290 pass
292 # Determine whether checksums should be used - default to False
293 self.useChecksum = self.config.get("checksum", False)
295 # Determine whether we can fall back to configuration if a
296 # requested dataset is not known to registry
297 self.trustGetRequest = self.config.get("trust_get_request", False)
299 # Create a cache manager
300 self.cacheManager: AbstractDatastoreCacheManager
301 if "cached" in self.config: 301 ↛ 304 (line 301 didn't jump to line 304, because the condition on line 301 was never false)
302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
303 else:
304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
306 # Check existence and create directory structure if necessary
307 if not self.root.exists():
308 if "create" not in self.config or not self.config["create"]: 308 ↛ 309 (line 308 didn't jump to line 309, because the condition on line 308 was never true)
309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
310 try:
311 self.root.mkdir()
312 except Exception as e:
313 raise ValueError(
314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
315 ) from e
317 def __str__(self) -> str:
318 return str(self.root)
320 @property
321 def bridge(self) -> DatastoreRegistryBridge:
322 return self._bridge
324 def _artifact_exists(self, location: Location) -> bool:
325 """Check that an artifact exists in this datastore at the specified
326 location.
328 Parameters
329 ----------
330 location : `Location`
331 Expected location of the artifact associated with this datastore.
333 Returns
334 -------
335 exists : `bool`
336 `True` if the location can be found, `False` otherwise.
337 """
338 log.debug("Checking if resource exists: %s", location.uri)
339 return location.uri.exists()
341 def _delete_artifact(self, location: Location) -> None:
342 """Delete the artifact from the datastore.
344 Parameters
345 ----------
346 location : `Location`
347 Location of the artifact associated with this datastore.
348 """
349 if location.pathInStore.isabs(): 349 ↛ 350 (line 349 didn't jump to line 350, because the condition on line 349 was never true)
350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
352 try:
353 location.uri.remove()
354 except FileNotFoundError:
355 log.debug("File %s did not exist and so could not be deleted.", location.uri)
356 raise
357 except Exception as e:
358 log.critical("Failed to delete file: %s (%s)", location.uri, e)
359 raise
360 log.debug("Successfully deleted file: %s", location.uri)
362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
363 # Docstring inherited from GenericBaseDatastore
364 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
365 self._table.insert(*records)
367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
368 # Docstring inherited from GenericBaseDatastore
370 # Look for the dataset_id -- there might be multiple matches
371 # if we have disassembled the dataset.
372 records = self._table.fetch(dataset_id=ref.id)
373 return [StoredFileInfo.from_record(record) for record in records]
375 def _get_stored_records_associated_with_refs(
376 self, refs: Iterable[DatasetIdRef]
377 ) -> Dict[DatasetId, List[StoredFileInfo]]:
378 """Retrieve all records associated with the provided refs.
380 Parameters
381 ----------
382 refs : iterable of `DatasetIdRef`
383 The refs for which records are to be retrieved.
385 Returns
386 -------
387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
388 The matching records indexed by the ref ID. The number of entries
389 in the dict can be smaller than the number of requested refs.
390 """
391 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
393 # Uniqueness is dataset_id + component so can have multiple records
394 # per ref.
395 records_by_ref = defaultdict(list)
396 for record in records:
397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
398 return records_by_ref
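# Illustrative sketch (not part of this module): the grouping pattern used
# above, shown with plain dicts standing in for opaque-table rows. The
# example data is hypothetical.
from collections import defaultdict

_rows = [
    {"dataset_id": 1, "component": "image"},
    {"dataset_id": 1, "component": "mask"},
    {"dataset_id": 2, "component": ""},
]
_by_ref = defaultdict(list)
for _row in _rows:
    _by_ref[_row["dataset_id"]].append(_row)
assert len(_by_ref[1]) == 2 and len(_by_ref[2]) == 1  # disassembled datasets yield >1 row per ref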
400 def _refs_associated_with_artifacts(
401 self, paths: List[Union[str, ResourcePath]]
402 ) -> Dict[str, Set[DatasetId]]:
403 """Return paths and associated dataset refs.
405 Parameters
406 ----------
407 paths : `list` of `str` or `lsst.resources.ResourcePath`
408 All the paths to include in search.
410 Returns
411 -------
412 mapping : `dict` of [`str`, `set` [`DatasetId`]]
413 Mapping of each path to a set of associated database IDs.
414 """
415 records = self._table.fetch(path=[str(path) for path in paths])
416 result = defaultdict(set)
417 for row in records:
418 result[row["path"]].add(row["dataset_id"])
419 return result
421 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]:
422 """Return all dataset refs associated with the supplied path.
424 Parameters
425 ----------
426 pathInStore : `lsst.resources.ResourcePath`
427 Path of interest in the data store.
429 Returns
430 -------
431 ids : `set` of `DatasetId`
432 All `DatasetRef` IDs associated with this path.
433 """
434 records = list(self._table.fetch(path=str(pathInStore)))
435 ids = {r["dataset_id"] for r in records}
436 return ids
438 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
439 # Docstring inherited from GenericBaseDatastore
440 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
442 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
443 r"""Find all the `Location`\ s of the requested dataset in the
444 `Datastore` and the associated stored file information.
446 Parameters
447 ----------
448 ref : `DatasetRef`
449 Reference to the required `Dataset`.
451 Returns
452 -------
453 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
454 Location of the dataset within the datastore and
455 stored information about each file and its formatter.
456 """
457 # Get the file information (this will fail if no file)
458 records = self.getStoredItemsInfo(ref)
460 # Use the path to determine the location -- we need to take
461 # into account absolute URIs in the datastore record
462 return [(r.file_location(self.locationFactory), r) for r in records]
464 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
465 """Check that there is only one dataset associated with the
466 specified artifact.
468 Parameters
469 ----------
470 ref : `DatasetRef` or `FakeDatasetRef`
471 Dataset to be removed.
472 location : `Location`
473 The location of the artifact to be removed.
475 Returns
476 -------
477 can_remove : `bool`
478 `True` if the artifact can be safely removed.
479 """
480 # Can't ever delete absolute URIs.
481 if location.pathInStore.isabs():
482 return False
484 # Get all entries associated with this path
485 allRefs = self._registered_refs_per_artifact(location.pathInStore)
486 if not allRefs:
487 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
489 # Remove these refs from all the refs and if there is nothing left
490 # then we can delete
491 remainingRefs = allRefs - {ref.id}
493 if remainingRefs:
494 return False
495 return True
497 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]:
498 """Predict the location and related file information of the requested
499 dataset in this datastore.
501 Parameters
502 ----------
503 ref : `DatasetRef`
504 Reference to the required `Dataset`.
506 Returns
507 -------
508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
509 Expected Location of the dataset within the datastore and
510 placeholder information about each file and its formatter.
512 Notes
513 -----
514 Uses the current configuration to determine how we would expect the
515 datastore files to have been written if we couldn't ask registry.
516 This is safe so long as there has been no change to datastore
517 configuration between writing the dataset and wanting to read it.
518 Will not work for files that have been ingested without using the
519 standard file template or default formatter.
520 """
522 # If we have a component ref we always need to ask the questions
523 # of the composite. If the composite is disassembled this routine
524 # should return all components. If the composite was not
525 # disassembled the composite is what is stored regardless of
526 # component request. Note that if the caller has disassembled
527 # a composite there is no way for this guess to know that
528 # without trying both the composite and component ref and seeing
529 # if there is something at the component Location even without
530 # disassembly being enabled.
531 if ref.datasetType.isComponent():
532 ref = ref.makeCompositeRef()
534 # See if the ref is a composite that should be disassembled
535 doDisassembly = self.composites.shouldBeDisassembled(ref)
537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
539 if doDisassembly:
540 for component, componentStorage in ref.datasetType.storageClass.components.items():
541 compRef = ref.makeComponentRef(component)
542 location, formatter = self._determine_put_formatter_location(compRef)
543 all_info.append((location, formatter, componentStorage, component))
545 else:
546 # Always use the composite ref if no disassembly
547 location, formatter = self._determine_put_formatter_location(ref)
548 all_info.append((location, formatter, ref.datasetType.storageClass, None))
550 # Convert the list of tuples to have StoredFileInfo as second element
551 return [
552 (
553 location,
554 StoredFileInfo(
555 formatter=formatter,
556 path=location.pathInStore.path,
557 storageClass=storageClass,
558 component=component,
559 checksum=None,
560 file_size=-1,
561 ),
562 )
563 for location, formatter, storageClass, component in all_info
564 ]
566 def _prepare_for_get(
567 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
568 ) -> List[DatastoreFileGetInformation]:
569 """Check parameters for ``get`` and obtain formatter and
570 location.
572 Parameters
573 ----------
574 ref : `DatasetRef`
575 Reference to the required Dataset.
576 parameters : `dict`
577 `StorageClass`-specific parameters that specify, for example,
578 a slice of the dataset to be loaded.
580 Returns
581 -------
582 getInfo : `list` [`DatastoreFileGetInformation`]
583 Parameters needed to retrieve each file.
584 """
585 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
587 # Get file metadata and internal metadata
588 fileLocations = self._get_dataset_locations_info(ref)
589 if not fileLocations:
590 if not self.trustGetRequest:
591 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
592 # Assume the dataset is where we think it should be
593 fileLocations = self._get_expected_dataset_locations_info(ref)
595 # The storage class we want to use eventually
596 refStorageClass = ref.datasetType.storageClass
598 if len(fileLocations) > 1:
599 disassembled = True
601 # If trust is involved it is possible that there will be
602 # components listed here that do not exist in the datastore.
603 # Explicitly check for file artifact existence and filter out any
604 # that are missing.
605 if self.trustGetRequest:
606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
608 # For now complain only if we have no components at all. One
609 # component is probably a problem but we can punt that to the
610 # assembler.
611 if not fileLocations: 611 ↛ 612 (line 611 didn't jump to line 612, because the condition on line 611 was never true)
612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
614 else:
615 disassembled = False
617 # Is this a component request?
618 refComponent = ref.datasetType.component()
620 fileGetInfo = []
621 for location, storedFileInfo in fileLocations:
623 # The storage class used to write the file
624 writeStorageClass = storedFileInfo.storageClass
626 # If this has been disassembled we need read to match the write
627 if disassembled:
628 readStorageClass = writeStorageClass
629 else:
630 readStorageClass = refStorageClass
632 formatter = get_instance_of(
633 storedFileInfo.formatter,
634 FileDescriptor(
635 location,
636 readStorageClass=readStorageClass,
637 storageClass=writeStorageClass,
638 parameters=parameters,
639 ),
640 ref.dataId,
641 )
643 formatterParams, notFormatterParams = formatter.segregateParameters()
645 # Of the remaining parameters, extract the ones supported by
646 # this StorageClass (for components not all will be handled)
647 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
649 # The ref itself could be a component if the dataset was
650 # disassembled by butler, or we disassembled in datastore and
651 # components came from the datastore records
652 component = storedFileInfo.component if storedFileInfo.component else refComponent
654 fileGetInfo.append(
655 DatastoreFileGetInformation(
656 location,
657 formatter,
658 storedFileInfo,
659 assemblerParams,
660 formatterParams,
661 component,
662 readStorageClass,
663 )
664 )
666 return fileGetInfo
668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
669 """Check the arguments for ``put`` and obtain formatter and
670 location.
672 Parameters
673 ----------
674 inMemoryDataset : `object`
675 The dataset to store.
676 ref : `DatasetRef`
677 Reference to the associated Dataset.
679 Returns
680 -------
681 location : `Location`
682 The location to write the dataset.
683 formatter : `Formatter`
684 The `Formatter` to use to write the dataset.
686 Raises
687 ------
688 TypeError
689 Supplied object and storage class are inconsistent.
690 DatasetTypeNotSupportedError
691 The associated `DatasetType` is not handled by this datastore.
692 """
693 self._validate_put_parameters(inMemoryDataset, ref)
694 return self._determine_put_formatter_location(ref)
696 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
697 """Calculate the formatter and output location to use for put.
699 Parameters
700 ----------
701 ref : `DatasetRef`
702 Reference to the associated Dataset.
704 Returns
705 -------
706 location : `Location`
707 The location to write the dataset.
708 formatter : `Formatter`
709 The `Formatter` to use to write the dataset.
710 """
711 # Work out output file name
712 try:
713 template = self.templates.getTemplate(ref)
714 except KeyError as e:
715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
717 # Validate the template to protect against filenames from different
718 # dataIds returning the same and causing overwrite confusion.
719 template.validateTemplate(ref)
721 location = self.locationFactory.fromPath(template.format(ref))
723 # Get the formatter based on the storage class
724 storageClass = ref.datasetType.storageClass
725 try:
726 formatter = self.formatterFactory.getFormatter(
727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
728 )
729 except KeyError as e:
730 raise DatasetTypeNotSupportedError(
731 f"Unable to find formatter for {ref} in datastore {self.name}"
732 ) from e
734 # Now that we know the formatter, update the location
735 location = formatter.makeUpdatedLocation(location)
737 return location, formatter
739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
740 # Docstring inherited from base class
741 if transfer != "auto":
742 return transfer
744 # See if the paths are within the datastore or not
745 inside = [self._pathInStore(d.path) is not None for d in datasets]
747 if all(inside):
748 transfer = None
749 elif not any(inside): 749 ↛ 758 (line 749 didn't jump to line 758, because the condition on line 749 was never false)
750 # Allow ResourcePath to use its own knowledge
751 transfer = "auto"
752 else:
753 # This can happen when importing from a datastore that
754 # has had some datasets ingested using "direct" mode.
755 # Also allow ResourcePath to sort it out but warn about it.
758 log.warning(
759 "Some datasets are inside the datastore and some are outside. Using 'split' "
760 "transfer mode. This assumes that the files outside the datastore are "
761 "still accessible to the new butler since they will not be copied into "
762 "the target datastore."
763 )
764 transfer = "split"
766 return transfer
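# Illustrative sketch (not part of this module): the transfer-mode decision
# above, reduced to its essentials. ``_resolve_auto_transfer`` is a
# hypothetical name used only for illustration.
from typing import Optional, Sequence

def _resolve_auto_transfer(inside: Sequence[bool]) -> Optional[str]:
    """Map per-dataset "is inside the datastore" flags to a transfer mode."""
    if all(inside):
        return None          # already in place; no transfer needed
    if not any(inside):
        return "auto"        # let ResourcePath decide how to transfer
    return "split"           # mixed: files outside the root are ingested by reference

assert _resolve_auto_transfer([True, True]) is None
assert _resolve_auto_transfer([False, False]) == "auto"
assert _resolve_auto_transfer([True, False]) == "split"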
768 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]:
769 """Return path relative to the datastore root.
771 Parameters
772 ----------
773 path : `lsst.resources.ResourcePathExpression`
774 Path to dataset. Can be absolute URI. If relative assumed to
775 be relative to the datastore. Returns the path within the
776 datastore, or `None` if the path is outside the root.
778 Returns
779 -------
780 inStore : `str` or `None`
781 Path relative to datastore root. Returns `None` if the file is
782 outside the root.
783 """
784 # Relative path will always be relative to datastore
785 pathUri = ResourcePath(path, forceAbsolute=False)
786 return pathUri.relative_to(self.root)
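# Illustrative sketch (not part of this module): how ResourcePath.relative_to
# decides whether a path lies inside the datastore root. Paths are
# hypothetical, this assumes ``lsst.resources`` is importable, and the
# outputs shown in comments are the expected behaviour.
from lsst.resources import ResourcePath

_root = ResourcePath("/repo/main/", forceDirectory=True)
print(ResourcePath("/repo/main/raw/file.fits").relative_to(_root))  # -> "raw/file.fits"
print(ResourcePath("/elsewhere/file.fits").relative_to(_root))      # -> None (outside the root)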
788 def _standardizeIngestPath(
789 self, path: ResourcePathExpression, *, transfer: Optional[str] = None
790 ) -> Union[str, ResourcePath]:
791 """Standardize the path of a to-be-ingested file.
793 Parameters
794 ----------
795 path : `lsst.resources.ResourcePathExpression`
796 Path of a file to be ingested.
797 transfer : `str`, optional
798 How (and whether) the dataset should be added to the datastore.
799 See `ingest` for details of transfer modes.
800 This implementation is provided only so
801 `NotImplementedError` can be raised if the mode is not supported;
802 actual transfers are deferred to `_extractIngestInfo`.
804 Returns
805 -------
806 path : `str` or `lsst.resources.ResourcePath`
807 New path in what the datastore considers standard form. If an
808 absolute URI was given that will be returned unchanged.
810 Notes
811 -----
812 Subclasses of `FileDatastore` can implement this method instead
813 of `_prepIngest`. It should not modify the data repository or given
814 file in any way.
816 Raises
817 ------
818 NotImplementedError
819 Raised if the datastore does not support the given transfer mode
820 (including the case where ingest is not supported at all).
821 FileNotFoundError
822 Raised if one of the given files does not exist.
823 """
824 if transfer not in (None, "direct", "split") + self.root.transferModes: 824 ↛ 825 (line 824 didn't jump to line 825, because the condition on line 824 was never true)
825 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
827 # A relative URI indicates relative to datastore root
828 srcUri = ResourcePath(path, forceAbsolute=False)
829 if not srcUri.isabs():
830 srcUri = self.root.join(path)
832 if not srcUri.exists():
833 raise FileNotFoundError(
834 f"Resource at {srcUri} does not exist; note that paths to ingest "
835 f"are assumed to be relative to {self.root} unless they are absolute."
836 )
838 if transfer is None:
839 relpath = srcUri.relative_to(self.root)
840 if not relpath:
841 raise RuntimeError(
842 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
843 )
845 # Return the relative path within the datastore for internal
846 # transfer
847 path = relpath
849 return path
851 def _extractIngestInfo(
852 self,
853 path: ResourcePathExpression,
854 ref: DatasetRef,
855 *,
856 formatter: Union[Formatter, Type[Formatter]],
857 transfer: Optional[str] = None,
858 ) -> StoredFileInfo:
859 """Relocate (if necessary) and extract `StoredFileInfo` from a
860 to-be-ingested file.
862 Parameters
863 ----------
864 path : `lsst.resources.ResourcePathExpression`
865 URI or path of a file to be ingested.
866 ref : `DatasetRef`
867 Reference for the dataset being ingested. Guaranteed to have
868 ``dataset_id is not None``.
869 formatter : `type` or `Formatter`
870 `Formatter` subclass to use for this dataset or an instance.
871 transfer : `str`, optional
872 How (and whether) the dataset should be added to the datastore.
873 See `ingest` for details of transfer modes.
875 Returns
876 -------
877 info : `StoredFileInfo`
878 Internal datastore record for this file. This will be inserted by
879 the caller; the `_extractIngestInfo` is only responsible for
880 creating and populating the struct.
882 Raises
883 ------
884 FileNotFoundError
885 Raised if one of the given files does not exist.
886 FileExistsError
887 Raised if transfer is not `None` but the (internal) location the
888 file would be moved to is already occupied.
889 """
890 if self._transaction is None: 890 ↛ 891 (line 890 didn't jump to line 891, because the condition on line 890 was never true)
891 raise RuntimeError("Ingest called without transaction enabled")
893 # Create URI of the source path, do not need to force a relative
894 # path to absolute.
895 srcUri = ResourcePath(path, forceAbsolute=False)
897 # Track whether we have read the size of the source yet
898 have_sized = False
900 tgtLocation: Optional[Location]
901 if transfer is None or transfer == "split":
902 # A relative path is assumed to be relative to the datastore
903 # in this context
904 if not srcUri.isabs():
905 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
906 else:
907 # Work out the path in the datastore from an absolute URI
908 # This is required to be within the datastore.
909 pathInStore = srcUri.relative_to(self.root)
910 if pathInStore is None and transfer is None: 910 ↛ 911 (line 910 didn't jump to line 911, because the condition on line 910 was never true)
911 raise RuntimeError(
912 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
913 )
914 if pathInStore: 914 ↛ 916 (line 914 didn't jump to line 916, because the condition on line 914 was never false)
915 tgtLocation = self.locationFactory.fromPath(pathInStore)
916 elif transfer == "split":
917 # Outside the datastore but treat that as a direct ingest
918 # instead.
919 tgtLocation = None
920 else:
921 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
922 elif transfer == "direct": 922 ↛ 927 (line 922 didn't jump to line 927, because the condition on line 922 was never true)
923 # Want to store the full URI to the resource directly in
924 # datastore. This is useful for referring to permanent archive
925 # storage for raw data.
926 # Trust that people know what they are doing.
927 tgtLocation = None
928 else:
929 # Work out the name we want this ingested file to have
930 # inside the datastore
931 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
932 if not tgtLocation.uri.dirname().exists():
933 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
934 tgtLocation.uri.dirname().mkdir()
936 # if we are transferring from a local file to a remote location
937 # it may be more efficient to get the size and checksum of the
938 # local file rather than the transferred one
939 if not srcUri.scheme or srcUri.scheme == "file": 939 ↛ 949 (line 939 didn't jump to line 949, because the condition on line 939 was never false)
940 size = srcUri.size()
941 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
942 have_sized = True
944 # Transfer the resource to the destination.
945 # Allow overwrite of an existing file. This matches the behavior
946 # of datastore.put() in that it trusts that registry would not
947 # be asking to overwrite unless registry thought that the
948 # overwrite was allowed.
949 tgtLocation.uri.transfer_from(
950 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
951 )
953 if tgtLocation is None: 953 ↛ 955 (line 953 didn't jump to line 955, because the condition on line 953 was never true)
954 # This means we are using direct mode
955 targetUri = srcUri
956 targetPath = str(srcUri)
957 else:
958 targetUri = tgtLocation.uri
959 targetPath = tgtLocation.pathInStore.path
961 # the file should exist in the datastore now
962 if not have_sized:
963 size = targetUri.size()
964 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
966 return StoredFileInfo(
967 formatter=formatter,
968 path=targetPath,
969 storageClass=ref.datasetType.storageClass,
970 component=ref.datasetType.component(),
971 file_size=size,
972 checksum=checksum,
973 )
975 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
976 # Docstring inherited from Datastore._prepIngest.
977 filtered = []
978 for dataset in datasets:
979 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
980 if not acceptable:
981 continue
982 else:
983 dataset.refs = acceptable
984 if dataset.formatter is None:
985 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
986 else:
987 assert isinstance(dataset.formatter, (type, str))
988 formatter_class = get_class_of(dataset.formatter)
989 if not issubclass(formatter_class, Formatter): 989 ↛ 990 (line 989 didn't jump to line 990, because the condition on line 989 was never true)
990 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
991 dataset.formatter = formatter_class
992 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
993 filtered.append(dataset)
994 return _IngestPrepData(filtered)
996 @transactional
997 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None:
998 # Docstring inherited from Datastore._finishIngest.
999 refsAndInfos = []
1000 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1001 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1002 # Do ingest as if the first dataset ref is associated with the file
1003 info = self._extractIngestInfo(
1004 dataset.path, dataset.refs[0], formatter=dataset.formatter, transfer=transfer
1005 )
1006 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1007 self._register_datasets(refsAndInfos)
1009 def _calculate_ingested_datastore_name(
1010 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]]
1011 ) -> Location:
1012 """Given a source URI and a DatasetRef, determine the name the
1013 dataset will have inside datastore.
1015 Parameters
1016 ----------
1017 srcUri : `lsst.resources.ResourcePath`
1018 URI to the source dataset file.
1019 ref : `DatasetRef`
1020 Ref associated with the newly-ingested dataset artifact. This
1021 is used to determine the name within the datastore.
1022 formatter : `Formatter` or Formatter class.
1023 Formatter to use for validation. Can be a class or an instance.
1025 Returns
1026 -------
1027 location : `Location`
1028 Target location for the newly-ingested dataset.
1029 """
1030 # Ingesting a file from outside the datastore.
1031 # This involves a new name.
1032 template = self.templates.getTemplate(ref)
1033 location = self.locationFactory.fromPath(template.format(ref))
1035 # Get the extension
1036 ext = srcUri.getExtension()
1038 # Update the destination to include that extension
1039 location.updateExtension(ext)
1041 # Ask the formatter to validate this extension
1042 formatter.validateExtension(location)
1044 return location
1046 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1047 """Write out in memory dataset to datastore.
1049 Parameters
1050 ----------
1051 inMemoryDataset : `object`
1052 Dataset to write to datastore.
1053 ref : `DatasetRef`
1054 Registry information associated with this dataset.
1056 Returns
1057 -------
1058 info : `StoredFileInfo`
1059 Information describing the artifact written to the datastore.
1060 """
1061 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1062 uri = location.uri
1064 if not uri.dirname().exists():
1065 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1066 uri.dirname().mkdir()
1068 if self._transaction is None: 1068 ↛ 1069 (line 1068 didn't jump to line 1069, because the condition on line 1068 was never true)
1069 raise RuntimeError("Attempting to write artifact without transaction enabled")
1071 def _removeFileExists(uri: ResourcePath) -> None:
1072 """Remove a file and do not complain if it is not there.
1074 This is important since a formatter might fail before the file
1075 is written and we should not confuse people by writing spurious
1076 error messages to the log.
1077 """
1078 try:
1079 uri.remove()
1080 except FileNotFoundError:
1081 pass
1083 # Register a callback to try to delete the uploaded data if
1084 # something fails below
1085 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1087 # For a local file, simply use the formatter directly
1088 if uri.isLocal:
1089 try:
1090 formatter.write(inMemoryDataset)
1091 except Exception as e:
1092 raise RuntimeError(
1093 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}"
1094 ) from e
1095 log.debug("Successfully wrote python object to local file at %s", uri)
1096 else:
1097 # This is a remote URI. Some datasets can be serialized directly
1098 # to bytes and sent to the remote datastore without writing a
1099 # file. If the dataset is intended to be saved to the cache
1100 # a file is always written and direct write to the remote
1101 # datastore is bypassed.
1102 data_written = False
1103 if not self.cacheManager.should_be_cached(ref):
1104 try:
1105 serializedDataset = formatter.toBytes(inMemoryDataset)
1106 except NotImplementedError:
1107 # Fallback to the file writing option.
1108 pass
1109 except Exception as e:
1110 raise RuntimeError(
1111 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1112 ) from e
1113 else:
1114 log.debug("Writing bytes directly to %s", uri)
1115 uri.write(serializedDataset, overwrite=True)
1116 log.debug("Successfully wrote bytes directly to %s", uri)
1117 data_written = True
1119 if not data_written:
1120 # Did not write the bytes directly to object store so instead
1121 # write to temporary file.
1122 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri:
1123 # Need to configure the formatter to write to a different
1124 # location and that needs us to overwrite internals
1125 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1126 with formatter._updateLocation(Location(None, temporary_uri)):
1127 try:
1128 formatter.write(inMemoryDataset)
1129 except Exception as e:
1130 raise RuntimeError(
1131 f"Failed to serialize dataset {ref} of type"
1132 f" {type(inMemoryDataset)} to "
1133 f"temporary location {temporary_uri}"
1134 ) from e
1135 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True)
1137 # Cache if required
1138 self.cacheManager.move_to_cache(temporary_uri, ref)
1140 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1142 # URI is needed to resolve what ingest case are we dealing with
1143 return self._extractIngestInfo(uri, ref, formatter=formatter)
1145 def _read_artifact_into_memory(
1146 self,
1147 getInfo: DatastoreFileGetInformation,
1148 ref: DatasetRef,
1149 isComponent: bool = False,
1150 cache_ref: Optional[DatasetRef] = None,
1151 ) -> Any:
1152 """Read the artifact from datastore into in memory object.
1154 Parameters
1155 ----------
1156 getInfo : `DatastoreFileGetInformation`
1157 Information about the artifact within the datastore.
1158 ref : `DatasetRef`
1159 The registry information associated with this artifact.
1160 isComponent : `bool`
1161 Flag to indicate if a component is being read from this artifact.
1162 cache_ref : `DatasetRef`, optional
1163 The DatasetRef to use when looking up the file in the cache.
1164 This ref must have the same ID as the supplied ref but can
1165 be a parent ref or component ref to indicate to the cache whether
1166 a composite file is being requested from the cache or a component
1167 file. Without this the cache will default to the supplied ref but
1168 it can get confused with read-only derived components for
1169 disassembled composites.
1171 Returns
1172 -------
1173 inMemoryDataset : `object`
1174 The artifact as a python object.
1175 """
1176 location = getInfo.location
1177 uri = location.uri
1178 log.debug("Accessing data from %s", uri)
1180 if cache_ref is None:
1181 cache_ref = ref
1182 if cache_ref.id != ref.id: 1182 ↛ 1183 (line 1182 didn't jump to line 1183, because the condition on line 1182 was never true)
1183 raise ValueError(
1184 "The supplied cache dataset ref refers to a different dataset than expected:"
1185 f" {ref.id} != {cache_ref.id}"
1186 )
1188 # Cannot recalculate checksum but can compare size as a quick check
1189 # Do not do this if the size is negative since that indicates
1190 # we do not know.
1191 recorded_size = getInfo.info.file_size
1192 resource_size = uri.size()
1193 if recorded_size >= 0 and resource_size != recorded_size: 1193 ↛ 1194 (line 1193 didn't jump to line 1194, because the condition on line 1193 was never true)
1194 raise RuntimeError(
1195 "Integrity failure in Datastore. "
1196 f"Size of file {uri} ({resource_size}) "
1197 f"does not match size recorded in registry of {recorded_size}"
1198 )
1200 # For the general case we have choices for how to proceed.
1201 # 1. Always use a local file (downloading the remote resource to a
1202 # temporary file if needed).
1203 # 2. Use a threshold size and read into memory and use bytes.
1204 # Use both for now with an arbitrary hand off size.
1205 # This allows small datasets to be downloaded from remote object
1206 # stores without requiring a temporary file.
1208 formatter = getInfo.formatter
1209 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1210 if resource_size <= nbytes_max and formatter.can_read_bytes():
1211 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1212 if cached_file is not None:
1213 desired_uri = cached_file
1214 msg = f" (cached version of {uri})"
1215 else:
1216 desired_uri = uri
1217 msg = ""
1218 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1219 serializedDataset = desired_uri.read()
1220 log.debug(
1221 "Deserializing %s from %d bytes from location %s with formatter %s",
1222 f"component {getInfo.component}" if isComponent else "",
1223 len(serializedDataset),
1224 uri,
1225 formatter.name(),
1226 )
1227 try:
1228 result = formatter.fromBytes(
1229 serializedDataset, component=getInfo.component if isComponent else None
1230 )
1231 except Exception as e:
1232 raise ValueError(
1233 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1234 f" ({ref.datasetType.name} from {uri}): {e}"
1235 ) from e
1236 else:
1237 # Read from file.
1239 # Have to update the Location associated with the formatter
1240 # because formatter.read does not allow an override.
1241 # This could be improved.
1242 location_updated = False
1243 msg = ""
1245 # First check in cache for local version.
1246 # The cache will only be relevant for remote resources but
1247 # no harm in always asking. Context manager ensures that cache
1248 # file is not deleted during cache expiration.
1249 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1250 if cached_file is not None:
1251 msg = f"(via cache read of remote file {uri})"
1252 uri = cached_file
1253 location_updated = True
1255 with uri.as_local() as local_uri:
1257 can_be_cached = False
1258 if uri != local_uri: 1258 ↛ 1260 (line 1258 didn't jump to line 1260, because the condition on line 1258 was never true)
1259 # URI was remote and file was downloaded
1260 cache_msg = ""
1261 location_updated = True
1263 if self.cacheManager.should_be_cached(cache_ref):
1264 # In this scenario we want to ask if the downloaded
1265 # file should be cached but we should not cache
1266 # it until after we've used it (to ensure it can't
1267 # be expired whilst we are using it).
1268 can_be_cached = True
1270 # Say that it is "likely" to be cached because
1271 # if the formatter read fails we will not be
1272 # caching this file.
1273 cache_msg = " and likely cached"
1275 msg = f"(via download to local file{cache_msg})"
1277 # Calculate the (possibly) new location for the formatter
1278 # to use.
1279 newLocation = Location(*local_uri.split()) if location_updated else None
1281 log.debug(
1282 "Reading%s from location %s %s with formatter %s",
1283 f" component {getInfo.component}" if isComponent else "",
1284 uri,
1285 msg,
1286 formatter.name(),
1287 )
1288 try:
1289 with formatter._updateLocation(newLocation):
1290 with time_this(
1291 log,
1292 msg="Reading%s from location %s %s with formatter %s",
1293 args=(
1294 f" component {getInfo.component}" if isComponent else "",
1295 uri,
1296 msg,
1297 formatter.name(),
1298 ),
1299 ):
1300 result = formatter.read(component=getInfo.component if isComponent else None)
1301 except Exception as e:
1302 raise ValueError(
1303 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1304 f" ({ref.datasetType.name} from {uri}): {e}"
1305 ) from e
1307 # File was read successfully so can move to cache
1308 if can_be_cached: 1308 ↛ 1309 (line 1308 didn't jump to line 1309, because the condition on line 1308 was never true)
1309 self.cacheManager.move_to_cache(local_uri, cache_ref)
1311 return self._post_process_get(
1312 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent
1313 )
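# Illustrative sketch (not part of this module): the size/capability check
# used above to decide between reading bytes directly and going through a
# local file. ``_use_direct_bytes`` is a hypothetical name; the threshold
# mirrors the arbitrary 10 MB hand-off used in the method.
def _use_direct_bytes(resource_size: int, can_read_bytes: bool,
                      nbytes_max: int = 10_000_000) -> bool:
    """Return True when a small resource can be deserialized straight from bytes."""
    return resource_size <= nbytes_max and can_read_bytes

assert _use_direct_bytes(1_024, True)
assert not _use_direct_bytes(50_000_000, True)   # too large: download to a local file
assert not _use_direct_bytes(1_024, False)       # formatter cannot read bytes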
1315 def knows(self, ref: DatasetRef) -> bool:
1316 """Check if the dataset is known to the datastore.
1318 Does not check for existence of any artifact.
1320 Parameters
1321 ----------
1322 ref : `DatasetRef`
1323 Reference to the required dataset.
1325 Returns
1326 -------
1327 exists : `bool`
1328 `True` if the dataset is known to the datastore.
1329 """
1330 fileLocations = self._get_dataset_locations_info(ref)
1331 if fileLocations:
1332 return True
1333 return False
1335 def _process_mexists_records(
1336 self,
1337 id_to_ref: Dict[DatasetId, DatasetRef],
1338 records: Dict[DatasetId, List[StoredFileInfo]],
1339 all_required: bool,
1340 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
1341 ) -> Dict[DatasetRef, bool]:
1342 """Helper function for mexists that checks the given records.
1344 Parameters
1345 ----------
1346 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1347 Mapping of the dataset ID to the dataset ref itself.
1348 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1349 Records as generally returned by
1350 ``_get_stored_records_associated_with_refs``.
1351 all_required : `bool`
1352 If `True`, all artifacts associated with a dataset ID must exist for
1353 the dataset to be reported as existing; otherwise any one is enough.
1354 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1355 Optional mapping of datastore artifact to existence. Updated by
1356 this method with details of all artifacts tested. Can be `None`
1357 if the caller is not interested.
1359 Returns
1360 -------
1361 existence : `dict` of [`DatasetRef`, `bool`]
1362 Mapping from dataset to boolean indicating existence.
1363 """
1364 # The URIs to be checked and a mapping of those URIs to
1365 # the dataset ID.
1366 uris_to_check: List[ResourcePath] = []
1367 location_map: Dict[ResourcePath, DatasetId] = {}
1369 location_factory = self.locationFactory
1371 for ref_id, info in records.items():
1372 # Key is the dataset ID, value is a list of StoredFileInfo.
1373 uris = [file_info.file_location(location_factory).uri for file_info in info]
1374 uris_to_check.extend(uris)
1375 location_map.update({uri: ref_id for uri in uris})
1377 uri_existence: Dict[ResourcePath, bool] = {}
1378 if artifact_existence is not None:
1379 # If a URI has already been checked remove it from the list
1380 # and immediately add the status to the output dict.
1381 filtered_uris_to_check = []
1382 for uri in uris_to_check:
1383 if uri in artifact_existence:
1384 uri_existence[uri] = artifact_existence[uri]
1385 else:
1386 filtered_uris_to_check.append(uri)
1387 uris_to_check = filtered_uris_to_check
1389 # Results.
1390 dataset_existence: Dict[DatasetRef, bool] = {}
1392 uri_existence.update(ResourcePath.mexists(uris_to_check))
1393 for uri, exists in uri_existence.items():
1394 dataset_id = location_map[uri]
1395 ref = id_to_ref[dataset_id]
1397 # Disassembled composite needs to check all locations.
1398 # all_required indicates whether all need to exist or not.
1399 if ref in dataset_existence:
1400 if all_required:
1401 exists = dataset_existence[ref] and exists
1402 else:
1403 exists = dataset_existence[ref] or exists
1404 dataset_existence[ref] = exists
1406 if artifact_existence is not None:
1407 artifact_existence.update(uri_existence)
1409 return dataset_existence
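# Illustrative sketch (not part of this module): how per-artifact existence
# is folded into per-dataset existence for disassembled composites.
# ``_combine_existence`` is a hypothetical helper used only for illustration.
from typing import Dict, Iterable, Tuple

def _combine_existence(
    checks: Iterable[Tuple[str, bool]], all_required: bool
) -> Dict[str, bool]:
    """Combine (dataset_id, artifact_exists) pairs with AND or OR."""
    existence: Dict[str, bool] = {}
    for dataset_id, exists in checks:
        if dataset_id in existence:
            if all_required:
                exists = existence[dataset_id] and exists
            else:
                exists = existence[dataset_id] or exists
        existence[dataset_id] = exists
    return existence

# With all_required=True a single missing component marks the dataset missing.
assert _combine_existence([("a", True), ("a", False)], all_required=True) == {"a": False}
assert _combine_existence([("a", True), ("a", False)], all_required=False) == {"a": True}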
1411 def mexists(
1412 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1413 ) -> Dict[DatasetRef, bool]:
1414 """Check the existence of multiple datasets at once.
1416 Parameters
1417 ----------
1418 refs : iterable of `DatasetRef`
1419 The datasets to be checked.
1420 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1421 Optional mapping of datastore artifact to existence. Updated by
1422 this method with details of all artifacts tested. Can be `None`
1423 if the caller is not interested.
1425 Returns
1426 -------
1427 existence : `dict` of [`DatasetRef`, `bool`]
1428 Mapping from dataset to boolean indicating existence.
1429 """
1430 chunk_size = 10_000
1431 dataset_existence: Dict[DatasetRef, bool] = {}
1432 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1433 n_found_total = 0
1434 n_checked = 0
1435 n_chunks = 0
1436 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1437 chunk_result = self._mexists(chunk, artifact_existence)
1438 if log.isEnabledFor(VERBOSE):
1439 n_results = len(chunk_result)
1440 n_checked += n_results
1441 # Can treat the booleans as 0, 1 integers and sum them.
1442 n_found = sum(chunk_result.values())
1443 n_found_total += n_found
1444 log.verbose(
1445 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)",
1446 n_chunks,
1447 n_found,
1448 n_results,
1449 n_found_total,
1450 n_checked,
1451 )
1452 dataset_existence.update(chunk_result)
1453 n_chunks += 1
1455 return dataset_existence
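# Illustrative sketch (not part of this module): the chunking pattern used by
# ``mexists`` above. ``chunk_iterable`` is the helper imported at the top of
# this module (assumes ``lsst.utils`` is installed); the refs here are
# hypothetical integer stand-ins for DatasetRef objects.
from lsst.utils.iteration import chunk_iterable

fake_refs = list(range(25_000))
for chunk in chunk_iterable(fake_refs, chunk_size=10_000):
    # Each chunk is processed independently so memory stays bounded even for
    # very large ref collections (here: chunks of 10000, 10000 and 5000).
    pass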
1457 def _mexists(
1458 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1459 ) -> Dict[DatasetRef, bool]:
1460 """Check the existence of multiple datasets at once.
1462 Parameters
1463 ----------
1464 refs : iterable of `DatasetRef`
1465 The datasets to be checked.
1467 Returns
1468 -------
1469 existence : `dict` of [`DatasetRef`, `bool`]
1470 Mapping from dataset to boolean indicating existence.
1471 """
1472 # Need a mapping of dataset_id to dataset ref since the API
1473 # works with dataset_id
1474 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1476 # Set of all IDs we are checking for.
1477 requested_ids = set(id_to_ref.keys())
1479 # The records themselves. Could be missing some entries.
1480 records = self._get_stored_records_associated_with_refs(refs)
1482 dataset_existence = self._process_mexists_records(
1483 id_to_ref, records, True, artifact_existence=artifact_existence
1484 )
1486 # Set of IDs that have been handled.
1487 handled_ids = {ref.id for ref in dataset_existence.keys()}
1489 missing_ids = requested_ids - handled_ids
1490 if missing_ids:
1491 if not self.trustGetRequest:
1492 # Must assume these do not exist
1493 for missing in missing_ids:
1494 dataset_existence[id_to_ref[missing]] = False
1495 else:
1496 log.debug(
1497 "%d out of %d datasets were not known to datastore during initial existence check.",
1498 len(missing_ids),
1499 len(requested_ids),
1500 )
1502 # Construct data structure identical to that returned
1503 # by _get_stored_records_associated_with_refs() but using
1504 # guessed names.
1505 records = {}
1506 for missing in missing_ids:
1507 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1508 records[missing] = [info for _, info in expected]
1510 dataset_existence.update(
1511 self._process_mexists_records(
1512 id_to_ref, records, False, artifact_existence=artifact_existence
1513 )
1514 )
1516 return dataset_existence
1518 def exists(self, ref: DatasetRef) -> bool:
1519 """Check if the dataset exists in the datastore.
1521 Parameters
1522 ----------
1523 ref : `DatasetRef`
1524 Reference to the required dataset.
1526 Returns
1527 -------
1528 exists : `bool`
1529 `True` if the entity exists in the `Datastore`.
1530 """
1531 fileLocations = self._get_dataset_locations_info(ref)
1533 # if we are being asked to trust that registry might not be correct
1534 # we ask for the expected locations and check them explicitly
1535 if not fileLocations:
1536 if not self.trustGetRequest:
1537 return False
1539 # When we are guessing a dataset location we can not check
1540 # for the existence of every component since we can not
1541 # know if every component was written. Instead we check
1542 # for the existence of any of the expected locations.
1543 for location, _ in self._get_expected_dataset_locations_info(ref): 1543 ↛ 1546 (line 1543 didn't jump to line 1546, because the loop on line 1543 didn't complete)
1544 if self._artifact_exists(location): 1544 ↛ 1543 (line 1544 didn't jump to line 1543, because the condition on line 1544 was never false)
1545 return True
1546 return False
1548 # All listed artifacts must exist.
1549 for location, _ in fileLocations:
1550 if not self._artifact_exists(location):
1551 return False
1553 return True
1555 def getURIs(
1556 self, ref: DatasetRef, predict: bool = False
1557 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
1558 """Return URIs associated with dataset.
1560 Parameters
1561 ----------
1562 ref : `DatasetRef`
1563 Reference to the required dataset.
1564 predict : `bool`, optional
1565 If the datastore does not know about the dataset, should it
1566 return a predicted URI or not?
1568 Returns
1569 -------
1570 primary : `lsst.resources.ResourcePath`
1571 The URI to the primary artifact associated with this dataset.
1572 If the dataset was disassembled within the datastore this
1573 may be `None`.
1574 components : `dict`
1575 URIs to any components associated with the dataset artifact.
1576 Can be empty if there are no components.
1577 """
1579 primary: Optional[ResourcePath] = None
1580 components: Dict[str, ResourcePath] = {}
1582 # if this has never been written then we have to guess
1583 if not self.exists(ref):
1584 if not predict:
1585 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1587 doDisassembly = self.composites.shouldBeDisassembled(ref)
1589 if doDisassembly:
1591 for component, componentStorage in ref.datasetType.storageClass.components.items():
1592 compRef = ref.makeComponentRef(component)
1593 compLocation, _ = self._determine_put_formatter_location(compRef)
1595 # Add a URI fragment to indicate this is a guess
1596 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted")
1598 else:
1600 location, _ = self._determine_put_formatter_location(ref)
1602 # Add a URI fragment to indicate this is a guess
1603 primary = ResourcePath(location.uri.geturl() + "#predicted")
1605 return primary, components
1607 # If this is a ref that we have written we can get the path.
1608 # Get file metadata and internal metadata
1609 fileLocations = self._get_dataset_locations_info(ref)
1611 guessing = False
1612 if not fileLocations:
1613 if not self.trustGetRequest: 1613 ↛ 1614line 1613 didn't jump to line 1614, because the condition on line 1613 was never true
1614 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1615 fileLocations = self._get_expected_dataset_locations_info(ref)
1616 guessing = True
1618 if len(fileLocations) == 1:
1619 # No disassembly so this is the primary URI
1620 uri = fileLocations[0][0].uri
1621 if guessing and not uri.exists(): 1621 ↛ 1622line 1621 didn't jump to line 1622, because the condition on line 1621 was never true
1622 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1623 primary = uri
1625 else:
1626 for location, storedFileInfo in fileLocations:
1627 if storedFileInfo.component is None: 1627 ↛ 1628line 1627 didn't jump to line 1628, because the condition on line 1627 was never true
1628 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1629 uri = location.uri
1630 if guessing and not uri.exists(): 1630 ↛ 1634line 1630 didn't jump to line 1634, because the condition on line 1630 was never true
1631 # If we are trusting then it is entirely possible for
1632 # some components to be missing. In that case we skip
1633 # to the next component.
1634 if self.trustGetRequest:
1635 continue
1636 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1637 components[storedFileInfo.component] = uri
1639 return primary, components
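# Illustrative sketch (``datastore`` and ``ref`` assumed): for a disassembled
# dataset the primary URI may be None and each component is reported
# separately; predicted URIs carry the "#predicted" fragment.
#
#     primary, components = datastore.getURIs(ref, predict=True)
#     if primary is not None:
#         print("single artifact:", primary)
#     for comp, uri in components.items():
#         print(comp, uri, uri.geturl().endswith("#predicted"))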
1641 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1642 """URI to the Dataset.
1644 Parameters
1645 ----------
1646 ref : `DatasetRef`
1647 Reference to the required Dataset.
1648 predict : `bool`
1649 If `True`, allow URIs to be returned of datasets that have not
1650 been written.
1652 Returns
1653 -------
1654 uri : `lsst.resources.ResourcePath`
1655 URI pointing to the dataset within the datastore. If the
1656 dataset does not exist in the datastore, and if ``predict`` is
1657 `True`, the URI will be a prediction and will include a URI
1658 fragment "#predicted".
1659 If the datastore does not have entities that relate well
1660 to the concept of a URI the returned URI will be
1661 descriptive. The returned URI is not guaranteed to be obtainable.
1663 Raises
1664 ------
1665 FileNotFoundError
1666 Raised if a URI has been requested for a dataset that does not
1667 exist and guessing is not allowed.
1668 RuntimeError
1669 Raised if a request is made for a single URI but multiple URIs
1670 are associated with this dataset.
1672 Notes
1673 -----
1674 When a predicted URI is requested an attempt will be made to form
1675 a reasonable URI based on file templates and the expected formatter.
1676 """
1677 primary, components = self.getURIs(ref, predict)
1678 if primary is None or components: 1678 ↛ 1679line 1678 didn't jump to line 1679, because the condition on line 1678 was never true
1679 raise RuntimeError(
1680 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1681 )
1682 return primary
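# Illustrative sketch (``datastore`` and ``ref`` assumed): getURI only works
# for single-artifact datasets, so fall back to getURIs for composites.
#
#     try:
#         uri = datastore.getURI(ref, predict=True)
#     except RuntimeError:
#         # Disassembled dataset -- ask for per-component URIs instead.
#         _, component_uris = datastore.getURIs(ref, predict=True)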
1684 def retrieveArtifacts(
1685 self,
1686 refs: Iterable[DatasetRef],
1687 destination: ResourcePath,
1688 transfer: str = "auto",
1689 preserve_path: bool = True,
1690 overwrite: bool = False,
1691 ) -> List[ResourcePath]:
1692 """Retrieve the file artifacts associated with the supplied refs.
1694 Parameters
1695 ----------
1696 refs : iterable of `DatasetRef`
1697 The datasets for which file artifacts are to be retrieved.
1698 A single ref can result in multiple files. The refs must
1699 be resolved.
1700 destination : `lsst.resources.ResourcePath`
1701 Location to write the file artifacts.
1702 transfer : `str`, optional
1703 Method to use to transfer the artifacts. Must be one of the options
1704 supported by `lsst.resources.ResourcePath.transfer_from()`.
1705 "move" is not allowed.
1706 preserve_path : `bool`, optional
1707 If `True` the full path of the file artifact within the datastore
1708 is preserved. If `False` the final file component of the path
1709 is used.
1710 overwrite : `bool`, optional
1711 If `True` allow transfers to overwrite existing files at the
1712 destination.
1714 Returns
1715 -------
1716 targets : `list` of `lsst.resources.ResourcePath`
1717 URIs of file artifacts in destination location. Order is not
1718 preserved.
1719 """
1720 if not destination.isdir(): 1720 ↛ 1721line 1720 didn't jump to line 1721, because the condition on line 1720 was never true
1721 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1723 if transfer == "move":
1724 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1726 # Source -> Destination
1727 # This also helps filter out duplicate DatasetRefs in the request
1728 # that will map to the same underlying file transfer.
1729 to_transfer: Dict[ResourcePath, ResourcePath] = {}
1731 for ref in refs:
1732 locations = self._get_dataset_locations_info(ref)
1733 for location, _ in locations:
1734 source_uri = location.uri
1735 target_path: ResourcePathExpression
1736 if preserve_path:
1737 target_path = location.pathInStore
1738 if target_path.isabs(): 1738 ↛ 1741line 1738 didn't jump to line 1741, because the condition on line 1738 was never true
1739 # This is an absolute path to an external file.
1740 # Use the full path.
1741 target_path = target_path.relativeToPathRoot
1742 else:
1743 target_path = source_uri.basename()
1744 target_uri = destination.join(target_path)
1745 to_transfer[source_uri] = target_uri
1747 # In theory can now parallelize the transfer
1748 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1749 for source_uri, target_uri in to_transfer.items():
1750 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1752 return list(to_transfer.values())
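# Illustrative sketch (``datastore`` and ``refs`` assumed; the destination
# path is a placeholder): copy the file artifacts for some refs into a local
# directory, keeping their in-store paths.
#
#     destination = ResourcePath("/tmp/artifact_export/", forceDirectory=True)
#     targets = datastore.retrieveArtifacts(
#         refs, destination, transfer="copy", preserve_path=True, overwrite=False
#     )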
1754 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1755 """Load an InMemoryDataset from the store.
1757 Parameters
1758 ----------
1759 ref : `DatasetRef`
1760 Reference to the required Dataset.
1761 parameters : `dict`
1762 `StorageClass`-specific parameters that specify, for example,
1763 a slice of the dataset to be loaded.
1765 Returns
1766 -------
1767 inMemoryDataset : `object`
1768 Requested dataset or slice thereof as an InMemoryDataset.
1770 Raises
1771 ------
1772 FileNotFoundError
1773 Requested dataset can not be retrieved.
1774 TypeError
1775 Return value from formatter has unexpected type.
1776 ValueError
1777 Formatter failed to process the dataset.
1778 """
1779 allGetInfo = self._prepare_for_get(ref, parameters)
1780 refComponent = ref.datasetType.component()
1782 # Supplied storage class for the component being read
1783 refStorageClass = ref.datasetType.storageClass
1785 # Create mapping from component name to related info
1786 allComponents = {i.component: i for i in allGetInfo}
1788 # By definition the dataset is disassembled if we have more
1789 # than one record for it.
1790 isDisassembled = len(allGetInfo) > 1
1792 # Look for the special case where we are disassembled but the
1793 # component is a derived component that was not written during
1794 # disassembly. For this scenario we need to check that the
1795 # component requested is listed as a derived component for the
1796 # composite storage class
1797 isDisassembledReadOnlyComponent = False
1798 if isDisassembled and refComponent:
1799 # The composite storage class should be accessible through
1800 # the component dataset type
1801 compositeStorageClass = ref.datasetType.parentStorageClass
1803 # In the unlikely scenario where the composite storage
1804 # class is not known, we can only assume that this is a
1805 # normal component. If that assumption is wrong then the
1806 # branch below that reads a persisted component will fail
1807 # so there is no need to complain here.
1808 if compositeStorageClass is not None: 1808 ↛ 1811line 1808 didn't jump to line 1811, because the condition on line 1808 was never false
1809 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1811 if isDisassembled and not refComponent:
1812 # This was a disassembled dataset spread over multiple files
1813 # and we need to put them all back together again.
1814 # Read into memory and then assemble
1816 # Check that the supplied parameters are suitable for the type read
1817 refStorageClass.validateParameters(parameters)
1819 # We want to keep track of all the parameters that were not used
1820 # by formatters. We assume that if any of the component formatters
1821 # use a parameter that we do not need to apply it again in the
1822 # assembler.
1823 usedParams = set()
1825 components: Dict[str, Any] = {}
1826 for getInfo in allGetInfo:
1827 # assemblerParams are parameters not understood by the
1828 # associated formatter.
1829 usedParams.update(set(getInfo.formatterParams))
1831 component = getInfo.component
1833 if component is None: 1833 ↛ 1834line 1833 didn't jump to line 1834, because the condition on line 1833 was never true
1834 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1836 # We do not want the formatter to think it's reading
1837 # a component though because it is really reading a
1838 # standalone dataset -- always tell reader it is not a
1839 # component.
1840 components[component] = self._read_artifact_into_memory(
1841 getInfo, ref.makeComponentRef(component), isComponent=False
1842 )
1844 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1846 # Any unused parameters will have to be passed to the assembler
1847 if parameters:
1848 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1849 else:
1850 unusedParams = {}
1852 # Process parameters
1853 return ref.datasetType.storageClass.delegate().handleParameters(
1854 inMemoryDataset, parameters=unusedParams
1855 )
1857 elif isDisassembledReadOnlyComponent:
1859 compositeStorageClass = ref.datasetType.parentStorageClass
1860 if compositeStorageClass is None: 1860 ↛ 1861line 1860 didn't jump to line 1861, because the condition on line 1860 was never true
1861 raise RuntimeError(
1862 f"Unable to retrieve derived component '{refComponent}' since"
1863 "no composite storage class is available."
1864 )
1866 if refComponent is None: 1866 ↛ 1868line 1866 didn't jump to line 1868, because the condition on line 1866 was never true
1867 # Mainly for mypy
1868 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1870 # Assume that every derived component can be calculated by
1871 # forwarding the request to a single read/write component.
1872 # Rather than guessing which rw component is the right one by
1873 # scanning each for a derived component of the same name,
1874 # we ask the storage class delegate directly which one is best to
1875 # use.
1876 compositeDelegate = compositeStorageClass.delegate()
1877 forwardedComponent = compositeDelegate.selectResponsibleComponent(
1878 refComponent, set(allComponents)
1879 )
1881 # Select the relevant component
1882 rwInfo = allComponents[forwardedComponent]
1884 # For now assume that read parameters are validated against
1885 # the real component and not the requested component
1886 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1887 forwardedStorageClass.validateParameters(parameters)
1889 # The reference to use for the caching must refer to the forwarded
1890 # component and not the derived component.
1891 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
1893 # Unfortunately the FileDescriptor inside the formatter will have
1894 # the wrong write storage class so we need to create a new one
1895 # given the immutability constraint.
1896 writeStorageClass = rwInfo.info.storageClass
1898 # We may need to put some thought into parameters for read
1899 # components but for now forward them on as is
1900 readFormatter = type(rwInfo.formatter)(
1901 FileDescriptor(
1902 rwInfo.location,
1903 readStorageClass=refStorageClass,
1904 storageClass=writeStorageClass,
1905 parameters=parameters,
1906 ),
1907 ref.dataId,
1908 )
1910 # The assembler can not receive any parameter requests for a
1911 # derived component at this time since the assembler will
1912 # see the storage class of the derived component and those
1913 # parameters will have to be handled by the formatter on the
1914 # forwarded storage class.
1915 assemblerParams: Dict[str, Any] = {}
1917 # Need to create a new info that specifies the derived
1918 # component and associated storage class
1919 readInfo = DatastoreFileGetInformation(
1920 rwInfo.location,
1921 readFormatter,
1922 rwInfo.info,
1923 assemblerParams,
1924 {},
1925 refComponent,
1926 refStorageClass,
1927 )
1929 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
1931 else:
1932 # Single file request or component from that composite file
1933 for lookup in (refComponent, None): 1933 ↛ 1938line 1933 didn't jump to line 1938, because the loop on line 1933 didn't complete
1934 if lookup in allComponents: 1934 ↛ 1933line 1934 didn't jump to line 1933, because the condition on line 1934 was never false
1935 getInfo = allComponents[lookup]
1936 break
1937 else:
1938 raise FileNotFoundError(
1939 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
1940 )
1942 # Do not need the component itself if already disassembled
1943 if isDisassembled:
1944 isComponent = False
1945 else:
1946 isComponent = getInfo.component is not None
1948 # For a component read of a composite we want the cache to
1949 # be looking at the composite ref itself.
1950 cache_ref = ref.makeCompositeRef() if isComponent else ref
1952 # For a disassembled component we can validate parameters against
1953 # the component storage class directly
1954 if isDisassembled:
1955 refStorageClass.validateParameters(parameters)
1956 else:
1957 # For an assembled composite this could be a derived
1958 # component derived from a real component. The validity
1959 # of the parameters is not clear. For now validate against
1960 # the composite storage class
1961 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
1963 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
1965 @transactional
1966 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
1967 """Write a InMemoryDataset with a given `DatasetRef` to the store.
1969 Parameters
1970 ----------
1971 inMemoryDataset : `object`
1972 The dataset to store.
1973 ref : `DatasetRef`
1974 Reference to the associated Dataset.
1976 Raises
1977 ------
1978 TypeError
1979 Supplied object and storage class are inconsistent.
1980 DatasetTypeNotSupportedError
1981 The associated `DatasetType` is not handled by this datastore.
1983 Notes
1984 -----
1985 If the datastore is configured to reject certain dataset types it
1986 is possible that the put will fail and raise a
1987 `DatasetTypeNotSupportedError`. The main use case for this is to
1988 allow `ChainedDatastore` to put to multiple datastores without
1989 requiring that every datastore accepts the dataset.
1990 """
1992 doDisassembly = self.composites.shouldBeDisassembled(ref)
1993 # doDisassembly = True
1995 artifacts = []
1996 if doDisassembly:
1997 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
1998 for component, componentInfo in components.items():
1999 # Don't recurse because we want to take advantage of
2000 # bulk insert -- we need a new DatasetRef that refers to the
2001 # same dataset_id but has the component DatasetType.
2002 # DatasetType does not describe the types of its components,
2003 # so we construct the component ref ourselves.
2004 compRef = ref.makeComponentRef(component)
2005 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2006 artifacts.append((compRef, storedInfo))
2007 else:
2008 # Write the entire thing out
2009 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2010 artifacts.append((ref, storedInfo))
2012 self._register_datasets(artifacts)
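# Illustrative sketch (``datastore``, ``ref`` and the in-memory ``exposure``
# object are assumed): a datastore configured to reject this dataset type
# raises DatasetTypeNotSupportedError, which a chained caller may tolerate.
#
#     try:
#         datastore.put(exposure, ref)
#     except DatasetTypeNotSupportedError:
#         log.debug("Datastore %s rejected put of %s", datastore.name, ref)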
2014 @transactional
2015 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
2016 # At this point can safely remove these datasets from the cache
2017 # to avoid confusion later on. If they are not trashed later
2018 # the cache will simply be refilled.
2019 self.cacheManager.remove_from_cache(ref)
2021 # If we are in trust mode there will be nothing to move to
2022 # the trash table and we will have to try to delete the file
2023 # immediately.
2024 if self.trustGetRequest:
2025 # Try to keep the logic below for a single file trash.
2026 if isinstance(ref, DatasetRef):
2027 refs = {ref}
2028 else:
2029 # Will recreate ref at the end of this branch.
2030 refs = set(ref)
2032 # Determine which datasets are known to datastore directly.
2033 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
2034 existing_ids = self._get_stored_records_associated_with_refs(refs)
2035 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2037 missing = refs - existing_refs
2038 if missing:
2039 # Do an explicit existence check on these refs.
2040 # We only care about the artifacts at this point and not
2041 # the dataset existence.
2042 artifact_existence: Dict[ResourcePath, bool] = {}
2043 _ = self.mexists(missing, artifact_existence)
2044 uris = [uri for uri, exists in artifact_existence.items() if exists]
2046 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2047 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris))
2048 for uri in uris:
2049 try:
2050 uri.remove()
2051 except Exception as e:
2052 if ignore_errors:
2053 log.debug("Artifact %s could not be removed: %s", uri, e)
2054 continue
2055 raise
2057 # There is no point asking the code below to remove refs we
2058 # know are missing so update it with the list of existing
2059 # records. Try to retain one vs many logic.
2060 if not existing_refs:
2061 # Nothing more to do since none of the datasets were
2062 # known to the datastore record table.
2063 return
2064 ref = list(existing_refs)
2065 if len(ref) == 1:
2066 ref = ref[0]
2068 # Get file metadata and internal metadata
2069 if not isinstance(ref, DatasetRef):
2070 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2071 # Assumed to be an iterable of refs so bulk mode enabled.
2072 try:
2073 self.bridge.moveToTrash(ref)
2074 except Exception as e:
2075 if ignore_errors:
2076 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2077 else:
2078 raise
2079 return
2081 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2083 fileLocations = self._get_dataset_locations_info(ref)
2085 if not fileLocations:
2086 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2087 if ignore_errors:
2088 log.warning(err_msg)
2089 return
2090 else:
2091 raise FileNotFoundError(err_msg)
2093 for location, storedFileInfo in fileLocations:
2094 if not self._artifact_exists(location): 2094 ↛ 2095line 2094 didn't jump to line 2095
2095 err_msg = (
2096 f"Dataset is known to datastore {self.name} but "
2097 f"associated artifact ({location.uri}) is missing"
2098 )
2099 if ignore_errors:
2100 log.warning(err_msg)
2101 return
2102 else:
2103 raise FileNotFoundError(err_msg)
2105 # Mark dataset as trashed
2106 try:
2107 self.bridge.moveToTrash([ref])
2108 except Exception as e:
2109 if ignore_errors:
2110 log.warning(
2111 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2112 "but encountered an error: %s",
2113 ref,
2114 self.name,
2115 e,
2116 )
2117 pass
2118 else:
2119 raise
2121 @transactional
2122 def emptyTrash(self, ignore_errors: bool = True) -> None:
2123 """Remove all datasets from the trash.
2125 Parameters
2126 ----------
2127 ignore_errors : `bool`
2128 If `True` return without error even if something went wrong.
2129 Problems could occur if another process is simultaneously trying
2130 to delete.
2131 """
2132 log.debug("Emptying trash in datastore %s", self.name)
2134 # Context manager will empty trash iff we finish it without raising.
2135 # It will also automatically delete the relevant rows from the
2136 # trash table and the records table.
2137 with self.bridge.emptyTrash(
2138 self._table, record_class=StoredFileInfo, record_column="path"
2139 ) as trash_data:
2140 # Removing the artifacts themselves requires that the files are
2141 # not also associated with refs that are not to be trashed.
2142 # Therefore need to do a query with the file paths themselves
2143 # and return all the refs associated with them. Can only delete
2144 # a file if the refs to be trashed are the only refs associated
2145 # with the file.
2146 # This requires multiple copies of the trashed items
2147 trashed, artifacts_to_keep = trash_data
2149 if artifacts_to_keep is None:
2150 # The bridge is not helping us so have to work it out
2151 # ourselves. This is not going to be as efficient.
2152 trashed = list(trashed)
2154 # The instance check is for mypy since up to this point it
2155 # does not know the type of info.
2156 path_map = self._refs_associated_with_artifacts(
2157 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2158 )
2160 for ref, info in trashed:
2162 # Mypy needs to know this is not the base class
2163 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2165 # Check for mypy
2166 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2168 path_map[info.path].remove(ref.id)
2169 if not path_map[info.path]: 2169 ↛ 2160line 2169 didn't jump to line 2160, because the condition on line 2169 was never false
2170 del path_map[info.path]
2172 artifacts_to_keep = set(path_map)
2174 for ref, info in trashed:
2176 # Should not happen for this implementation but need
2177 # to keep mypy happy.
2178 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2180 # Mypy needs to know this is not the base class
2181 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2183 # Check for mypy
2184 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2186 if info.path in artifacts_to_keep:
2187 # This is a multi-dataset artifact and we are not
2188 # removing all associated refs.
2189 continue
2191 # Only trashed refs still known to datastore will be returned.
2192 location = info.file_location(self.locationFactory)
2194 # Point of no return for this artifact
2195 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2196 try:
2197 self._delete_artifact(location)
2198 except FileNotFoundError:
2199 # If the file itself has been deleted there is nothing
2200 # we can do about it. It is possible that trash has
2201 # been run in parallel in another process or someone
2202 # decided to delete the file. It is unlikely to come
2203 # back and so we should still continue with the removal
2204 # of the entry from the trash table. It is also possible
2205 # we removed it in a previous iteration if it was
2206 # a multi-dataset artifact. The delete artifact method
2207 # will log a debug message in this scenario.
2208 # Distinguishing a file that was missing before trash started
2209 # from a file already removed earlier in this trash operation
2210 # is not worth the extra bookkeeping and its potential
2211 # memory cost.
2212 pass
2213 except Exception as e:
2214 if ignore_errors:
2215 # Use a debug message here even though it's not
2216 # a good situation. In some cases this can be
2217 # caused by a race between user A and user B
2218 # and neither of them has permissions for the
2219 # other's files. Butler does not know about users
2220 # and trash has no idea what collections these
2221 # files were in (without guessing from a path).
2222 log.debug(
2223 "Encountered error removing artifact %s from datastore %s: %s",
2224 location.uri,
2225 self.name,
2226 e,
2227 )
2228 else:
2229 raise
2231 @transactional
2232 def transfer_from(
2233 self,
2234 source_datastore: Datastore,
2235 refs: Iterable[DatasetRef],
2236 local_refs: Optional[Iterable[DatasetRef]] = None,
2237 transfer: str = "auto",
2238 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
2239 ) -> None:
2240 # Docstring inherited
2241 if type(self) is not type(source_datastore):
2242 raise TypeError(
2243 f"Datastore mismatch between this datastore ({type(self)}) and the "
2244 f"source datastore ({type(source_datastore)})."
2245 )
2247 # Be explicit for mypy
2248 if not isinstance(source_datastore, FileDatastore): 2248 ↛ 2249line 2248 didn't jump to line 2249, because the condition on line 2248 was never true
2249 raise TypeError(
2250 "Can only transfer to a FileDatastore from another FileDatastore, not"
2251 f" {type(source_datastore)}"
2252 )
2254 # Stop early if "direct" transfer mode is requested. That would
2255 # require that the URI inside the source datastore should be stored
2256 # directly in the target datastore, which seems unlikely to be useful
2257 # since at any moment the source datastore could delete the file.
2258 if transfer in ("direct", "split"):
2259 raise ValueError(
2260 f"Can not transfer from a source datastore using {transfer} mode since"
2261 " those files are controlled by the other datastore."
2262 )
2264 # Empty existence lookup if none given.
2265 if artifact_existence is None:
2266 artifact_existence = {}
2268 # We will go through the list multiple times so must convert
2269 # generators to lists.
2270 refs = list(refs)
2272 if local_refs is None:
2273 local_refs = refs
2274 else:
2275 local_refs = list(local_refs)
2277 # In order to handle disassembled composites the code works
2278 # at the records level since it can assume that internal APIs
2279 # can be used.
2280 # - If the record already exists in the destination this is assumed
2281 # to be okay.
2282 # - If there is no record but the source and destination URIs are
2283 # identical no transfer is done but the record is added.
2284 # - If the source record refers to an absolute URI currently assume
2285 # that that URI should remain absolute and will be visible to the
2286 # destination butler. May need to have a flag to indicate whether
2287 # the dataset should be transferred. This will only happen if
2288 # the detached Butler has had a local ingest.
2290 # What we really want is all the records in the source datastore
2291 # associated with these refs. Or derived ones if they don't exist
2292 # in the source.
2293 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2295 # The source dataset_ids are the keys in these records
2296 source_ids = set(source_records)
2297 log.debug("Number of datastore records found in source: %d", len(source_ids))
2299 # The not None check is to appease mypy
2300 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2301 missing_ids = requested_ids - source_ids
2303 # Missing IDs can be okay if that datastore has allowed
2304 # gets based on file existence. Should we transfer what we can
2305 # or complain about it and warn?
2306 if missing_ids and not source_datastore.trustGetRequest: 2306 ↛ 2307line 2306 didn't jump to line 2307, because the condition on line 2306 was never true
2307 raise ValueError(
2308 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2309 )
2311 # Need to map these missing IDs to a DatasetRef so we can guess
2312 # the details.
2313 if missing_ids:
2314 log.info(
2315 "Number of expected datasets missing from source datastore records: %d out of %d",
2316 len(missing_ids),
2317 len(requested_ids),
2318 )
2319 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2321 # This should be chunked in case we end up having to check
2322 # the file store since we need some log output to show
2323 # progress.
2324 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2325 records = {}
2326 for missing in missing_ids_chunk:
2327 # Ask the source datastore where the missing artifacts
2328 # should be. An execution butler might not know about the
2329 # artifacts even if they are there.
2330 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2331 records[missing] = [info for _, info in expected]
2333 # Call the mexists helper method in case we have not already
2334 # checked these artifacts such that artifact_existence is
2335 # empty. This allows us to benefit from parallelism.
2336 # datastore.mexists() itself does not give us access to the
2337 # derived datastore record.
2338 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2339 ref_exists = source_datastore._process_mexists_records(
2340 id_to_ref, records, False, artifact_existence=artifact_existence
2341 )
2343 # Now go through the records and propagate the ones that exist.
2344 location_factory = source_datastore.locationFactory
2345 for missing, record_list in records.items():
2346 # Skip completely if the ref does not exist.
2347 ref = id_to_ref[missing]
2348 if not ref_exists[ref]:
2349 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2350 continue
2351 # Check for file artifact to decide which parts of a
2352 # disassembled composite do exist. If there is only a
2353 # single record we don't even need to look because it can't
2354 # be a composite and must exist.
2355 if len(record_list) == 1:
2356 dataset_records = record_list
2357 else:
2358 dataset_records = [
2359 record
2360 for record in record_list
2361 if artifact_existence[record.file_location(location_factory).uri]
2362 ]
2363 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2365 # Rely on source_records being a defaultdict.
2366 source_records[missing].extend(dataset_records)
2368 # See if we already have these records
2369 target_records = self._get_stored_records_associated_with_refs(local_refs)
2371 # The artifacts to register
2372 artifacts = []
2374 # Refs that already exist
2375 already_present = []
2377 # Now can transfer the artifacts
2378 for source_ref, target_ref in zip(refs, local_refs):
2379 if target_ref.id in target_records:
2380 # Already have an artifact for this.
2381 already_present.append(target_ref)
2382 continue
2384 # mypy needs to know these are always resolved refs
2385 for info in source_records[source_ref.getCheckedId()]:
2386 source_location = info.file_location(source_datastore.locationFactory)
2387 target_location = info.file_location(self.locationFactory)
2388 if source_location == target_location: 2388 ↛ 2392line 2388 didn't jump to line 2392, because the condition on line 2388 was never true
2389 # Either the dataset is already in the target datastore
2390 # (which is how execution butler currently runs) or
2391 # it is an absolute URI.
2392 if source_location.pathInStore.isabs():
2393 # Just because we can see the artifact when running
2394 # the transfer doesn't mean it will be generally
2395 # accessible to a user of this butler. For now warn
2396 # but assume it will be accessible.
2397 log.warning(
2398 "Transfer request for an outside-datastore artifact has been found at %s",
2399 source_location,
2400 )
2401 else:
2402 # Need to transfer it to the new location.
2403 # Assume we should always overwrite. If the artifact
2404 # is there this might indicate that a previous transfer
2405 # was interrupted but was not able to be rolled back
2406 # completely (e.g. pre-emption) so follow the Datastore default
2407 # and overwrite.
2408 target_location.uri.transfer_from(
2409 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2410 )
2412 artifacts.append((target_ref, info))
2414 self._register_datasets(artifacts)
2416 if already_present:
2417 n_skipped = len(already_present)
2418 log.info(
2419 "Skipped transfer of %d dataset%s already present in datastore",
2420 n_skipped,
2421 "" if n_skipped == 1 else "s",
2422 )
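# Illustrative sketch (``target_datastore``, ``source_datastore`` and ``refs``
# assumed): transfer datasets between two FileDatastore instances, reusing an
# existence cache that may already have been filled by mexists().
#
#     artifact_existence: Dict[ResourcePath, bool] = {}
#     target_datastore.transfer_from(
#         source_datastore, refs, transfer="copy", artifact_existence=artifact_existence
#     )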
2424 @transactional
2425 def forget(self, refs: Iterable[DatasetRef]) -> None:
2426 # Docstring inherited.
2427 refs = list(refs)
2428 self.bridge.forget(refs)
2429 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2431 def validateConfiguration(
2432 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
2433 ) -> None:
2434 """Validate some of the configuration for this datastore.
2436 Parameters
2437 ----------
2438 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2439 Entities to test against this configuration. Can be differing
2440 types.
2441 logFailures : `bool`, optional
2442 If `True`, output a log message for every validation error
2443 detected.
2445 Raises
2446 ------
2447 DatastoreValidationError
2448 Raised if there is a validation problem with a configuration.
2449 All the problems are reported in a single exception.
2451 Notes
2452 -----
2453 This method checks that all the supplied entities have valid file
2454 templates and also have formatters defined.
2455 """
2457 templateFailed = None
2458 try:
2459 self.templates.validateTemplates(entities, logFailures=logFailures)
2460 except FileTemplateValidationError as e:
2461 templateFailed = str(e)
2463 formatterFailed = []
2464 for entity in entities:
2465 try:
2466 self.formatterFactory.getFormatterClass(entity)
2467 except KeyError as e:
2468 formatterFailed.append(str(e))
2469 if logFailures: 2469 ↛ 2464line 2469 didn't jump to line 2464, because the condition on line 2469 was never false
2470 log.critical("Formatter failure: %s", e)
2472 if templateFailed or formatterFailed:
2473 messages = []
2474 if templateFailed: 2474 ↛ 2475line 2474 didn't jump to line 2475, because the condition on line 2474 was never true
2475 messages.append(templateFailed)
2476 if formatterFailed: 2476 ↛ 2478line 2476 didn't jump to line 2478, because the condition on line 2476 was never false
2477 messages.append(",".join(formatterFailed))
2478 msg = ";\n".join(messages)
2479 raise DatastoreValidationError(msg)
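# Illustrative sketch (``datastore`` and ``dataset_types`` assumed): check
# templates and formatters up front rather than failing on the first put.
#
#     try:
#         datastore.validateConfiguration(dataset_types, logFailures=True)
#     except DatastoreValidationError as e:
#         log.critical("Datastore %s is misconfigured: %s", datastore.name, e)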
2481 def getLookupKeys(self) -> Set[LookupKey]:
2482 # Docstring is inherited from base class
2483 return (
2484 self.templates.getLookupKeys()
2485 | self.formatterFactory.getLookupKeys()
2486 | self.constraints.getLookupKeys()
2487 )
2489 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2490 # Docstring is inherited from base class
2491 # The key can be valid in either formatters or templates so we can
2492 # only check the template if it exists
2493 if lookupKey in self.templates:
2494 try:
2495 self.templates[lookupKey].validateTemplate(entity)
2496 except FileTemplateValidationError as e:
2497 raise DatastoreValidationError(e) from e
2499 def export(
2500 self,
2501 refs: Iterable[DatasetRef],
2502 *,
2503 directory: Optional[ResourcePathExpression] = None,
2504 transfer: Optional[str] = "auto",
2505 ) -> Iterable[FileDataset]:
2506 # Docstring inherited from Datastore.export.
2507 if transfer is not None and directory is None: 2507 ↛ 2508line 2507 didn't jump to line 2508, because the condition on line 2507 was never true
2508 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2510 # Force the directory to be a URI object
2511 directoryUri: Optional[ResourcePath] = None
2512 if directory is not None: 2512 ↛ 2515line 2512 didn't jump to line 2515, because the condition on line 2512 was never false
2513 directoryUri = ResourcePath(directory, forceDirectory=True)
2515 if transfer is not None and directoryUri is not None: 2515 ↛ 2520line 2515 didn't jump to line 2520, because the condition on line 2515 was never false
2516 # mypy needs the second test
2517 if not directoryUri.exists(): 2517 ↛ 2518line 2517 didn't jump to line 2518, because the condition on line 2517 was never true
2518 raise FileNotFoundError(f"Export location {directory} does not exist")
2520 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2521 for ref in progress.wrap(refs, "Exporting dataset files"):
2522 fileLocations = self._get_dataset_locations_info(ref)
2523 if not fileLocations: 2523 ↛ 2524line 2523 didn't jump to line 2524, because the condition on line 2523 was never true
2524 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2525 # For now we can not export disassembled datasets
2526 if len(fileLocations) > 1:
2527 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2528 location, storedFileInfo = fileLocations[0]
2530 pathInStore = location.pathInStore.path
2531 if transfer is None: 2531 ↛ 2535line 2531 didn't jump to line 2535, because the condition on line 2531 was never true
2532 # TODO: do we also need to return the readStorageClass somehow?
2533 # We will use the path in store directly. If this is an
2534 # absolute URI, preserve it.
2535 if location.pathInStore.isabs():
2536 pathInStore = str(location.uri)
2537 elif transfer == "direct": 2537 ↛ 2539line 2537 didn't jump to line 2539, because the condition on line 2537 was never true
2538 # Use full URIs to the remote store in the export
2539 pathInStore = str(location.uri)
2540 else:
2541 # mypy needs help
2542 assert directoryUri is not None, "directoryUri must be defined to get here"
2543 storeUri = ResourcePath(location.uri)
2545 # if the datastore has an absolute URI to a resource, we
2546 # have two options:
2547 # 1. Keep the absolute URI in the exported YAML
2548 # 2. Allocate a new name in the local datastore and transfer
2549 # it.
2550 # For now go with option 2
2551 if location.pathInStore.isabs(): 2551 ↛ 2552line 2551 didn't jump to line 2552, because the condition on line 2551 was never true
2552 template = self.templates.getTemplate(ref)
2553 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2554 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2556 exportUri = directoryUri.join(pathInStore)
2557 exportUri.transfer_from(storeUri, transfer=transfer)
2559 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
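# Illustrative sketch (``datastore`` and ``refs`` assumed; the export path is
# a placeholder): export file artifacts alongside a registry export. The
# directory must already exist when a transfer mode is requested.
#
#     export_dir = ResourcePath("/tmp/butler_export/", forceDirectory=True)
#     file_datasets = list(datastore.export(refs, directory=export_dir, transfer="copy"))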
2561 @staticmethod
2562 def computeChecksum(
2563 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192
2564 ) -> Optional[str]:
2565 """Compute the checksum of the supplied file.
2567 Parameters
2568 ----------
2569 uri : `lsst.resources.ResourcePath`
2570 Name of resource to calculate checksum from.
2571 algorithm : `str`, optional
2572 Name of algorithm to use. Must be one of the algorithms supported
2573 by :py:mod:`hashlib`.
2574 block_size : `int`
2575 Number of bytes to read from file at one time.
2577 Returns
2578 -------
2579 hexdigest : `str`
2580 Hex digest of the file.
2582 Notes
2583 -----
2584 Currently returns `None` if the URI is for a remote resource.
2585 """
2586 if algorithm not in hashlib.algorithms_guaranteed: 2586 ↛ 2587line 2586 didn't jump to line 2587, because the condition on line 2586 was never true
2587 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
2589 if not uri.isLocal: 2589 ↛ 2590line 2589 didn't jump to line 2590, because the condition on line 2589 was never true
2590 return None
2592 hasher = hashlib.new(algorithm)
2594 with uri.as_local() as local_uri:
2595 with open(local_uri.ospath, "rb") as f:
2596 for chunk in iter(lambda: f.read(block_size), b""):
2597 hasher.update(chunk)
2599 return hasher.hexdigest()
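# Illustrative sketch (``artifact_uri`` is an assumed ResourcePath): checksum
# a file artifact; remote URIs currently return None, so guard for that.
#
#     digest = FileDatastore.computeChecksum(artifact_uri, algorithm="sha256")
#     if digest is None:
#         log.debug("Checksum not computed: %s is not a local resource", artifact_uri)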
2601 def needs_expanded_data_ids(
2602 self,
2603 transfer: Optional[str],
2604 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2605 ) -> bool:
2606 # Docstring inherited.
2607 # This _could_ also use entity to inspect whether the filename template
2608 # involves placeholders other than the required dimensions for its
2609 # dataset type, but that's not necessary for correctness; it just
2610 # enables more optimizations (perhaps only in theory).
2611 return transfer not in ("direct", None)