Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 84%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Generic file-based datastore code."""
25__all__ = ("FileDatastore",)
27import hashlib
28import logging
29from collections import defaultdict
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 Any,
34 ClassVar,
35 Dict,
36 Iterable,
37 List,
38 Mapping,
39 Optional,
40 Set,
41 Tuple,
42 Type,
43 Union,
44)
46from lsst.daf.butler import (
47 CompositesMap,
48 Config,
49 DatasetId,
50 DatasetRef,
51 DatasetType,
52 DatasetTypeNotSupportedError,
53 Datastore,
54 DatastoreCacheManager,
55 DatastoreConfig,
56 DatastoreDisabledCacheManager,
57 DatastoreValidationError,
58 FileDataset,
59 FileDescriptor,
60 FileTemplates,
61 FileTemplateValidationError,
62 Formatter,
63 FormatterFactory,
64 Location,
65 LocationFactory,
66 Progress,
67 StorageClass,
68 StoredFileInfo,
69 ddl,
70)
71from lsst.daf.butler.core.repoRelocation import replaceRoot
72from lsst.daf.butler.core.utils import transactional
73from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError
74from lsst.resources import ResourcePath, ResourcePathExpression
75from lsst.utils.introspection import get_class_of, get_instance_of
76from lsst.utils.iteration import chunk_iterable
78# For VERBOSE logging usage.
79from lsst.utils.logging import VERBOSE, getLogger
80from lsst.utils.timer import time_this
81from sqlalchemy import BigInteger, String
83from .genericDatastore import GenericBaseDatastore
85if TYPE_CHECKING:
86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey
87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
89log = getLogger(__name__)
92class _IngestPrepData(Datastore.IngestPrepData):
93 """Helper class for FileDatastore ingest implementation.
95 Parameters
96 ----------
97 datasets : `list` of `FileDataset`
98 Files to be ingested by this datastore.
99 """
101 def __init__(self, datasets: List[FileDataset]):
102 super().__init__(ref for dataset in datasets for ref in dataset.refs)
103 self.datasets = datasets
106@dataclass(frozen=True)
107class DatastoreFileGetInformation:
108 """Collection of useful parameters needed to retrieve a file from
109 a Datastore.
110 """
112 location: Location
113 """The location from which to read the dataset."""
115 formatter: Formatter
116 """The `Formatter` to use to deserialize the dataset."""
118 info: StoredFileInfo
119 """Stored information about this file and its formatter."""
121 assemblerParams: Dict[str, Any]
122 """Parameters to use for post-processing the retrieved dataset."""
124 formatterParams: Dict[str, Any]
125 """Parameters that were understood by the associated formatter."""
127 component: Optional[str]
128 """The component to be retrieved (can be `None`)."""
130 readStorageClass: StorageClass
131 """The `StorageClass` of the dataset being read."""
134class FileDatastore(GenericBaseDatastore):
135 """Generic Datastore for file-based implementations.
137 Should always be sub-classed since key abstract methods are missing.
139 Parameters
140 ----------
141 config : `DatastoreConfig` or `str`
142 Configuration as either a `Config` object or URI to file.
143 bridgeManager : `DatastoreRegistryBridgeManager`
144 Object that manages the interface between `Registry` and datastores.
145 butlerRoot : `str`, optional
146 New datastore root to use to override the configuration value.
148 Raises
149 ------
150 ValueError
151 If root location does not exist and ``create`` is `False` in the
152 configuration.
153 """
155 defaultConfigFile: ClassVar[Optional[str]] = None
156 """Path to configuration defaults. Accessed within the ``config`` resource
157 or relative to a search path. Can be None if no defaults specified.
158 """
160 root: ResourcePath
161 """Root directory URI of this `Datastore`."""
163 locationFactory: LocationFactory
164 """Factory for creating locations relative to the datastore root."""
166 formatterFactory: FormatterFactory
167 """Factory for creating instances of formatters."""
169 templates: FileTemplates
170 """File templates that can be used by this `Datastore`."""
172 composites: CompositesMap
173 """Determines whether a dataset should be disassembled on put."""
175 defaultConfigFile = "datastores/fileDatastore.yaml"
176 """Path to configuration defaults. Accessed within the ``config`` resource
177 or relative to a search path. Can be None if no defaults specified.
178 """
180 @classmethod
181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
182 """Set any filesystem-dependent config options for this Datastore to
183 be appropriate for a new empty repository with the given root.
185 Parameters
186 ----------
187 root : `str`
188 URI to the root of the data repository.
189 config : `Config`
190 A `Config` to update. Only the subset understood by
191 this component will be updated. Will not expand
192 defaults.
193 full : `Config`
194 A complete config with all defaults expanded that can be
195 converted to a `DatastoreConfig`. Read-only and will not be
196 modified by this method.
197 Repository-specific options that should not be obtained
198 from defaults when Butler instances are constructed
199 should be copied from ``full`` to ``config``.
200 overwrite : `bool`, optional
201 If `False`, do not modify a value in ``config`` if the value
202 already exists. Default is always to overwrite with the provided
203 ``root``.
205 Notes
206 -----
207 If a keyword is explicitly defined in the supplied ``config`` it
208 will not be overridden by this method if ``overwrite`` is `False`.
209 This allows explicit values set in external configs to be retained.
210 """
211 Config.updateParameters(
212 DatastoreConfig,
213 config,
214 full,
215 toUpdate={"root": root},
216 toCopy=("cls", ("records", "table")),
217 overwrite=overwrite,
218 )
220 @classmethod
221 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec:
222 return ddl.TableSpec(
223 fields=[
224 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True),
225 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
226 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
227 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
228 # Use empty string to indicate no component
229 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
230 # TODO: should checksum be Base64Bytes instead?
231 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
232 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True),
233 ],
234 unique=frozenset(),
235 indexes=[tuple(["path"])],
236 )
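    # Illustrative sketch (not part of the source): the registry's opaque-table
    # manager normally calls ``makeTableSpec`` from ``__init__`` below, but the
    # spec can be built directly for inspection. ``BigInteger`` stands in for
    # whatever dataset-ID column type the registry actually uses.
    #
    #     spec = FileDatastore.makeTableSpec(datasetIdColumnType=BigInteger)
    #     # One row per (dataset_id, component); component is stored as the
    #     # empty string when the dataset was not disassembled.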
238 def __init__(
239 self,
240 config: Union[DatastoreConfig, str],
241 bridgeManager: DatastoreRegistryBridgeManager,
242 butlerRoot: Optional[str] = None,
243 ):
244 super().__init__(config, bridgeManager)
245 if "root" not in self.config: 245 ↛ 246line 245 didn't jump to line 246, because the condition on line 245 was never true
246 raise ValueError("No root directory specified in configuration")
248 # Name ourselves either using an explicit name or a name
249 # derived from the (unexpanded) root
250 if "name" in self.config:
251 self.name = self.config["name"]
252 else:
253 # We use the unexpanded root in the name to indicate that this
254 # datastore can be moved without having to update registry.
255 self.name = "{}@{}".format(type(self).__name__, self.config["root"])
257 # Support repository relocation in config
258 # Existence of self.root is checked in subclass
259 self.root = ResourcePath(
260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True
261 )
263 self.locationFactory = LocationFactory(self.root)
264 self.formatterFactory = FormatterFactory()
266 # Now associate formatters with storage classes
267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe)
269 # Read the file naming templates
270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe)
272 # See if composites should be disassembled
273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe)
275 tableName = self.config["records", "table"]
276 try:
277 # Storage of paths and formatters, keyed by dataset_id
278 self._table = bridgeManager.opaque.register(
279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)
280 )
281 # Interface to Registry.
282 self._bridge = bridgeManager.register(self.name)
283 except ReadOnlyDatabaseError:
284 # If the database is read only and we just tried and failed to
285 # create a table, it means someone is trying to create a read-only
286 # butler client for an empty repo. That should be okay, as long
287 # as they then try to get any datasets before some other client
288 # creates the table. Chances are they're just validating
289 # configuration.
290 pass
292 # Determine whether checksums should be used - default to False
293 self.useChecksum = self.config.get("checksum", False)
295 # Determine whether we can fall back to configuration if a
296 # requested dataset is not known to registry
297 self.trustGetRequest = self.config.get("trust_get_request", False)
299 # Create a cache manager
300 self.cacheManager: AbstractDatastoreCacheManager
301 if "cached" in self.config: 301 ↛ 304line 301 didn't jump to line 304, because the condition on line 301 was never false
302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe)
303 else:
304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe)
306 # Check existence and create directory structure if necessary
307 if not self.root.exists():
308 if "create" not in self.config or not self.config["create"]: 308 ↛ 309line 308 didn't jump to line 309, because the condition on line 308 was never true
309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}")
310 try:
311 self.root.mkdir()
312 except Exception as e:
313 raise ValueError(
314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}"
315 ) from e
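    # Summary of the configuration keys consumed by ``__init__`` above:
    # ``root`` (required), ``name`` (optional, defaults to class@root),
    # ``formatters``, ``templates``, ``composites``, ``records.table``,
    # ``checksum`` (default `False`), ``trust_get_request`` (default `False`),
    # ``cached`` (optional cache-manager configuration) and ``create``
    # (whether a missing root directory may be created).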
317 def __str__(self) -> str:
318 return str(self.root)
320 @property
321 def bridge(self) -> DatastoreRegistryBridge:
322 return self._bridge
324 def _artifact_exists(self, location: Location) -> bool:
325 """Check that an artifact exists in this datastore at the specified
326 location.
328 Parameters
329 ----------
330 location : `Location`
331 Expected location of the artifact associated with this datastore.
333 Returns
334 -------
335 exists : `bool`
336 `True` if the location can be found, `False` otherwise.
337 """
338 log.debug("Checking if resource exists: %s", location.uri)
339 return location.uri.exists()
341 def _delete_artifact(self, location: Location) -> None:
342 """Delete the artifact from the datastore.
344 Parameters
345 ----------
346 location : `Location`
347 Location of the artifact associated with this datastore.
348 """
349 if location.pathInStore.isabs():
350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.")
352 try:
353 location.uri.remove()
354 except FileNotFoundError:
355 log.debug("File %s did not exist and so could not be deleted.", location.uri)
356 raise
357 except Exception as e:
358 log.critical("Failed to delete file: %s (%s)", location.uri, e)
359 raise
360 log.debug("Successfully deleted file: %s", location.uri)
362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None:
363 # Docstring inherited from GenericBaseDatastore
364 records = [info.to_record(ref) for ref, info in zip(refs, infos)]
365 self._table.insert(*records)
367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]:
368 # Docstring inherited from GenericBaseDatastore
370 # Look for the dataset_id -- there might be multiple matches
371 # if we have disassembled the dataset.
372 records = self._table.fetch(dataset_id=ref.id)
373 return [StoredFileInfo.from_record(record) for record in records]
375 def _get_stored_records_associated_with_refs(
376 self, refs: Iterable[DatasetIdRef]
377 ) -> Dict[DatasetId, List[StoredFileInfo]]:
378 """Retrieve all records associated with the provided refs.
380 Parameters
381 ----------
382 refs : iterable of `DatasetIdRef`
383 The refs for which records are to be retrieved.
385 Returns
386 -------
387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
388 The matching records indexed by the ref ID. The number of entries
389 in the dict can be smaller than the number of requested refs.
390 """
391 records = self._table.fetch(dataset_id=[ref.id for ref in refs])
393 # Uniqueness is dataset_id + component so can have multiple records
394 # per ref.
395 records_by_ref = defaultdict(list)
396 for record in records:
397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record))
398 return records_by_ref
400 def _refs_associated_with_artifacts(
401 self, paths: List[Union[str, ResourcePath]]
402 ) -> Dict[str, Set[DatasetId]]:
403 """Return paths and associated dataset refs.
405 Parameters
406 ----------
407 paths : `list` of `str` or `lsst.resources.ResourcePath`
408 All the paths to include in search.
410 Returns
411 -------
412 mapping : `dict` of [`str`, `set` [`DatasetId`]]
413 Mapping of each path to a set of associated database IDs.
414 """
415 records = self._table.fetch(path=[str(path) for path in paths])
416 result = defaultdict(set)
417 for row in records:
418 result[row["path"]].add(row["dataset_id"])
419 return result
421 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]:
422 """Return all dataset refs associated with the supplied path.
424 Parameters
425 ----------
426 pathInStore : `lsst.resources.ResourcePath`
427 Path of interest in the data store.
429 Returns
430 -------
431 ids : `set` of `int`
432 All `DatasetRef` IDs associated with this path.
433 """
434 records = list(self._table.fetch(path=str(pathInStore)))
435 ids = {r["dataset_id"] for r in records}
436 return ids
438 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
439 # Docstring inherited from GenericBaseDatastore
440 self._table.delete(["dataset_id"], {"dataset_id": ref.id})
442 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]:
443 r"""Find all the `Location`\ s of the requested dataset in the
444 `Datastore` and the associated stored file information.
446 Parameters
447 ----------
448 ref : `DatasetRef`
449 Reference to the required `Dataset`.
451 Returns
452 -------
453 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
454 Location of the dataset within the datastore and
455 stored information about each file and its formatter.
456 """
457 # Get the file information (this will fail if no file)
458 records = self.getStoredItemsInfo(ref)
460 # Use the path to determine the location -- we need to take
461 # into account absolute URIs in the datastore record
462 return [(r.file_location(self.locationFactory), r) for r in records]
464 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool:
465 """Check that there is only one dataset associated with the
466 specified artifact.
468 Parameters
469 ----------
470 ref : `DatasetRef` or `FakeDatasetRef`
471 Dataset to be removed.
472 location : `Location`
473 The location of the artifact to be removed.
475 Returns
476 -------
477 can_remove : `bool`
478 True if the artifact can be safely removed.
479 """
480 # Can't ever delete absolute URIs.
481 if location.pathInStore.isabs():
482 return False
484 # Get all entries associated with this path
485 allRefs = self._registered_refs_per_artifact(location.pathInStore)
486 if not allRefs:
487 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")
489 # Remove these refs from all the refs and if there is nothing left
490 # then we can delete
491 remainingRefs = allRefs - {ref.id}
493 if remainingRefs:
494 return False
495 return True
497 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]:
498 """Predict the location and related file information of the requested
499 dataset in this datastore.
501 Parameters
502 ----------
503 ref : `DatasetRef`
504 Reference to the required `Dataset`.
506 Returns
507 -------
508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
509 Expected Location of the dataset within the datastore and
510 placeholder information about each file and its formatter.
512 Notes
513 -----
514 Uses the current configuration to determine how we would expect the
515 datastore files to have been written if we couldn't ask registry.
516 This is safe so long as there has been no change to datastore
517 configuration between writing the dataset and wanting to read it.
518 Will not work for files that have been ingested without using the
519 standard file template or default formatter.
520 """
522 # If we have a component ref we always need to ask the questions
523 # of the composite. If the composite is disassembled this routine
524 # should return all components. If the composite was not
525 # disassembled the composite is what is stored regardless of
526 # component request. Note that if the caller has disassembled
527 # a composite there is no way for this guess to know that
528 # without trying both the composite and component ref and seeing
529 # if there is something at the component Location even without
530 # disassembly being enabled.
531 if ref.datasetType.isComponent():
532 ref = ref.makeCompositeRef()
534 # See if the ref is a composite that should be disassembled
535 doDisassembly = self.composites.shouldBeDisassembled(ref)
537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = []
539 if doDisassembly:
540 for component, componentStorage in ref.datasetType.storageClass.components.items():
541 compRef = ref.makeComponentRef(component)
542 location, formatter = self._determine_put_formatter_location(compRef)
543 all_info.append((location, formatter, componentStorage, component))
545 else:
546 # Always use the composite ref if no disassembly
547 location, formatter = self._determine_put_formatter_location(ref)
548 all_info.append((location, formatter, ref.datasetType.storageClass, None))
550 # Convert the list of tuples to have StoredFileInfo as second element
551 return [
552 (
553 location,
554 StoredFileInfo(
555 formatter=formatter,
556 path=location.pathInStore.path,
557 storageClass=storageClass,
558 component=component,
559 checksum=None,
560 file_size=-1,
561 ),
562 )
563 for location, formatter, storageClass, component in all_info
564 ]
566 def _prepare_for_get(
567 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
568 ) -> List[DatastoreFileGetInformation]:
569 """Check parameters for ``get`` and obtain formatter and
570 location.
572 Parameters
573 ----------
574 ref : `DatasetRef`
575 Reference to the required Dataset.
576 parameters : `dict`
577 `StorageClass`-specific parameters that specify, for example,
578 a slice of the dataset to be loaded.
580 Returns
581 -------
582 getInfo : `list` [`DatastoreFileGetInformation`]
583 Parameters needed to retrieve each file.
584 """
585 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
587 # Get file metadata and internal metadata
588 fileLocations = self._get_dataset_locations_info(ref)
589 if not fileLocations:
590 if not self.trustGetRequest:
591 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
592 # Assume the dataset is where we think it should be
593 fileLocations = self._get_expected_dataset_locations_info(ref)
595 # The storage class we want to use eventually
596 refStorageClass = ref.datasetType.storageClass
598 if len(fileLocations) > 1:
599 disassembled = True
601 # If trust is involved it is possible that there will be
602 # components listed here that do not exist in the datastore.
603 # Explicitly check for file artifact existence and filter out any
604 # that are missing.
605 if self.trustGetRequest:
606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()]
608 # For now complain only if we have no components at all. A single
609 # missing component is probably a problem but we can punt that to the
610 # assembler.
611 if not fileLocations:
612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.")
614 else:
615 disassembled = False
617 # Is this a component request?
618 refComponent = ref.datasetType.component()
620 fileGetInfo = []
621 for location, storedFileInfo in fileLocations:
623 # The storage class used to write the file
624 writeStorageClass = storedFileInfo.storageClass
626 # If this has been disassembled we need read to match the write
627 if disassembled:
628 readStorageClass = writeStorageClass
629 else:
630 readStorageClass = refStorageClass
632 formatter = get_instance_of(
633 storedFileInfo.formatter,
634 FileDescriptor(
635 location,
636 readStorageClass=readStorageClass,
637 storageClass=writeStorageClass,
638 parameters=parameters,
639 ),
640 ref.dataId,
641 )
643 formatterParams, notFormatterParams = formatter.segregateParameters()
645 # Of the remaining parameters, extract the ones supported by
646 # this StorageClass (for components not all will be handled)
647 assemblerParams = readStorageClass.filterParameters(notFormatterParams)
649 # The ref itself could be a component if the dataset was
650 # disassembled by butler, or we disassembled in datastore and
651 # components came from the datastore records
652 component = storedFileInfo.component if storedFileInfo.component else refComponent
654 fileGetInfo.append(
655 DatastoreFileGetInformation(
656 location,
657 formatter,
658 storedFileInfo,
659 assemblerParams,
660 formatterParams,
661 component,
662 readStorageClass,
663 )
664 )
666 return fileGetInfo
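    # Note: ``disassembled`` above is inferred from the number of stored file
    # records. For a disassembled composite each component is read back with
    # the storage class it was written with; for a single-file dataset the
    # requesting ref's storage class is used for the read.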
668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]:
669 """Check the arguments for ``put`` and obtain formatter and
670 location.
672 Parameters
673 ----------
674 inMemoryDataset : `object`
675 The dataset to store.
676 ref : `DatasetRef`
677 Reference to the associated Dataset.
679 Returns
680 -------
681 location : `Location`
682 The location to write the dataset.
683 formatter : `Formatter`
684 The `Formatter` to use to write the dataset.
686 Raises
687 ------
688 TypeError
689 Supplied object and storage class are inconsistent.
690 DatasetTypeNotSupportedError
691 The associated `DatasetType` is not handled by this datastore.
692 """
693 self._validate_put_parameters(inMemoryDataset, ref)
694 return self._determine_put_formatter_location(ref)
696 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]:
697 """Calculate the formatter and output location to use for put.
699 Parameters
700 ----------
701 ref : `DatasetRef`
702 Reference to the associated Dataset.
704 Returns
705 -------
706 location : `Location`
707 The location to write the dataset.
708 formatter : `Formatter`
709 The `Formatter` to use to write the dataset.
710 """
711 # Work out output file name
712 try:
713 template = self.templates.getTemplate(ref)
714 except KeyError as e:
715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
717 # Validate the template to protect against filenames from different
718 # dataIds returning the same and causing overwrite confusion.
719 template.validateTemplate(ref)
721 location = self.locationFactory.fromPath(template.format(ref))
723 # Get the formatter based on the storage class
724 storageClass = ref.datasetType.storageClass
725 try:
726 formatter = self.formatterFactory.getFormatter(
727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId
728 )
729 except KeyError as e:
730 raise DatasetTypeNotSupportedError(
731 f"Unable to find formatter for {ref} in datastore {self.name}"
732 ) from e
734 # Now that we know the formatter, update the location
735 location = formatter.makeUpdatedLocation(location)
737 return location, formatter
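    # Put flow implemented above: the file template derived from the ref gives
    # an initial Location, then the selected Formatter may adjust it (typically
    # to set the file extension) via ``makeUpdatedLocation``.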
739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
740 # Docstring inherited from base class
741 if transfer != "auto":
742 return transfer
744 # See if the paths are within the datastore or not
745 inside = [self._pathInStore(d.path) is not None for d in datasets]
747 if all(inside):
748 transfer = None
749 elif not any(inside):
750 # Allow ResourcePath to use its own knowledge
751 transfer = "auto"
752 else:
753 # This can happen when importing from a datastore that
754 # has had some datasets ingested using "direct" mode.
755 # Also allow ResourcePath to sort it out but warn about it.
758 log.warning(
759 "Some datasets are inside the datastore and some are outside. Using 'split' "
760 "transfer mode. This assumes that the files outside the datastore are "
761 "still accessible to the new butler since they will not be copied into "
762 "the target datastore."
763 )
764 transfer = "split"
766 return transfer
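    # Resolution of ``transfer="auto"`` implemented above:
    #   * all paths already inside the datastore -> ``None`` (ingest in place)
    #   * all paths outside the datastore        -> ``"auto"`` (ResourcePath decides)
    #   * a mixture of the two                   -> ``"split"`` (with a warning)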
768 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]:
769 """Return path relative to datastore root
771 Parameters
772 ----------
773 path : `lsst.resources.ResourcePathExpression`
774 Path to dataset. Can be absolute URI. If relative assumed to
775 be relative to the datastore. Returns the path within the
776 datastore, or `None` if the path is outside the root.
778 Returns
779 -------
780 inStore : `str`
781 Path relative to datastore root. Returns `None` if the file is
782 outside the root.
783 """
784 # Relative path will always be relative to datastore
785 pathUri = ResourcePath(path, forceAbsolute=False)
786 return pathUri.relative_to(self.root)
788 def _standardizeIngestPath(
789 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None
790 ) -> Union[str, ResourcePath]:
791 """Standardize the path of a to-be-ingested file.
793 Parameters
794 ----------
795 path : `str` or `lsst.resources.ResourcePath`
796 Path of a file to be ingested. This parameter is not expected
797 to be all the types that can be used to construct a
798 `~lsst.resources.ResourcePath`.
799 transfer : `str`, optional
800 How (and whether) the dataset should be added to the datastore.
801 See `ingest` for details of transfer modes.
802 This implementation is provided only so
803 `NotImplementedError` can be raised if the mode is not supported;
804 actual transfers are deferred to `_extractIngestInfo`.
806 Returns
807 -------
808 path : `str` or `lsst.resources.ResourcePath`
809 New path in what the datastore considers standard form. If an
810 absolute URI was given that will be returned unchanged.
812 Notes
813 -----
814 Subclasses of `FileDatastore` can implement this method instead
815 of `_prepIngest`. It should not modify the data repository or given
816 file in any way.
818 Raises
819 ------
820 NotImplementedError
821 Raised if the datastore does not support the given transfer mode
822 (including the case where ingest is not supported at all).
823 FileNotFoundError
824 Raised if one of the given files does not exist.
825 """
826 if transfer not in (None, "direct", "split") + self.root.transferModes:
827 raise NotImplementedError(f"Transfer mode {transfer} not supported.")
829 # A relative URI indicates relative to datastore root
830 srcUri = ResourcePath(path, forceAbsolute=False)
831 if not srcUri.isabs():
832 srcUri = self.root.join(path)
834 if not srcUri.exists():
835 raise FileNotFoundError(
836 f"Resource at {srcUri} does not exist; note that paths to ingest "
837 f"are assumed to be relative to {self.root} unless they are absolute."
838 )
840 if transfer is None:
841 relpath = srcUri.relative_to(self.root)
842 if not relpath:
843 raise RuntimeError(
844 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})"
845 )
847 # Return the relative path within the datastore for internal
848 # transfer
849 path = relpath
851 return path
853 def _extractIngestInfo(
854 self,
855 path: ResourcePathExpression,
856 ref: DatasetRef,
857 *,
858 formatter: Union[Formatter, Type[Formatter]],
859 transfer: Optional[str] = None,
860 record_validation_info: bool = True,
861 ) -> StoredFileInfo:
862 """Relocate (if necessary) and extract `StoredFileInfo` from a
863 to-be-ingested file.
865 Parameters
866 ----------
867 path : `lsst.resources.ResourcePathExpression`
868 URI or path of a file to be ingested.
869 ref : `DatasetRef`
870 Reference for the dataset being ingested. Guaranteed to have
871 ``dataset_id is not None``.
872 formatter : `type` or `Formatter`
873 `Formatter` subclass to use for this dataset or an instance.
874 transfer : `str`, optional
875 How (and whether) the dataset should be added to the datastore.
876 See `ingest` for details of transfer modes.
877 record_validation_info : `bool`, optional
878 If `True`, the default, the datastore can record validation
879 information associated with the file. If `False` the datastore
880 will not attempt to track any information such as checksums
881 or file sizes. This can be useful if such information is tracked
882 in an external system or if the file is to be compressed in place.
883 It is up to the datastore whether this parameter is relevant.
885 Returns
886 -------
887 info : `StoredFileInfo`
888 Internal datastore record for this file. This will be inserted by
889 the caller; the `_extractIngestInfo` is only responsible for
890 creating and populating the struct.
892 Raises
893 ------
894 FileNotFoundError
895 Raised if one of the given files does not exist.
896 FileExistsError
897 Raised if transfer is not `None` but the (internal) location the
898 file would be moved to is already occupied.
899 """
900 if self._transaction is None:
901 raise RuntimeError("Ingest called without transaction enabled")
903 # Create URI of the source path, do not need to force a relative
904 # path to absolute.
905 srcUri = ResourcePath(path, forceAbsolute=False)
907 # Track whether we have read the size of the source yet
908 have_sized = False
910 tgtLocation: Optional[Location]
911 if transfer is None or transfer == "split":
912 # A relative path is assumed to be relative to the datastore
913 # in this context
914 if not srcUri.isabs():
915 tgtLocation = self.locationFactory.fromPath(srcUri.ospath)
916 else:
917 # Work out the path in the datastore from an absolute URI
918 # This is required to be within the datastore.
919 pathInStore = srcUri.relative_to(self.root)
920 if pathInStore is None and transfer is None:
921 raise RuntimeError(
922 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}"
923 )
924 if pathInStore:
925 tgtLocation = self.locationFactory.fromPath(pathInStore)
926 elif transfer == "split":
927 # Outside the datastore but treat that as a direct ingest
928 # instead.
929 tgtLocation = None
930 else:
931 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}")
932 elif transfer == "direct":
933 # Want to store the full URI to the resource directly in
934 # datastore. This is useful for referring to permanent archive
935 # storage for raw data.
936 # Trust that people know what they are doing.
937 tgtLocation = None
938 else:
939 # Work out the name we want this ingested file to have
940 # inside the datastore
941 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
942 if not tgtLocation.uri.dirname().exists():
943 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
944 tgtLocation.uri.dirname().mkdir()
946 # if we are transferring from a local file to a remote location
947 # it may be more efficient to get the size and checksum of the
948 # local file rather than the transferred one
949 if record_validation_info and srcUri.isLocal:
950 size = srcUri.size()
951 checksum = self.computeChecksum(srcUri) if self.useChecksum else None
952 have_sized = True
954 # Transfer the resource to the destination.
955 # Allow overwrite of an existing file. This matches the behavior
956 # of datastore.put() in that it trusts that registry would not
957 # be asking to overwrite unless registry thought that the
958 # overwrite was allowed.
959 tgtLocation.uri.transfer_from(
960 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True
961 )
963 if tgtLocation is None:
964 # This means we are using direct mode
965 targetUri = srcUri
966 targetPath = str(srcUri)
967 else:
968 targetUri = tgtLocation.uri
969 targetPath = tgtLocation.pathInStore.path
971 # the file should exist in the datastore now
972 if record_validation_info:
973 if not have_sized:
974 size = targetUri.size()
975 checksum = self.computeChecksum(targetUri) if self.useChecksum else None
976 else:
977 # Not recording any file information.
978 size = -1
979 checksum = None
981 return StoredFileInfo(
982 formatter=formatter,
983 path=targetPath,
984 storageClass=ref.datasetType.storageClass,
985 component=ref.datasetType.component(),
986 file_size=size,
987 checksum=checksum,
988 )
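    # Target location chosen by ``_extractIngestInfo`` above, by transfer mode:
    # ``None``/``"split"`` keep the existing path (which must normally lie
    # inside the datastore), ``"direct"`` stores the absolute source URI
    # unchanged, and any other mode transfers the file to a name computed by
    # ``_calculate_ingested_datastore_name``.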
990 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
991 # Docstring inherited from Datastore._prepIngest.
992 filtered = []
993 for dataset in datasets:
994 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
995 if not acceptable:
996 continue
997 else:
998 dataset.refs = acceptable
999 if dataset.formatter is None:
1000 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
1001 else:
1002 assert isinstance(dataset.formatter, (type, str))
1003 formatter_class = get_class_of(dataset.formatter)
1004 if not issubclass(formatter_class, Formatter):
1005 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.")
1006 dataset.formatter = formatter_class
1007 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
1008 filtered.append(dataset)
1009 return _IngestPrepData(filtered)
1011 @transactional
1012 def _finishIngest(
1013 self,
1014 prepData: Datastore.IngestPrepData,
1015 *,
1016 transfer: Optional[str] = None,
1017 record_validation_info: bool = True,
1018 ) -> None:
1019 # Docstring inherited from Datastore._finishIngest.
1020 refsAndInfos = []
1021 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG)
1022 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"):
1023 # Do ingest as if the first dataset ref is associated with the file
1024 info = self._extractIngestInfo(
1025 dataset.path,
1026 dataset.refs[0],
1027 formatter=dataset.formatter,
1028 transfer=transfer,
1029 record_validation_info=record_validation_info,
1030 )
1031 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
1032 self._register_datasets(refsAndInfos)
1034 def _calculate_ingested_datastore_name(
1035 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]]
1036 ) -> Location:
1037 """Given a source URI and a DatasetRef, determine the name the
1038 dataset will have inside datastore.
1040 Parameters
1041 ----------
1042 srcUri : `lsst.resources.ResourcePath`
1043 URI to the source dataset file.
1044 ref : `DatasetRef`
1045 Ref associated with the newly-ingested dataset artifact. This
1046 is used to determine the name within the datastore.
1047 formatter : `Formatter` or Formatter class.
1048 Formatter to use for validation. Can be a class or an instance.
1050 Returns
1051 -------
1052 location : `Location`
1053 Target location for the newly-ingested dataset.
1054 """
1055 # Ingesting a file from outside the datastore.
1056 # This involves a new name.
1057 template = self.templates.getTemplate(ref)
1058 location = self.locationFactory.fromPath(template.format(ref))
1060 # Get the extension
1061 ext = srcUri.getExtension()
1063 # Update the destination to include that extension
1064 location.updateExtension(ext)
1066 # Ask the formatter to validate this extension
1067 formatter.validateExtension(location)
1069 return location
1071 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
1072 """Write out in memory dataset to datastore.
1074 Parameters
1075 ----------
1076 inMemoryDataset : `object`
1077 Dataset to write to datastore.
1078 ref : `DatasetRef`
1079 Registry information associated with this dataset.
1081 Returns
1082 -------
1083 info : `StoredFileInfo`
1084 Information describing the artifact written to the datastore.
1085 """
1086 # May need to coerce the in memory dataset to the correct
1087 # python type.
1088 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
1090 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
1091 uri = location.uri
1093 if not uri.dirname().exists():
1094 log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
1095 uri.dirname().mkdir()
1097 if self._transaction is None:
1098 raise RuntimeError("Attempting to write artifact without transaction enabled")
1100 def _removeFileExists(uri: ResourcePath) -> None:
1101 """Remove a file and do not complain if it is not there.
1103 This is important since a formatter might fail before the file
1104 is written and we should not confuse people by writing spurious
1105 error messages to the log.
1106 """
1107 try:
1108 uri.remove()
1109 except FileNotFoundError:
1110 pass
1112 # Register a callback to try to delete the uploaded data if
1113 # something fails below
1114 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
1116 # For a local file, simply use the formatter directly
1117 if uri.isLocal:
1118 try:
1119 formatter.write(inMemoryDataset)
1120 except Exception as e:
1121 raise RuntimeError(
1122 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}"
1123 ) from e
1124 log.debug("Successfully wrote python object to local file at %s", uri)
1125 else:
1126 # This is a remote URI. Some datasets can be serialized directly
1127 # to bytes and sent to the remote datastore without writing a
1128 # file. If the dataset is intended to be saved to the cache
1129 # a file is always written and direct write to the remote
1130 # datastore is bypassed.
1131 data_written = False
1132 if not self.cacheManager.should_be_cached(ref):
1133 try:
1134 serializedDataset = formatter.toBytes(inMemoryDataset)
1135 except NotImplementedError:
1136 # Fallback to the file writing option.
1137 pass
1138 except Exception as e:
1139 raise RuntimeError(
1140 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes."
1141 ) from e
1142 else:
1143 log.debug("Writing bytes directly to %s", uri)
1144 uri.write(serializedDataset, overwrite=True)
1145 log.debug("Successfully wrote bytes directly to %s", uri)
1146 data_written = True
1148 if not data_written:
1149 # Did not write the bytes directly to object store so instead
1150 # write to temporary file.
1151 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri:
1152 # Need to configure the formatter to write to a different
1153 # location and that needs us to overwrite internals
1154 log.debug("Writing dataset to temporary location at %s", temporary_uri)
1155 with formatter._updateLocation(Location(None, temporary_uri)):
1156 try:
1157 formatter.write(inMemoryDataset)
1158 except Exception as e:
1159 raise RuntimeError(
1160 f"Failed to serialize dataset {ref} of type"
1161 f" {type(inMemoryDataset)} to "
1162 f"temporary location {temporary_uri}"
1163 ) from e
1164 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True)
1166 # Cache if required
1167 self.cacheManager.move_to_cache(temporary_uri, ref)
1169 log.debug("Successfully wrote dataset to %s via a temporary file.", uri)
1171 # URI is needed to resolve what ingest case are we dealing with
1172 return self._extractIngestInfo(uri, ref, formatter=formatter)
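    # Write strategy implemented above: local URIs are written directly with
    # ``formatter.write``; remote URIs first try ``formatter.toBytes`` (skipped
    # when the dataset should be cached), falling back to writing a temporary
    # local file that is then transferred and optionally moved into the cache.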
1174 def _read_artifact_into_memory(
1175 self,
1176 getInfo: DatastoreFileGetInformation,
1177 ref: DatasetRef,
1178 isComponent: bool = False,
1179 cache_ref: Optional[DatasetRef] = None,
1180 ) -> Any:
1181 """Read the artifact from datastore into in memory object.
1183 Parameters
1184 ----------
1185 getInfo : `DatastoreFileGetInformation`
1186 Information about the artifact within the datastore.
1187 ref : `DatasetRef`
1188 The registry information associated with this artifact.
1189 isComponent : `bool`
1190 Flag to indicate if a component is being read from this artifact.
1191 cache_ref : `DatasetRef`, optional
1192 The DatasetRef to use when looking up the file in the cache.
1193 This ref must have the same ID as the supplied ref but can
1194 be a parent ref or component ref to indicate to the cache whether
1195 a composite file is being requested from the cache or a component
1196 file. Without this the cache will default to the supplied ref but
1197 it can get confused with read-only derived components for
1198 disassembled composites.
1200 Returns
1201 -------
1202 inMemoryDataset : `object`
1203 The artifact as a python object.
1204 """
1205 location = getInfo.location
1206 uri = location.uri
1207 log.debug("Accessing data from %s", uri)
1209 if cache_ref is None:
1210 cache_ref = ref
1211 if cache_ref.id != ref.id:
1212 raise ValueError(
1213 "The supplied cache dataset ref refers to a different dataset than expected:"
1214 f" {ref.id} != {cache_ref.id}"
1215 )
1217 # Cannot recalculate checksum but can compare size as a quick check
1218 # Do not do this if the size is negative since that indicates
1219 # we do not know.
1220 recorded_size = getInfo.info.file_size
1221 resource_size = uri.size()
1222 if recorded_size >= 0 and resource_size != recorded_size:
1223 raise RuntimeError(
1224 "Integrity failure in Datastore. "
1225 f"Size of file {uri} ({resource_size}) "
1226 f"does not match size recorded in registry of {recorded_size}"
1227 )
1229 # For the general case we have choices for how to proceed.
1230 # 1. Always use a local file (downloading the remote resource to a
1231 # temporary file if needed).
1232 # 2. Use a threshold size and read into memory and use bytes.
1233 # Use both for now with an arbitrary hand off size.
1234 # This allows small datasets to be downloaded from remote object
1235 # stores without requiring a temporary file.
1237 formatter = getInfo.formatter
1238 nbytes_max = 10_000_000 # Arbitrary number that we can tune
1239 if resource_size <= nbytes_max and formatter.can_read_bytes():
1240 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1241 if cached_file is not None:
1242 desired_uri = cached_file
1243 msg = f" (cached version of {uri})"
1244 else:
1245 desired_uri = uri
1246 msg = ""
1247 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
1248 serializedDataset = desired_uri.read()
1249 log.debug(
1250 "Deserializing %s from %d bytes from location %s with formatter %s",
1251 f"component {getInfo.component}" if isComponent else "",
1252 len(serializedDataset),
1253 uri,
1254 formatter.name(),
1255 )
1256 try:
1257 result = formatter.fromBytes(
1258 serializedDataset, component=getInfo.component if isComponent else None
1259 )
1260 except Exception as e:
1261 raise ValueError(
1262 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1263 f" ({ref.datasetType.name} from {uri}): {e}"
1264 ) from e
1265 else:
1266 # Read from file.
1268 # Have to update the Location associated with the formatter
1269 # because formatter.read does not allow an override.
1270 # This could be improved.
1271 location_updated = False
1272 msg = ""
1274 # First check in cache for local version.
1275 # The cache will only be relevant for remote resources but
1276 # no harm in always asking. Context manager ensures that cache
1277 # file is not deleted during cache expiration.
1278 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
1279 if cached_file is not None:
1280 msg = f"(via cache read of remote file {uri})"
1281 uri = cached_file
1282 location_updated = True
1284 with uri.as_local() as local_uri:
1286 can_be_cached = False
1287 if uri != local_uri:
1288 # URI was remote and file was downloaded
1289 cache_msg = ""
1290 location_updated = True
1292 if self.cacheManager.should_be_cached(cache_ref):
1293 # In this scenario we want to ask if the downloaded
1294 # file should be cached but we should not cache
1295 # it until after we've used it (to ensure it can't
1296 # be expired whilst we are using it).
1297 can_be_cached = True
1299 # Say that it is "likely" to be cached because
1300 # if the formatter read fails we will not be
1301 # caching this file.
1302 cache_msg = " and likely cached"
1304 msg = f"(via download to local file{cache_msg})"
1306 # Calculate the (possibly) new location for the formatter
1307 # to use.
1308 newLocation = Location(*local_uri.split()) if location_updated else None
1310 log.debug(
1311 "Reading%s from location %s %s with formatter %s",
1312 f" component {getInfo.component}" if isComponent else "",
1313 uri,
1314 msg,
1315 formatter.name(),
1316 )
1317 try:
1318 with formatter._updateLocation(newLocation):
1319 with time_this(
1320 log,
1321 msg="Reading%s from location %s %s with formatter %s",
1322 args=(
1323 f" component {getInfo.component}" if isComponent else "",
1324 uri,
1325 msg,
1326 formatter.name(),
1327 ),
1328 ):
1329 result = formatter.read(component=getInfo.component if isComponent else None)
1330 except Exception as e:
1331 raise ValueError(
1332 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
1333 f" ({ref.datasetType.name} from {uri}): {e}"
1334 ) from e
1336 # File was read successfully so can move to cache
1337 if can_be_cached:
1338 self.cacheManager.move_to_cache(local_uri, cache_ref)
1340 return self._post_process_get(
1341 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent
1342 )
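    # Read strategy implemented above: small artifacts whose formatter supports
    # ``fromBytes`` are read as bytes (preferring a cached copy); everything
    # else is materialised as a local file via ``uri.as_local()``, read with
    # ``formatter.read``, and then offered to the cache if appropriate.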
1344 def knows(self, ref: DatasetRef) -> bool:
1345 """Check if the dataset is known to the datastore.
1347 Does not check for existence of any artifact.
1349 Parameters
1350 ----------
1351 ref : `DatasetRef`
1352 Reference to the required dataset.
1354 Returns
1355 -------
1356 exists : `bool`
1357 `True` if the dataset is known to the datastore.
1358 """
1359 fileLocations = self._get_dataset_locations_info(ref)
1360 if fileLocations:
1361 return True
1362 return False
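    # Hypothetical usage sketch (``datastore`` and ``ref`` supplied elsewhere):
    #
    #     if datastore.knows(ref):
    #         ...  # a datastore record exists; the artifact itself is not checked
    #     else:
    #         ...  # fall back to exists()/mexists() or to registry queries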
1364 def _process_mexists_records(
1365 self,
1366 id_to_ref: Dict[DatasetId, DatasetRef],
1367 records: Dict[DatasetId, List[StoredFileInfo]],
1368 all_required: bool,
1369 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
1370 ) -> Dict[DatasetRef, bool]:
1371 """Helper function for mexists that checks the given records.
1373 Parameters
1374 ----------
1375 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`]
1376 Mapping of the dataset ID to the dataset ref itself.
1377 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`]
1378 Records as generally returned by
1379 ``_get_stored_records_associated_with_refs``.
1380 all_required : `bool`
1381 Flag to indicate whether existence requires all artifacts
1382 associated with a dataset ID to exist or not for existence.
1383 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1384 Optional mapping of datastore artifact to existence. Updated by
1385 this method with details of all artifacts tested. Can be `None`
1386 if the caller is not interested.
1388 Returns
1389 -------
1390 existence : `dict` of [`DatasetRef`, `bool`]
1391 Mapping from dataset to boolean indicating existence.
1392 """
1393 # The URIs to be checked and a mapping of those URIs to
1394 # the dataset ID.
1395 uris_to_check: List[ResourcePath] = []
1396 location_map: Dict[ResourcePath, DatasetId] = {}
1398 location_factory = self.locationFactory
1400 uri_existence: Dict[ResourcePath, bool] = {}
1401 for ref_id, infos in records.items():
1402 # Key is the dataset Id, value is list of StoredItemInfo
1403 uris = [info.file_location(location_factory).uri for info in infos]
1404 location_map.update({uri: ref_id for uri in uris})
1406 # Check the local cache directly for a dataset corresponding
1407 # to the remote URI.
1408 if self.cacheManager.file_count > 0:
1409 ref = id_to_ref[ref_id]
1410 for uri, storedFileInfo in zip(uris, infos):
1411 check_ref = ref
1412 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1413 check_ref = ref.makeComponentRef(component)
1414 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()):
1415 # Proxy for URI existence.
1416 uri_existence[uri] = True
1417 else:
1418 uris_to_check.append(uri)
1419 else:
1420 # Check all of them.
1421 uris_to_check.extend(uris)
1423 if artifact_existence is not None:
1424 # If a URI has already been checked remove it from the list
1425 # and immediately add the status to the output dict.
1426 filtered_uris_to_check = []
1427 for uri in uris_to_check:
1428 if uri in artifact_existence:
1429 uri_existence[uri] = artifact_existence[uri]
1430 else:
1431 filtered_uris_to_check.append(uri)
1432 uris_to_check = filtered_uris_to_check
1434 # Results.
1435 dataset_existence: Dict[DatasetRef, bool] = {}
1437 uri_existence.update(ResourcePath.mexists(uris_to_check))
1438 for uri, exists in uri_existence.items():
1439 dataset_id = location_map[uri]
1440 ref = id_to_ref[dataset_id]
1442 # Disassembled composite needs to check all locations.
1443 # all_required indicates whether all need to exist or not.
1444 if ref in dataset_existence:
1445 if all_required:
1446 exists = dataset_existence[ref] and exists
1447 else:
1448 exists = dataset_existence[ref] or exists
1449 dataset_existence[ref] = exists
1451 if artifact_existence is not None:
1452 artifact_existence.update(uri_existence)
1454 return dataset_existence
1456 def mexists(
1457 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1458 ) -> Dict[DatasetRef, bool]:
1459 """Check the existence of multiple datasets at once.
1461 Parameters
1462 ----------
1463 refs : iterable of `DatasetRef`
1464 The datasets to be checked.
1465 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
1466 Optional mapping of datastore artifact to existence. Updated by
1467 this method with details of all artifacts tested. Can be `None`
1468 if the caller is not interested.
1470 Returns
1471 -------
1472 existence : `dict` of [`DatasetRef`, `bool`]
1473 Mapping from dataset to boolean indicating existence.
1475 Notes
1476 -----
1477 To minimize potentially costly remote existence checks, the local
1478 cache is checked as a proxy for existence. If a file for this
1479 `DatasetRef` exists in the cache, no check is made against the actual URI. This
1480 could result in possibly unexpected behavior if the dataset itself
1481 has been removed from the datastore by another process whilst it is
1482 still in the cache.
1483 """
1484 chunk_size = 10_000
1485 dataset_existence: Dict[DatasetRef, bool] = {}
1486 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size)
1487 n_found_total = 0
1488 n_checked = 0
1489 n_chunks = 0
1490 for chunk in chunk_iterable(refs, chunk_size=chunk_size):
1491 chunk_result = self._mexists(chunk, artifact_existence)
1492 if log.isEnabledFor(VERBOSE):
1493 n_results = len(chunk_result)
1494 n_checked += n_results
1495 # Can treat the booleans as 0, 1 integers and sum them.
1496 n_found = sum(chunk_result.values())
1497 n_found_total += n_found
1498 log.verbose(
1499 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)",
1500 n_chunks,
1501 n_found,
1502 n_results,
1503 n_found_total,
1504 n_checked,
1505 )
1506 dataset_existence.update(chunk_result)
1507 n_chunks += 1
1509 return dataset_existence
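    # A minimal usage sketch, assuming ``datastore`` is a configured
    # FileDatastore and ``refs`` is an iterable of resolved ``DatasetRef``s
    # (all names here are hypothetical):
    #
    #     existence = datastore.mexists(refs)
    #     missing = [ref for ref, found in existence.items() if not found]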
1511 def _mexists(
1512 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
1513 ) -> Dict[DatasetRef, bool]:
1514 """Check the existence of multiple datasets at once.
1516 Parameters
1517 ----------
1518 refs : iterable of `DatasetRef`
1519 The datasets to be checked.
1521 Returns
1522 -------
1523 existence : `dict` of [`DatasetRef`, `bool`]
1524 Mapping from dataset to boolean indicating existence.
1525 """
1526 # Need a mapping of dataset_id to dataset ref since the API
1527 # works with dataset_id
1528 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
1530 # Set of all IDs we are checking for.
1531 requested_ids = set(id_to_ref.keys())
1533 # The records themselves. Could be missing some entries.
1534 records = self._get_stored_records_associated_with_refs(refs)
1536 dataset_existence = self._process_mexists_records(
1537 id_to_ref, records, True, artifact_existence=artifact_existence
1538 )
1540 # Set of IDs that have been handled.
1541 handled_ids = {ref.id for ref in dataset_existence.keys()}
1543 missing_ids = requested_ids - handled_ids
1544 if missing_ids:
1545 if not self.trustGetRequest:
1546 # Must assume these do not exist
1547 for missing in missing_ids:
1548 dataset_existence[id_to_ref[missing]] = False
1549 else:
1550 log.debug(
1551 "%d out of %d datasets were not known to datastore during initial existence check.",
1552 len(missing_ids),
1553 len(requested_ids),
1554 )
1556 # Construct data structure identical to that returned
1557 # by _get_stored_records_associated_with_refs() but using
1558 # guessed names.
1559 records = {}
1560 for missing in missing_ids:
1561 expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
1562 records[missing] = [info for _, info in expected]
1564 dataset_existence.update(
1565 self._process_mexists_records(
1566 id_to_ref, records, False, artifact_existence=artifact_existence
1567 )
1568 )
1570 return dataset_existence
1572 def exists(self, ref: DatasetRef) -> bool:
1573 """Check if the dataset exists in the datastore.
1575 Parameters
1576 ----------
1577 ref : `DatasetRef`
1578 Reference to the required dataset.
1580 Returns
1581 -------
1582 exists : `bool`
1583 `True` if the entity exists in the `Datastore`.
1585 Notes
1586 -----
1587 The local cache is checked as a proxy for existence in the remote
1588 object store. It is possible that another process on a different
1589 compute node could remove the file from the object store even
1590 though it is present in the local cache.
1591 """
1592 fileLocations = self._get_dataset_locations_info(ref)
1594 # If we are being asked to trust that the registry might not be
1595 # correct, we ask for the expected locations and check them explicitly.
1596 if not fileLocations:
1597 if not self.trustGetRequest:
1598 return False
1600 # First check the cache. If it is not found we must check
1601 # the datastore itself. Assume that any component in the cache
1602 # means that the dataset does exist somewhere.
1603 if self.cacheManager.known_to_cache(ref): 1603 ↛ 1604line 1603 didn't jump to line 1604, because the condition on line 1603 was never true
1604 return True
1606 # When we are guessing a dataset location we can not check
1607 # for the existence of every component since we can not
1608 # know if every component was written. Instead we check
1609 # for the existence of any of the expected locations.
1610 for location, _ in self._get_expected_dataset_locations_info(ref): 1610 ↛ 1613line 1610 didn't jump to line 1613, because the loop on line 1610 didn't complete
1611 if self._artifact_exists(location): 1611 ↛ 1610line 1611 didn't jump to line 1610, because the condition on line 1611 was never false
1612 return True
1613 return False
1615 # All listed artifacts must exist.
1616 for location, storedFileInfo in fileLocations:
1617 # Checking in cache needs the component ref.
1618 check_ref = ref
1619 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):
1620 check_ref = ref.makeComponentRef(component)
1621 if self.cacheManager.known_to_cache(check_ref, location.getExtension()):
1622 continue
1624 if not self._artifact_exists(location):
1625 return False
1627 return True
1629 def getURIs(
1630 self, ref: DatasetRef, predict: bool = False
1631 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
1632 """Return URIs associated with dataset.
1634 Parameters
1635 ----------
1636 ref : `DatasetRef`
1637 Reference to the required dataset.
1638 predict : `bool`, optional
1639 If the datastore does not know about the dataset, should it
1640 return a predicted URI or not?
1642 Returns
1643 -------
1644 primary : `lsst.resources.ResourcePath`
1645 The URI to the primary artifact associated with this dataset.
1646 If the dataset was disassembled within the datastore this
1647 may be `None`.
1648 components : `dict`
1649 URIs to any components associated with the dataset artifact.
1650 Can be empty if there are no components.
1651 """
1653 primary: Optional[ResourcePath] = None
1654 components: Dict[str, ResourcePath] = {}
1656 # if this has never been written then we have to guess
1657 if not self.exists(ref):
1658 if not predict:
1659 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
1661 doDisassembly = self.composites.shouldBeDisassembled(ref)
1663 if doDisassembly:
1665 for component, componentStorage in ref.datasetType.storageClass.components.items():
1666 compRef = ref.makeComponentRef(component)
1667 compLocation, _ = self._determine_put_formatter_location(compRef)
1669 # Add a URI fragment to indicate this is a guess
1670 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted")
1672 else:
1674 location, _ = self._determine_put_formatter_location(ref)
1676 # Add a URI fragment to indicate this is a guess
1677 primary = ResourcePath(location.uri.geturl() + "#predicted")
1679 return primary, components
1681 # If this is a ref that we have written we can get the path.
1682 # Get file metadata and internal metadata
1683 fileLocations = self._get_dataset_locations_info(ref)
1685 guessing = False
1686 if not fileLocations:
1687 if not self.trustGetRequest: 1687 ↛ 1688line 1687 didn't jump to line 1688, because the condition on line 1687 was never true
1688 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}")
1689 fileLocations = self._get_expected_dataset_locations_info(ref)
1690 guessing = True
1692 if len(fileLocations) == 1:
1693 # No disassembly so this is the primary URI
1694 uri = fileLocations[0][0].uri
1695 if guessing and not uri.exists(): 1695 ↛ 1696line 1695 didn't jump to line 1696, because the condition on line 1695 was never true
1696 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1697 primary = uri
1699 else:
1700 for location, storedFileInfo in fileLocations:
1701 if storedFileInfo.component is None: 1701 ↛ 1702line 1701 didn't jump to line 1702, because the condition on line 1701 was never true
1702 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}")
1703 uri = location.uri
1704 if guessing and not uri.exists(): 1704 ↛ 1708line 1704 didn't jump to line 1708, because the condition on line 1704 was never true
1705 # If we are trusting then it is entirely possible for
1706 # some components to be missing. In that case we skip
1707 # to the next component.
1708 if self.trustGetRequest:
1709 continue
1710 raise FileNotFoundError(f"Expected URI ({uri}) does not exist")
1711 components[storedFileInfo.component] = uri
1713 return primary, components
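# Editor's illustrative sketch (not part of the source file): predicted URIs
# carry a "#predicted" fragment, so callers can distinguish guesses from real
# artifact locations. Assumes `datastore` and an as-yet-unwritten `ref`.
primary, components = datastore.getURIs(ref, predict=True)
for name, uri in components.items():
    if uri.geturl().endswith("#predicted"):
        log.info("Component %r has not been written yet: %s", name, uri)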
1715 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
1716 """URI to the Dataset.
1718 Parameters
1719 ----------
1720 ref : `DatasetRef`
1721 Reference to the required Dataset.
1722 predict : `bool`
1723 If `True`, allow URIs to be returned for datasets that have not
1724 been written.
1726 Returns
1727 -------
1728 uri : `lsst.resources.ResourcePath`
1729 URI pointing to the dataset within the datastore. If the
1730 dataset does not exist in the datastore, and if ``predict`` is
1731 `True`, the URI will be a prediction and will include a URI
1732 fragment "#predicted".
1733 If the datastore does not have entities that relate well
1734 to the concept of a URI the returned URI will be
1735 descriptive. The returned URI is not guaranteed to be obtainable.
1737 Raises
1738 ------
1739 FileNotFoundError
1740 Raised if a URI has been requested for a dataset that does not
1741 exist and guessing is not allowed.
1742 RuntimeError
1743 Raised if a request is made for a single URI but multiple URIs
1744 are associated with this dataset.
1746 Notes
1747 -----
1748 When a predicted URI is requested an attempt will be made to form
1749 a reasonable URI based on file templates and the expected formatter.
1750 """
1751 primary, components = self.getURIs(ref, predict)
1752 if primary is None or components: 1752 ↛ 1753line 1752 didn't jump to line 1753, because the condition on line 1752 was never true
1753 raise RuntimeError(
1754 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
1755 )
1756 return primary
1758 def retrieveArtifacts(
1759 self,
1760 refs: Iterable[DatasetRef],
1761 destination: ResourcePath,
1762 transfer: str = "auto",
1763 preserve_path: bool = True,
1764 overwrite: bool = False,
1765 ) -> List[ResourcePath]:
1766 """Retrieve the file artifacts associated with the supplied refs.
1768 Parameters
1769 ----------
1770 refs : iterable of `DatasetRef`
1771 The datasets for which file artifacts are to be retrieved.
1772 A single ref can result in multiple files. The refs must
1773 be resolved.
1774 destination : `lsst.resources.ResourcePath`
1775 Location to write the file artifacts.
1776 transfer : `str`, optional
1777 Method to use to transfer the artifacts. Must be one of the options
1778 supported by `lsst.resources.ResourcePath.transfer_from()`.
1779 "move" is not allowed.
1780 preserve_path : `bool`, optional
1781 If `True` the full path of the file artifact within the datastore
1782 is preserved. If `False` the final file component of the path
1783 is used.
1784 overwrite : `bool`, optional
1785 If `True` allow transfers to overwrite existing files at the
1786 destination.
1788 Returns
1789 -------
1790 targets : `list` of `lsst.resources.ResourcePath`
1791 URIs of file artifacts in destination location. Order is not
1792 preserved.
1793 """
1794 if not destination.isdir(): 1794 ↛ 1795line 1794 didn't jump to line 1795, because the condition on line 1794 was never true
1795 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
1797 if transfer == "move":
1798 raise ValueError("Can not move artifacts out of datastore. Use copy instead.")
1800 # Source -> Destination
1801 # This also helps filter out duplicate DatasetRef in the request
1802 # that will map to the same underlying file transfer.
1803 to_transfer: Dict[ResourcePath, ResourcePath] = {}
1805 for ref in refs:
1806 locations = self._get_dataset_locations_info(ref)
1807 for location, _ in locations:
1808 source_uri = location.uri
1809 target_path: ResourcePathExpression
1810 if preserve_path:
1811 target_path = location.pathInStore
1812 if target_path.isabs(): 1812 ↛ 1815line 1812 didn't jump to line 1815, because the condition on line 1812 was never true
1813 # This is an absolute path to an external file.
1814 # Use the full path.
1815 target_path = target_path.relativeToPathRoot
1816 else:
1817 target_path = source_uri.basename()
1818 target_uri = destination.join(target_path)
1819 to_transfer[source_uri] = target_uri
1821 # In theory can now parallelize the transfer
1822 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer))
1823 for source_uri, target_uri in to_transfer.items():
1824 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite)
1826 return list(to_transfer.values())
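# Editor's illustrative sketch (not part of the source file; the destination
# path is hypothetical): copy file artifacts out of the datastore while
# preserving their in-store paths.
destination = ResourcePath("/tmp/butler_artifacts/", forceDirectory=True)
targets = datastore.retrieveArtifacts(refs, destination, transfer="copy", preserve_path=True)
log.info("Retrieved %d file artifacts", len(targets))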
1828 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
1829 """Load an InMemoryDataset from the store.
1831 Parameters
1832 ----------
1833 ref : `DatasetRef`
1834 Reference to the required Dataset.
1835 parameters : `dict`
1836 `StorageClass`-specific parameters that specify, for example,
1837 a slice of the dataset to be loaded.
1839 Returns
1840 -------
1841 inMemoryDataset : `object`
1842 Requested dataset or slice thereof as an InMemoryDataset.
1844 Raises
1845 ------
1846 FileNotFoundError
1847 Requested dataset can not be retrieved.
1848 TypeError
1849 Return value from formatter has unexpected type.
1850 ValueError
1851 Formatter failed to process the dataset.
1852 """
1853 allGetInfo = self._prepare_for_get(ref, parameters)
1854 refComponent = ref.datasetType.component()
1856 # Supplied storage class for the component being read
1857 refStorageClass = ref.datasetType.storageClass
1859 # Create mapping from component name to related info
1860 allComponents = {i.component: i for i in allGetInfo}
1862 # By definition the dataset is disassembled if we have more
1863 # than one record for it.
1864 isDisassembled = len(allGetInfo) > 1
1866 # Look for the special case where we are disassembled but the
1867 # component is a derived component that was not written during
1868 # disassembly. For this scenario we need to check that the
1869 # component requested is listed as a derived component for the
1870 # composite storage class
1871 isDisassembledReadOnlyComponent = False
1872 if isDisassembled and refComponent:
1873 # The composite storage class should be accessible through
1874 # the component dataset type
1875 compositeStorageClass = ref.datasetType.parentStorageClass
1877 # In the unlikely scenario where the composite storage
1878 # class is not known, we can only assume that this is a
1879 # normal component. If that assumption is wrong then the
1880 # branch below that reads a persisted component will fail
1881 # so there is no need to complain here.
1882 if compositeStorageClass is not None: 1882 ↛ 1885line 1882 didn't jump to line 1885, because the condition on line 1882 was never false
1883 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents
1885 if isDisassembled and not refComponent:
1886 # This was a disassembled dataset spread over multiple files
1887 # and we need to put them all back together again.
1888 # Read into memory and then assemble
1890 # Check that the supplied parameters are suitable for the type read
1891 refStorageClass.validateParameters(parameters)
1893 # We want to keep track of all the parameters that were not used
1894 # by formatters. We assume that if any of the component formatters
1895 # use a parameter, we do not need to apply it again in the
1896 # assembler.
1897 usedParams = set()
1899 components: Dict[str, Any] = {}
1900 for getInfo in allGetInfo:
1901 # assemblerParams are parameters not understood by the
1902 # associated formatter.
1903 usedParams.update(set(getInfo.formatterParams))
1905 component = getInfo.component
1907 if component is None: 1907 ↛ 1908line 1907 didn't jump to line 1908, because the condition on line 1907 was never true
1908 raise RuntimeError(f"Internal error in datastore assembly of {ref}")
1910 # We do not want the formatter to think it's reading
1911 # a component though because it is really reading a
1912 # standalone dataset -- always tell reader it is not a
1913 # component.
1914 components[component] = self._read_artifact_into_memory(
1915 getInfo, ref.makeComponentRef(component), isComponent=False
1916 )
1918 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)
1920 # Any unused parameters will have to be passed to the assembler
1921 if parameters:
1922 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
1923 else:
1924 unusedParams = {}
1926 # Process parameters
1927 return ref.datasetType.storageClass.delegate().handleParameters(
1928 inMemoryDataset, parameters=unusedParams
1929 )
1931 elif isDisassembledReadOnlyComponent:
1933 compositeStorageClass = ref.datasetType.parentStorageClass
1934 if compositeStorageClass is None: 1934 ↛ 1935line 1934 didn't jump to line 1935, because the condition on line 1934 was never true
1935 raise RuntimeError(
1936 f"Unable to retrieve derived component '{refComponent}' since"
1937 "no composite storage class is available."
1938 )
1940 if refComponent is None: 1940 ↛ 1942line 1940 didn't jump to line 1942, because the condition on line 1940 was never true
1941 # Mainly for mypy
1942 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here")
1944 # Assume that every derived component can be calculated by
1945 # forwarding the request to a single read/write component.
1946 # Rather than guessing which rw component is the right one by
1947 # scanning each for a derived component of the same name,
1948 # we ask the storage class delegate directly which one is best to
1949 # use.
1950 compositeDelegate = compositeStorageClass.delegate()
1951 forwardedComponent = compositeDelegate.selectResponsibleComponent(
1952 refComponent, set(allComponents)
1953 )
1955 # Select the relevant component
1956 rwInfo = allComponents[forwardedComponent]
1958 # For now assume that read parameters are validated against
1959 # the real component and not the requested component
1960 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
1961 forwardedStorageClass.validateParameters(parameters)
1963 # The reference to use for the caching must refer to the forwarded
1964 # component and not the derived component.
1965 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)
1967 # Unfortunately the FileDescriptor inside the formatter will have
1968 # the wrong write storage class so we need to create a new one
1969 # given the immutability constraint.
1970 writeStorageClass = rwInfo.info.storageClass
1972 # We may need to put some thought into parameters for read
1973 # components but for now forward them on as is
1974 readFormatter = type(rwInfo.formatter)(
1975 FileDescriptor(
1976 rwInfo.location,
1977 readStorageClass=refStorageClass,
1978 storageClass=writeStorageClass,
1979 parameters=parameters,
1980 ),
1981 ref.dataId,
1982 )
1984 # The assembler can not receive any parameter requests for a
1985 # derived component at this time since the assembler will
1986 # see the storage class of the derived component and those
1987 # parameters will have to be handled by the formatter on the
1988 # forwarded storage class.
1989 assemblerParams: Dict[str, Any] = {}
1991 # Need to create a new info that specifies the derived
1992 # component and associated storage class
1993 readInfo = DatastoreFileGetInformation(
1994 rwInfo.location,
1995 readFormatter,
1996 rwInfo.info,
1997 assemblerParams,
1998 {},
1999 refComponent,
2000 refStorageClass,
2001 )
2003 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref)
2005 else:
2006 # Single file request or component from that composite file
2007 for lookup in (refComponent, None): 2007 ↛ 2012line 2007 didn't jump to line 2012, because the loop on line 2007 didn't complete
2008 if lookup in allComponents: 2008 ↛ 2007line 2008 didn't jump to line 2007, because the condition on line 2008 was never false
2009 getInfo = allComponents[lookup]
2010 break
2011 else:
2012 raise FileNotFoundError(
2013 f"Component {refComponent} not found for ref {ref} in datastore {self.name}"
2014 )
2016 # Do not need the component itself if already disassembled
2017 if isDisassembled:
2018 isComponent = False
2019 else:
2020 isComponent = getInfo.component is not None
2022 # For a component read of a composite we want the cache to
2023 # be looking at the composite ref itself.
2024 cache_ref = ref.makeCompositeRef() if isComponent else ref
2026 # For a disassembled component we can validate parameters against
2027 # the component storage class directly
2028 if isDisassembled:
2029 refStorageClass.validateParameters(parameters)
2030 else:
2031 # For an assembled composite this could be a derived
2032 # component derived from a real component. The validity
2033 # of the parameters is not clear. For now validate against
2034 # the composite storage class
2035 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)
2037 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref)
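# Editor's illustrative sketch (not part of the source file; the component
# name is hypothetical and depends on the storage class): read a full dataset
# and then a single component of it.
full = datastore.get(ref)
component = datastore.get(ref.makeComponentRef("wcs"))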
2039 @transactional
2040 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
2041 """Write a InMemoryDataset with a given `DatasetRef` to the store.
2043 Parameters
2044 ----------
2045 inMemoryDataset : `object`
2046 The dataset to store.
2047 ref : `DatasetRef`
2048 Reference to the associated Dataset.
2050 Raises
2051 ------
2052 TypeError
2053 Supplied object and storage class are inconsistent.
2054 DatasetTypeNotSupportedError
2055 The associated `DatasetType` is not handled by this datastore.
2057 Notes
2058 -----
2059 If the datastore is configured to reject certain dataset types it
2060 is possible that the put will fail and raise a
2061 `DatasetTypeNotSupportedError`. The main use case for this is to
2062 allow `ChainedDatastore` to put to multiple datastores without
2063 requiring that every datastore accepts the dataset.
2064 """
2066 doDisassembly = self.composites.shouldBeDisassembled(ref)
2067 # doDisassembly = True
2069 artifacts = []
2070 if doDisassembly:
2071 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset)
2072 for component, componentInfo in components.items():
2073 # Don't recurse because we want to take advantage of
2074 # bulk insert -- need a new DatasetRef that refers to the
2075 # same dataset_id but has the component DatasetType
2076 # The parent DatasetType does not carry the types of its
2077 # components, so we construct the component ref ourselves.
2078 compRef = ref.makeComponentRef(component)
2079 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
2080 artifacts.append((compRef, storedInfo))
2081 else:
2082 # Write the entire thing out
2083 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
2084 artifacts.append((ref, storedInfo))
2086 self._register_datasets(artifacts)
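# Editor's illustrative sketch (not part of the source file): a put can be
# rejected by the datastore's constraints, which ChainedDatastore uses to
# route datasets to the appropriate child datastore. Assumes
# `in_memory_dataset` matches the storage class of `ref`.
try:
    datastore.put(in_memory_dataset, ref)
except DatasetTypeNotSupportedError:
    log.info("Datastore %s rejected dataset type %s", datastore.name, ref.datasetType.name)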
2088 @transactional
2089 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
2090 # At this point can safely remove these datasets from the cache
2091 # to avoid confusion later on. If they are not trashed later
2092 # the cache will simply be refilled.
2093 self.cacheManager.remove_from_cache(ref)
2095 # If we are in trust mode there will be nothing to move to
2096 # the trash table and we will have to try to delete the file
2097 # immediately.
2098 if self.trustGetRequest:
2099 # Try to keep the logic below for a single file trash.
2100 if isinstance(ref, DatasetRef):
2101 refs = {ref}
2102 else:
2103 # Will recreate ref at the end of this branch.
2104 refs = set(ref)
2106 # Determine which datasets are known to datastore directly.
2107 id_to_ref = {ref.getCheckedId(): ref for ref in refs}
2108 existing_ids = self._get_stored_records_associated_with_refs(refs)
2109 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids}
2111 missing = refs - existing_refs
2112 if missing:
2113 # Do an explicit existence check on these refs.
2114 # We only care about the artifacts at this point and not
2115 # the dataset existence.
2116 artifact_existence: Dict[ResourcePath, bool] = {}
2117 _ = self.mexists(missing, artifact_existence)
2118 uris = [uri for uri, exists in artifact_existence.items() if exists]
2120 # FUTURE UPGRADE: Implement a parallelized bulk remove.
2121 log.debug("Removing %d artifacts from datastore that are not known to the datastore records", len(uris))
2122 for uri in uris:
2123 try:
2124 uri.remove()
2125 except Exception as e:
2126 if ignore_errors:
2127 log.debug("Artifact %s could not be removed: %s", uri, e)
2128 continue
2129 raise
2131 # There is no point asking the code below to remove refs we
2132 # know are missing so update it with the list of existing
2133 # records. Try to retain one vs many logic.
2134 if not existing_refs:
2135 # Nothing more to do since none of the datasets were
2136 # known to the datastore record table.
2137 return
2138 ref = list(existing_refs)
2139 if len(ref) == 1:
2140 ref = ref[0]
2142 # Get file metadata and internal metadata
2143 if not isinstance(ref, DatasetRef):
2144 log.debug("Doing multi-dataset trash in datastore %s", self.name)
2145 # Assumed to be an iterable of refs so bulk mode enabled.
2146 try:
2147 self.bridge.moveToTrash(ref)
2148 except Exception as e:
2149 if ignore_errors:
2150 log.warning("Unexpected issue moving multiple datasets to trash: %s", e)
2151 else:
2152 raise
2153 return
2155 log.debug("Trashing dataset %s in datastore %s", ref, self.name)
2157 fileLocations = self._get_dataset_locations_info(ref)
2159 if not fileLocations:
2160 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
2161 if ignore_errors:
2162 log.warning(err_msg)
2163 return
2164 else:
2165 raise FileNotFoundError(err_msg)
2167 for location, storedFileInfo in fileLocations:
2168 if not self._artifact_exists(location): 2168 ↛ 2169line 2168 didn't jump to line 2169
2169 err_msg = (
2170 f"Dataset is known to datastore {self.name} but "
2171 f"associated artifact ({location.uri}) is missing"
2172 )
2173 if ignore_errors:
2174 log.warning(err_msg)
2175 return
2176 else:
2177 raise FileNotFoundError(err_msg)
2179 # Mark dataset as trashed
2180 try:
2181 self.bridge.moveToTrash([ref])
2182 except Exception as e:
2183 if ignore_errors:
2184 log.warning(
2185 "Attempted to mark dataset (%s) to be trashed in datastore %s "
2186 "but encountered an error: %s",
2187 ref,
2188 self.name,
2189 e,
2190 )
2191 pass
2192 else:
2193 raise
2195 @transactional
2196 def emptyTrash(self, ignore_errors: bool = True) -> None:
2197 """Remove all datasets from the trash.
2199 Parameters
2200 ----------
2201 ignore_errors : `bool`
2202 If `True` return without error even if something went wrong.
2203 Problems could occur if another process is simultaneously trying
2204 to delete the same datasets.
2205 """
2206 log.debug("Emptying trash in datastore %s", self.name)
2208 # Context manager will empty trash iff we finish it without raising.
2209 # It will also automatically delete the relevant rows from the
2210 # trash table and the records table.
2211 with self.bridge.emptyTrash(
2212 self._table, record_class=StoredFileInfo, record_column="path"
2213 ) as trash_data:
2214 # Removing the artifacts themselves requires that the files are
2215 # not also associated with refs that are not to be trashed.
2216 # Therefore need to do a query with the file paths themselves
2217 # and return all the refs associated with them. Can only delete
2218 # a file if the refs to be trashed are the only refs associated
2219 # with the file.
2220 # This requires multiple copies of the trashed items
2221 trashed, artifacts_to_keep = trash_data
2223 if artifacts_to_keep is None:
2224 # The bridge is not helping us so have to work it out
2225 # ourselves. This is not going to be as efficient.
2226 trashed = list(trashed)
2228 # The instance check is for mypy since up to this point it
2229 # does not know the type of info.
2230 path_map = self._refs_associated_with_artifacts(
2231 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)]
2232 )
2234 for ref, info in trashed:
2236 # Mypy needs to know this is not the base class
2237 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2239 # Check for mypy
2240 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2242 path_map[info.path].remove(ref.id)
2243 if not path_map[info.path]: 2243 ↛ 2234line 2243 didn't jump to line 2234, because the condition on line 2243 was never false
2244 del path_map[info.path]
2246 artifacts_to_keep = set(path_map)
2248 for ref, info in trashed:
2250 # Should not happen for this implementation but need
2251 # to keep mypy happy.
2252 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}."
2254 # Mypy needs to know this is not the base class
2255 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}"
2257 # Check for mypy
2258 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}"
2260 if info.path in artifacts_to_keep:
2261 # This is a multi-dataset artifact and we are not
2262 # removing all associated refs.
2263 continue
2265 # Only trashed refs still known to datastore will be returned.
2266 location = info.file_location(self.locationFactory)
2268 # Point of no return for this artifact
2269 log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
2270 try:
2271 self._delete_artifact(location)
2272 except FileNotFoundError:
2273 # If the file itself has been deleted there is nothing
2274 # we can do about it. It is possible that trash has
2275 # been run in parallel in another process or someone
2276 # decided to delete the file. It is unlikely to come
2277 # back and so we should still continue with the removal
2278 # of the entry from the trash table. It is also possible
2279 # we removed it in a previous iteration if it was
2280 # a multi-dataset artifact. The delete artifact method
2281 # will log a debug message in this scenario.
2282 # Distinguishing a file that was missing before the trash
2283 # started from one already removed earlier in this trash
2284 # operation is not worth the potential memory cost of
2285 # tracking it.
2286 pass
2287 except Exception as e:
2288 if ignore_errors:
2289 # Use a debug message here even though it's not
2290 # a good situation. In some cases this can be
2291 # caused by a race between user A and user B
2292 # and neither of them has permissions for the
2293 # other's files. Butler does not know about users
2294 # and trash has no idea what collections these
2295 # files were in (without guessing from a path).
2296 log.debug(
2297 "Encountered error removing artifact %s from datastore %s: %s",
2298 location.uri,
2299 self.name,
2300 e,
2301 )
2302 else:
2303 raise
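# Editor's illustrative sketch (not part of the source file): deletion is
# two-phase -- trash() marks the datasets and emptyTrash() removes the file
# artifacts whose associated refs have all been trashed.
datastore.trash(refs)
datastore.emptyTrash(ignore_errors=True)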
2305 @transactional
2306 def transfer_from(
2307 self,
2308 source_datastore: Datastore,
2309 refs: Iterable[DatasetRef],
2310 local_refs: Optional[Iterable[DatasetRef]] = None,
2311 transfer: str = "auto",
2312 artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
2313 ) -> None:
2314 # Docstring inherited
2315 if type(self) is not type(source_datastore):
2316 raise TypeError(
2317 f"Datastore mismatch between this datastore ({type(self)}) and the "
2318 f"source datastore ({type(source_datastore)})."
2319 )
2321 # Be explicit for mypy
2322 if not isinstance(source_datastore, FileDatastore): 2322 ↛ 2323line 2322 didn't jump to line 2323, because the condition on line 2322 was never true
2323 raise TypeError(
2324 "Can only transfer to a FileDatastore from another FileDatastore, not"
2325 f" {type(source_datastore)}"
2326 )
2328 # Stop early if "direct" transfer mode is requested. That would
2329 # require that the URI inside the source datastore should be stored
2330 # directly in the target datastore, which seems unlikely to be useful
2331 # since at any moment the source datastore could delete the file.
2332 if transfer in ("direct", "split"):
2333 raise ValueError(
2334 f"Can not transfer from a source datastore using {transfer} mode since"
2335 " those files are controlled by the other datastore."
2336 )
2338 # Empty existence lookup if none given.
2339 if artifact_existence is None:
2340 artifact_existence = {}
2342 # We will go through the list multiple times so must convert
2343 # generators to lists.
2344 refs = list(refs)
2346 if local_refs is None:
2347 local_refs = refs
2348 else:
2349 local_refs = list(local_refs)
2351 # In order to handle disassembled composites the code works
2352 # at the records level since it can assume that internal APIs
2353 # can be used.
2354 # - If the record already exists in the destination this is assumed
2355 # to be okay.
2356 # - If there is no record but the source and destination URIs are
2357 # identical no transfer is done but the record is added.
2358 # - If the source record refers to an absolute URI currently assume
2359 # that that URI should remain absolute and will be visible to the
2360 # destination butler. May need to have a flag to indicate whether
2361 # the dataset should be transferred. This will only happen if
2362 # the detached Butler has had a local ingest.
2364 # What we really want is all the records in the source datastore
2365 # associated with these refs. Or derived ones if they don't exist
2366 # in the source.
2367 source_records = source_datastore._get_stored_records_associated_with_refs(refs)
2369 # The source dataset_ids are the keys in these records
2370 source_ids = set(source_records)
2371 log.debug("Number of datastore records found in source: %d", len(source_ids))
2373 # The not None check is to appease mypy
2374 requested_ids = set(ref.id for ref in refs if ref.id is not None)
2375 missing_ids = requested_ids - source_ids
2377 # Missing IDs can be okay if that datastore has allowed
2378 # gets based on file existence. Should we transfer what we can
2379 # or complain about it and warn?
2380 if missing_ids and not source_datastore.trustGetRequest: 2380 ↛ 2381line 2380 didn't jump to line 2381, because the condition on line 2380 was never true
2381 raise ValueError(
2382 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
2383 )
2385 # Need to map these missing IDs to a DatasetRef so we can guess
2386 # the details.
2387 if missing_ids:
2388 log.info(
2389 "Number of expected datasets missing from source datastore records: %d out of %d",
2390 len(missing_ids),
2391 len(requested_ids),
2392 )
2393 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids}
2395 # This should be chunked in case we end up having to check
2396 # the file store since we need some log output to show
2397 # progress.
2398 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000):
2399 records = {}
2400 for missing in missing_ids_chunk:
2401 # Ask the source datastore where the missing artifacts
2402 # should be. An execution butler might not know about the
2403 # artifacts even if they are there.
2404 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing])
2405 records[missing] = [info for _, info in expected]
2407 # Call the mexists helper method in case we have not already
2408 # checked these artifacts such that artifact_existence is
2409 # empty. This allows us to benefit from parallelism.
2410 # datastore.mexists() itself does not give us access to the
2411 # derived datastore record.
2412 log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
2413 ref_exists = source_datastore._process_mexists_records(
2414 id_to_ref, records, False, artifact_existence=artifact_existence
2415 )
2417 # Now go through the records and propagate the ones that exist.
2418 location_factory = source_datastore.locationFactory
2419 for missing, record_list in records.items():
2420 # Skip completely if the ref does not exist.
2421 ref = id_to_ref[missing]
2422 if not ref_exists[ref]:
2423 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
2424 continue
2425 # Check for file artifact to decide which parts of a
2426 # disassembled composite do exist. If there is only a
2427 # single record we don't even need to look because it can't
2428 # be a composite and must exist.
2429 if len(record_list) == 1:
2430 dataset_records = record_list
2431 else:
2432 dataset_records = [
2433 record
2434 for record in record_list
2435 if artifact_existence[record.file_location(location_factory).uri]
2436 ]
2437 assert len(dataset_records) > 0, "Disassembled composite should have had some files."
2439 # Rely on source_records being a defaultdict.
2440 source_records[missing].extend(dataset_records)
2442 # See if we already have these records
2443 target_records = self._get_stored_records_associated_with_refs(local_refs)
2445 # The artifacts to register
2446 artifacts = []
2448 # Refs that already exist
2449 already_present = []
2451 # Now can transfer the artifacts
2452 for source_ref, target_ref in zip(refs, local_refs):
2453 if target_ref.id in target_records:
2454 # Already have an artifact for this.
2455 already_present.append(target_ref)
2456 continue
2458 # mypy needs to know these are always resolved refs
2459 for info in source_records[source_ref.getCheckedId()]:
2460 source_location = info.file_location(source_datastore.locationFactory)
2461 target_location = info.file_location(self.locationFactory)
2462 if source_location == target_location: 2462 ↛ 2466line 2462 didn't jump to line 2466, because the condition on line 2462 was never true
2463 # Either the dataset is already in the target datastore
2464 # (which is how execution butler currently runs) or
2465 # it is an absolute URI.
2466 if source_location.pathInStore.isabs():
2467 # Just because we can see the artifact when running
2468 # the transfer doesn't mean it will be generally
2469 # accessible to a user of this butler. For now warn
2470 # but assume it will be accessible.
2471 log.warning(
2472 "Transfer request for an outside-datastore artifact has been found at %s",
2473 source_location,
2474 )
2475 else:
2476 # Need to transfer it to the new location.
2477 # Assume we should always overwrite. If the artifact
2478 # is there this might indicate that a previous transfer
2479 # was interrupted but was not able to be rolled back
2480 # completely (eg pre-emption) so follow Datastore default
2481 # and overwrite.
2482 target_location.uri.transfer_from(
2483 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction
2484 )
2486 artifacts.append((target_ref, info))
2488 self._register_datasets(artifacts)
2490 if already_present:
2491 n_skipped = len(already_present)
2492 log.info(
2493 "Skipped transfer of %d dataset%s already present in datastore",
2494 n_skipped,
2495 "" if n_skipped == 1 else "s",
2496 )
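# Editor's illustrative sketch (not part of the source file): import
# artifacts from another FileDatastore, reusing any existence information
# gathered earlier. Assumes `source_datastore` serves the same refs.
known_artifacts: Dict[ResourcePath, bool] = {}
datastore.transfer_from(source_datastore, refs, transfer="copy", artifact_existence=known_artifacts)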
2498 @transactional
2499 def forget(self, refs: Iterable[DatasetRef]) -> None:
2500 # Docstring inherited.
2501 refs = list(refs)
2502 self.bridge.forget(refs)
2503 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs])
2505 def validateConfiguration(
2506 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
2507 ) -> None:
2508 """Validate some of the configuration for this datastore.
2510 Parameters
2511 ----------
2512 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
2513 Entities to test against this configuration. Can be differing
2514 types.
2515 logFailures : `bool`, optional
2516 If `True`, output a log message for every validation error
2517 detected.
2519 Raises
2520 ------
2521 DatastoreValidationError
2522 Raised if there is a validation problem with a configuration.
2523 All the problems are reported in a single exception.
2525 Notes
2526 -----
2527 This method checks that all the supplied entities have valid file
2528 templates and also have formatters defined.
2529 """
2531 templateFailed = None
2532 try:
2533 self.templates.validateTemplates(entities, logFailures=logFailures)
2534 except FileTemplateValidationError as e:
2535 templateFailed = str(e)
2537 formatterFailed = []
2538 for entity in entities:
2539 try:
2540 self.formatterFactory.getFormatterClass(entity)
2541 except KeyError as e:
2542 formatterFailed.append(str(e))
2543 if logFailures: 2543 ↛ 2538line 2543 didn't jump to line 2538, because the condition on line 2543 was never false
2544 log.critical("Formatter failure: %s", e)
2546 if templateFailed or formatterFailed:
2547 messages = []
2548 if templateFailed: 2548 ↛ 2549line 2548 didn't jump to line 2549, because the condition on line 2548 was never true
2549 messages.append(templateFailed)
2550 if formatterFailed: 2550 ↛ 2552line 2550 didn't jump to line 2552, because the condition on line 2550 was never false
2551 messages.append(",".join(formatterFailed))
2552 msg = ";\n".join(messages)
2553 raise DatastoreValidationError(msg)
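# Editor's illustrative sketch (not part of the source file): validate
# templates and formatters for a set of entities before relying on them.
# Assumes `dataset_types` is an iterable of DatasetType objects.
try:
    datastore.validateConfiguration(dataset_types, logFailures=True)
except DatastoreValidationError as e:
    log.error("Datastore configuration problems:\n%s", e)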
2555 def getLookupKeys(self) -> Set[LookupKey]:
2556 # Docstring is inherited from base class
2557 return (
2558 self.templates.getLookupKeys()
2559 | self.formatterFactory.getLookupKeys()
2560 | self.constraints.getLookupKeys()
2561 )
2563 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
2564 # Docstring is inherited from base class
2565 # The key can be valid in either formatters or templates so we can
2566 # only check the template if it exists
2567 if lookupKey in self.templates:
2568 try:
2569 self.templates[lookupKey].validateTemplate(entity)
2570 except FileTemplateValidationError as e:
2571 raise DatastoreValidationError(e) from e
2573 def export(
2574 self,
2575 refs: Iterable[DatasetRef],
2576 *,
2577 directory: Optional[ResourcePathExpression] = None,
2578 transfer: Optional[str] = "auto",
2579 ) -> Iterable[FileDataset]:
2580 # Docstring inherited from Datastore.export.
2581 if transfer is not None and directory is None: 2581 ↛ 2582line 2581 didn't jump to line 2582, because the condition on line 2581 was never true
2582 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given")
2584 # Force the directory to be a URI object
2585 directoryUri: Optional[ResourcePath] = None
2586 if directory is not None: 2586 ↛ 2589line 2586 didn't jump to line 2589, because the condition on line 2586 was never false
2587 directoryUri = ResourcePath(directory, forceDirectory=True)
2589 if transfer is not None and directoryUri is not None: 2589 ↛ 2594line 2589 didn't jump to line 2594, because the condition on line 2589 was never false
2590 # mypy needs the second test
2591 if not directoryUri.exists(): 2591 ↛ 2592line 2591 didn't jump to line 2592, because the condition on line 2591 was never true
2592 raise FileNotFoundError(f"Export location {directory} does not exist")
2594 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG)
2595 for ref in progress.wrap(refs, "Exporting dataset files"):
2596 fileLocations = self._get_dataset_locations_info(ref)
2597 if not fileLocations: 2597 ↛ 2598line 2597 didn't jump to line 2598, because the condition on line 2597 was never true
2598 raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
2599 # For now we can not export disassembled datasets
2600 if len(fileLocations) > 1:
2601 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
2602 location, storedFileInfo = fileLocations[0]
2604 pathInStore = location.pathInStore.path
2605 if transfer is None: 2605 ↛ 2609line 2605 didn't jump to line 2609, because the condition on line 2605 was never true
2606 # TODO: do we also need to return the readStorageClass somehow?
2607 # We will use the path in store directly. If this is an
2608 # absolute URI, preserve it.
2609 if location.pathInStore.isabs():
2610 pathInStore = str(location.uri)
2611 elif transfer == "direct": 2611 ↛ 2613line 2611 didn't jump to line 2613, because the condition on line 2611 was never true
2612 # Use full URIs to the remote store in the export
2613 pathInStore = str(location.uri)
2614 else:
2615 # mypy needs help
2616 assert directoryUri is not None, "directoryUri must be defined to get here"
2617 storeUri = ResourcePath(location.uri)
2619 # if the datastore has an absolute URI to a resource, we
2620 # have two options:
2621 # 1. Keep the absolute URI in the exported YAML
2622 # 2. Allocate a new name in the local datastore and transfer
2623 # it.
2624 # For now go with option 2
2625 if location.pathInStore.isabs(): 2625 ↛ 2626line 2625 didn't jump to line 2626, because the condition on line 2625 was never true
2626 template = self.templates.getTemplate(ref)
2627 newURI = ResourcePath(template.format(ref), forceAbsolute=False)
2628 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension()))
2630 exportUri = directoryUri.join(pathInStore)
2631 exportUri.transfer_from(storeUri, transfer=transfer)
2633 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter)
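# Editor's illustrative sketch (not part of the source file; the directory is
# hypothetical): export file artifacts for later ingest elsewhere. The
# generator must be consumed for the transfers to happen.
file_datasets = list(datastore.export(refs, directory="/tmp/butler_export", transfer="copy"))
log.info("Exported %d file datasets", len(file_datasets))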
2635 @staticmethod
2636 def computeChecksum(
2637 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192
2638 ) -> Optional[str]:
2639 """Compute the checksum of the supplied file.
2641 Parameters
2642 ----------
2643 uri : `lsst.resources.ResourcePath`
2644 Name of resource to calculate checksum from.
2645 algorithm : `str`, optional
2646 Name of algorithm to use. Must be one of the algorithms supported
2647 by :py:mod:`hashlib`.
2648 block_size : `int`
2649 Number of bytes to read from file at one time.
2651 Returns
2652 -------
2653 hexdigest : `str` or `None`
2654 Hex digest of the file.
2656 Notes
2657 -----
2658 Currently returns None if the URI is for a remote resource.
2659 """
2660 if algorithm not in hashlib.algorithms_guaranteed: 2660 ↛ 2661line 2660 didn't jump to line 2661, because the condition on line 2660 was never true
2661 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
2663 if not uri.isLocal: 2663 ↛ 2664line 2663 didn't jump to line 2664, because the condition on line 2663 was never true
2664 return None
2666 hasher = hashlib.new(algorithm)
2668 with uri.as_local() as local_uri:
2669 with open(local_uri.ospath, "rb") as f:
2670 for chunk in iter(lambda: f.read(block_size), b""):
2671 hasher.update(chunk)
2673 return hasher.hexdigest()
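# Editor's illustrative sketch (not part of the source file; the path is
# hypothetical): computeChecksum is a staticmethod and only works on local
# resources, returning `None` for remote URIs.
digest = FileDatastore.computeChecksum(ResourcePath("/tmp/example.fits"), algorithm="sha256")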
2675 def needs_expanded_data_ids(
2676 self,
2677 transfer: Optional[str],
2678 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
2679 ) -> bool:
2680 # Docstring inherited.
2681 # This _could_ also use entity to inspect whether the filename template
2682 # involves placeholders other than the required dimensions for its
2683 # dataset type, but that's not necessary for correctness; it just
2684 # enables more optimizations (perhaps only in theory).
2685 return transfer not in ("direct", None)