Coverage for python/lsst/daf/butler/datastores/file_datastore/get.py: 15%
156 statements
coverage.py v7.4.4, created at 2024-03-30 09:59 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
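
"""Support for retrieving a dataset artifact from a file datastore and
converting it into an in-memory Python object.
"""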

__all__ = (
    "DatastoreFileGetInformation",
    "DatasetLocationInformation",
    "generate_datastore_get_information",
    "get_dataset_as_python_object_from_get_info",
)

from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any, TypeAlias

from lsst.daf.butler import DatasetRef, FileDescriptor, Formatter, Location, StorageClass
from lsst.daf.butler.datastore.cache_manager import AbstractDatastoreCacheManager
from lsst.daf.butler.datastore.generic_base import post_process_get
from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo
from lsst.utils.introspection import get_instance_of
from lsst.utils.logging import getLogger
from lsst.utils.timer import time_this

log = getLogger(__name__)

DatasetLocationInformation: TypeAlias = tuple[Location, StoredFileInfo]


@dataclass(frozen=True)
class DatastoreFileGetInformation:
    """Collection of useful parameters needed to retrieve a file from
    a Datastore.
    """

    location: Location
    """The location from which to read the dataset."""

    formatter: Formatter
    """The `Formatter` to use to deserialize the dataset."""

    info: StoredFileInfo
    """Stored information about this file and its formatter."""

    assemblerParams: Mapping[str, Any]
    """Parameters to use for post-processing the retrieved dataset."""

    formatterParams: Mapping[str, Any]
    """Parameters that were understood by the associated formatter."""

    component: str | None
    """The component to be retrieved (can be `None`)."""

    readStorageClass: StorageClass
    """The `StorageClass` of the dataset being read."""


def generate_datastore_get_information(
    fileLocations: list[DatasetLocationInformation],
    *,
    ref: DatasetRef,
    parameters: Mapping[str, Any] | None,
    readStorageClass: StorageClass | None = None,
) -> list[DatastoreFileGetInformation]:
    """Process parameters and instantiate formatters in preparation for
    retrieving an artifact and converting it to a Python object.

    Parameters
    ----------
    fileLocations : `list` [`DatasetLocationInformation`]
        List of file locations for this artifact and their associated
        datastore records.
    ref : `DatasetRef`
        The registry information associated with this artifact.
    parameters : `Mapping` [`str`, `Any`]
        `StorageClass` and `Formatter` parameters.
    readStorageClass : `StorageClass` or `None`, optional
        The `StorageClass` to use when ultimately returning the resulting
        object from the get. Defaults to the `StorageClass` specified by
        ``ref``.

    Returns
    -------
    getInfo : `list` [`DatastoreFileGetInformation`]
        The parameters needed to retrieve each file.
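
    Examples
    --------
    A minimal sketch of the expected call pattern, assuming
    ``file_locations`` (a `list` of `DatasetLocationInformation`) and
    ``dataset_ref`` have already been obtained from the datastore records;
    illustrative only::

        get_info = generate_datastore_get_information(
            file_locations, ref=dataset_ref, parameters=None
        )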
    """
    if readStorageClass is None:
        readStorageClass = ref.datasetType.storageClass

    # Is this a component request?
    refComponent = ref.datasetType.component()

    disassembled = len(fileLocations) > 1
    fileGetInfo = []
    for location, storedFileInfo in fileLocations:
        # The storage class used to write the file.
        writeStorageClass = storedFileInfo.storageClass

        # If this has been disassembled we need the read storage class
        # to match the write storage class.
        if disassembled:
            readStorageClass = writeStorageClass

        formatter = get_instance_of(
            storedFileInfo.formatter,
            FileDescriptor(
                location,
                readStorageClass=readStorageClass,
                storageClass=writeStorageClass,
                parameters=parameters,
            ),
            ref.dataId,
        )

        formatterParams, notFormatterParams = formatter.segregateParameters()

        # Of the remaining parameters, extract the ones supported by
        # this StorageClass (for components not all will be handled).
        assemblerParams = readStorageClass.filterParameters(notFormatterParams)

        # The ref itself could be a component if the dataset was
        # disassembled by butler, or we disassembled in datastore and
        # components came from the datastore records.
        component = storedFileInfo.component if storedFileInfo.component else refComponent

        fileGetInfo.append(
            DatastoreFileGetInformation(
                location,
                formatter,
                storedFileInfo,
                assemblerParams,
                formatterParams,
                component,
                readStorageClass,
            )
        )

    return fileGetInfo


def _read_artifact_into_memory(
    getInfo: DatastoreFileGetInformation,
    ref: DatasetRef,
    cache_manager: AbstractDatastoreCacheManager,
    isComponent: bool = False,
    cache_ref: DatasetRef | None = None,
) -> Any:
    """Read the artifact from the datastore into an in-memory object.

    Parameters
    ----------
    getInfo : `DatastoreFileGetInformation`
        Information about the artifact within the datastore.
    ref : `DatasetRef`
        The registry information associated with this artifact.
    cache_manager : `AbstractDatastoreCacheManager`
        The cache manager to use for caching retrieved files.
    isComponent : `bool`, optional
        Flag to indicate if a component is being read from this artifact.
    cache_ref : `DatasetRef`, optional
        The `DatasetRef` to use when looking up the file in the cache.
        This ref must have the same ID as the supplied ref but can
        be a parent ref or component ref to indicate to the cache whether
        a composite file is being requested from the cache or a component
        file. Without this the cache will default to the supplied ref but
        it can get confused with read-only derived components for
        disassembled composites.

    Returns
    -------
    inMemoryDataset : `object`
        The artifact as a python object.
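
    Notes
    -----
    As a rough sketch of the strategy implemented below: artifacts whose
    recorded size is known and at most 10 MB, and whose formatter supports
    reading from bytes, are read directly into memory; anything else is
    materialized as a local file (from the cache or by downloading) and
    handed to the formatter's file-based reader.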
    """
    location = getInfo.location
    uri = location.uri
    log.debug("Accessing data from %s", uri)

    if cache_ref is None:
        cache_ref = ref
    if cache_ref.id != ref.id:
        raise ValueError(
            "The supplied cache dataset ref refers to a different dataset than expected:"
            f" {ref.id} != {cache_ref.id}"
        )

    # Cannot recalculate the checksum, but we can compare the size as a
    # quick check. Do not do this if the size is negative, since that
    # indicates we do not know it.
    recorded_size = getInfo.info.file_size

    def check_resource_size(resource_size: int) -> None:
        """Raise if the given size disagrees with the recorded size."""
        if recorded_size >= 0 and resource_size != recorded_size:
            raise RuntimeError(
                "Integrity failure in Datastore. "
                f"Size of file {uri} ({resource_size}) "
                f"does not match size recorded in registry of {recorded_size}"
            )

    # For the general case we have choices for how to proceed.
    # 1. Always use a local file (downloading the remote resource to a
    #    temporary file if needed).
    # 2. Use a threshold size and read into memory and use bytes.
    # Use both for now with an arbitrary hand-off size.
    # This allows small datasets to be downloaded from remote object
    # stores without requiring a temporary file.

    formatter = getInfo.formatter
    nbytes_max = 10_000_000  # Arbitrary number that we can tune
    if recorded_size >= 0 and recorded_size <= nbytes_max and formatter.can_read_bytes():
        with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
            if cached_file is not None:
                desired_uri = cached_file
                msg = f" (cached version of {uri})"
            else:
                desired_uri = uri
                msg = ""
            with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
                serializedDataset = desired_uri.read()
        check_resource_size(len(serializedDataset))
        log.debug(
            "Deserializing %s from %d bytes from location %s with formatter %s",
            f"component {getInfo.component}" if isComponent else "",
            len(serializedDataset),
            uri,
            formatter.name(),
        )
        try:
            result = formatter.fromBytes(
                serializedDataset, component=getInfo.component if isComponent else None
            )
        except Exception as e:
            raise ValueError(
                f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                f" ({ref.datasetType.name} from {uri}): {e}"
            ) from e
    else:
        # Read from file.

        # Have to update the Location associated with the formatter
        # because formatter.read does not allow an override.
        # This could be improved.
        location_updated = False
        msg = ""

        # First check in cache for local version.
        # The cache will only be relevant for remote resources but
        # no harm in always asking. Context manager ensures that cache
        # file is not deleted during cache expiration.
        with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
            if cached_file is not None:
                msg = f"(via cache read of remote file {uri})"
                uri = cached_file
                location_updated = True

            with uri.as_local() as local_uri:
                check_resource_size(local_uri.size())
                can_be_cached = False
                if uri != local_uri:
                    # URI was remote and file was downloaded.
                    cache_msg = ""
                    location_updated = True

                    if cache_manager.should_be_cached(cache_ref):
                        # In this scenario we want to ask if the downloaded
                        # file should be cached but we should not cache
                        # it until after we've used it (to ensure it can't
                        # be expired whilst we are using it).
                        can_be_cached = True

                        # Say that it is "likely" to be cached because
                        # if the formatter read fails we will not be
                        # caching this file.
                        cache_msg = " and likely cached"

                    msg = f"(via download to local file{cache_msg})"

                # Calculate the (possibly) new location for the formatter
                # to use.
                newLocation = Location(*local_uri.split()) if location_updated else None

                log.debug(
                    "Reading%s from location %s %s with formatter %s",
                    f" component {getInfo.component}" if isComponent else "",
                    uri,
                    msg,
                    formatter.name(),
                )

                try:
                    with (
                        formatter._updateLocation(newLocation),
                        time_this(
                            log,
                            msg="Reading%s from location %s %s with formatter %s",
                            args=(
                                f" component {getInfo.component}" if isComponent else "",
                                uri,
                                msg,
                                formatter.name(),
                            ),
                        ),
                    ):
                        result = formatter.read(component=getInfo.component if isComponent else None)
                except Exception as e:
                    raise ValueError(
                        f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                        f" ({ref.datasetType.name} from {uri}): {e}"
                    ) from e

                # File was read successfully so can move to cache.
                if can_be_cached:
                    cache_manager.move_to_cache(local_uri, cache_ref)

    return post_process_get(
        result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent
    )


def get_dataset_as_python_object_from_get_info(
    allGetInfo: list[DatastoreFileGetInformation],
    *,
    ref: DatasetRef,
    parameters: Mapping[str, Any] | None,
    cache_manager: AbstractDatastoreCacheManager,
) -> Any:
    """Retrieve an artifact from storage and return it as a Python object.

    Parameters
    ----------
    allGetInfo : `list` [`DatastoreFileGetInformation`]
        Pre-processed information about each file associated with this
        artifact.
    ref : `DatasetRef`
        The registry information associated with this artifact.
    parameters : `Mapping` [`str`, `Any`]
        `StorageClass` and `Formatter` parameters.
    cache_manager : `AbstractDatastoreCacheManager`
        The cache manager to use for caching retrieved files.

    Returns
    -------
    python_object : `typing.Any`
        The retrieved artifact, converted to a Python object according to the
        `StorageClass` specified in ``ref``.
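
    Examples
    --------
    A minimal sketch of how this pairs with
    `generate_datastore_get_information`; here ``file_locations``,
    ``dataset_ref`` and ``cache_manager`` stand in for objects the caller
    already has (illustrative only)::

        get_info = generate_datastore_get_information(
            file_locations, ref=dataset_ref, parameters=None
        )
        python_object = get_dataset_as_python_object_from_get_info(
            get_info, ref=dataset_ref, parameters=None, cache_manager=cache_manager
        )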
    """
    refStorageClass = ref.datasetType.storageClass
    refComponent = ref.datasetType.component()
    # Create mapping from component name to related info.
    allComponents = {i.component: i for i in allGetInfo}

    # By definition the dataset is disassembled if we have more
    # than one record for it.
    isDisassembled = len(allGetInfo) > 1

    # Look for the special case where we are disassembled but the
    # component is a derived component that was not written during
    # disassembly. For this scenario we need to check that the
    # component requested is listed as a derived component for the
    # composite storage class.
    isDisassembledReadOnlyComponent = False
    if isDisassembled and refComponent:
        # The composite storage class should be accessible through
        # the component dataset type.
        compositeStorageClass = ref.datasetType.parentStorageClass

        # In the unlikely scenario where the composite storage
        # class is not known, we can only assume that this is a
        # normal component. If that assumption is wrong then the
        # branch below that reads a persisted component will fail
        # so there is no need to complain here.
        if compositeStorageClass is not None:
            isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents

    if isDisassembled and not refComponent:
        # This was a disassembled dataset spread over multiple files
        # and we need to put them all back together again.
        # Read into memory and then assemble.

        # Check that the supplied parameters are suitable for the type read.
        refStorageClass.validateParameters(parameters)

        # We want to keep track of all the parameters that were not used
        # by formatters. We assume that if any of the component formatters
        # use a parameter, we do not need to apply it again in the
        # assembler.
        usedParams = set()

        components: dict[str, Any] = {}
        for getInfo in allGetInfo:
            # assemblerParams are parameters not understood by the
            # associated formatter.
            usedParams.update(set(getInfo.formatterParams))

            component = getInfo.component

            if component is None:
                raise RuntimeError(f"Internal error in datastore assembly of {ref}")

            # We do not want the formatter to think it's reading
            # a component though because it is really reading a
            # standalone dataset -- always tell the reader it is not a
            # component.
            components[component] = _read_artifact_into_memory(
                getInfo, ref.makeComponentRef(component), cache_manager, isComponent=False
            )

        inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components)

        # Any unused parameters will have to be passed to the assembler.
        if parameters:
            unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
        else:
            unusedParams = {}

        # Process parameters.
        return ref.datasetType.storageClass.delegate().handleParameters(
            inMemoryDataset, parameters=unusedParams
        )

    elif isDisassembledReadOnlyComponent:
        compositeStorageClass = ref.datasetType.parentStorageClass
        if compositeStorageClass is None:
            raise RuntimeError(
                f"Unable to retrieve derived component '{refComponent}' since"
                " no composite storage class is available."
            )

        if refComponent is None:
            # Mainly for mypy.
            raise RuntimeError("Internal error in datastore: component cannot be None here")

        # Assume that every derived component can be calculated by
        # forwarding the request to a single read/write component.
        # Rather than guessing which rw component is the right one by
        # scanning each for a derived component of the same name,
        # we ask the storage class delegate directly which one is best to
        # use.
        compositeDelegate = compositeStorageClass.delegate()
        forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, set(allComponents))

        # Select the relevant component.
        rwInfo = allComponents[forwardedComponent]

        # For now assume that read parameters are validated against
        # the real component and not the requested component.
        forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
        forwardedStorageClass.validateParameters(parameters)

        # The reference to use for the caching must refer to the forwarded
        # component and not the derived component.
        cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)

        # Unfortunately the FileDescriptor inside the formatter will have
        # the wrong write storage class so we need to create a new one
        # given the immutability constraint.
        writeStorageClass = rwInfo.info.storageClass

        # We may need to put some thought into parameters for read
        # components but for now forward them on as is.
        readFormatter = type(rwInfo.formatter)(
            FileDescriptor(
                rwInfo.location,
                readStorageClass=refStorageClass,
                storageClass=writeStorageClass,
                parameters=parameters,
            ),
            ref.dataId,
        )

        # The assembler cannot receive any parameter requests for a
        # derived component at this time since the assembler will
        # see the storage class of the derived component and those
        # parameters will have to be handled by the formatter on the
        # forwarded storage class.
        assemblerParams: dict[str, Any] = {}

        # Need to create a new info that specifies the derived
        # component and associated storage class.
        readInfo = DatastoreFileGetInformation(
            rwInfo.location,
            readFormatter,
            rwInfo.info,
            assemblerParams,
            {},
            refComponent,
            refStorageClass,
        )

        return _read_artifact_into_memory(readInfo, ref, cache_manager, isComponent=True, cache_ref=cache_ref)

    else:
        # Single file request or component from that composite file.
        for lookup in (refComponent, None):
            if lookup in allComponents:
                getInfo = allComponents[lookup]
                break
        else:
            raise FileNotFoundError(f"Component {refComponent} not found for ref {ref} in datastore")

        # Do not need the component itself if already disassembled.
        if isDisassembled:
            isComponent = False
        else:
            isComponent = getInfo.component is not None

        # For a component read of a composite we want the cache to
        # be looking at the composite ref itself.
        cache_ref = ref.makeCompositeRef() if isComponent else ref

        # For a disassembled component we can validate parameters against
        # the component storage class directly.
        if isDisassembled:
            refStorageClass.validateParameters(parameters)
        else:
            # For an assembled composite this could be a derived
            # component computed from a real component. The validity
            # of the parameters is not clear. For now validate against
            # the composite storage class.
            getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)

        return _read_artifact_into_memory(
            getInfo, ref, cache_manager, isComponent=isComponent, cache_ref=cache_ref
        )