# python/lsst/daf/butler/datastores/file_datastore/get.py

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = (
    "DatastoreFileGetInformation",
    "DatasetLocationInformation",
    "generate_datastore_get_information",
    "get_dataset_as_python_object_from_get_info",
)

from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any, TypeAlias

from lsst.daf.butler import DatasetRef, FileDescriptor, Formatter, Location, StorageClass
from lsst.daf.butler.datastore.cache_manager import AbstractDatastoreCacheManager
from lsst.daf.butler.datastore.generic_base import post_process_get
from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo
from lsst.utils.introspection import get_instance_of
from lsst.utils.logging import getLogger
from lsst.utils.timer import time_this

log = getLogger(__name__)

DatasetLocationInformation: TypeAlias = tuple[Location, StoredFileInfo]


@dataclass(frozen=True)
class DatastoreFileGetInformation:
    """Collection of useful parameters needed to retrieve a file from
    a Datastore.
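
    Examples
    --------
    Instances are normally produced by `generate_datastore_get_information`
    rather than constructed by hand; a minimal sketch of consuming one
    (``get_info`` is assumed to be such an instance):

    .. code-block:: python

        formatter = get_info.formatter
        component = get_info.component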

56 """ 

57 

58 location: Location 

59 """The location from which to read the dataset.""" 

60 

61 formatter: Formatter 

62 """The `Formatter` to use to deserialize the dataset.""" 

63 

64 info: StoredFileInfo 

65 """Stored information about this file and its formatter.""" 

66 

67 assemblerParams: Mapping[str, Any] 

68 """Parameters to use for post-processing the retrieved dataset.""" 

69 

70 formatterParams: Mapping[str, Any] 

71 """Parameters that were understood by the associated formatter.""" 

72 

73 component: str | None 

74 """The component to be retrieved (can be `None`).""" 

75 

76 readStorageClass: StorageClass 

77 """The `StorageClass` of the dataset being read.""" 

78 

79 

80def generate_datastore_get_information( 

81 fileLocations: list[DatasetLocationInformation], 

82 *, 

83 ref: DatasetRef, 

84 parameters: Mapping[str, Any] | None, 

85 readStorageClass: StorageClass | None = None, 

86) -> list[DatastoreFileGetInformation]: 

87 """Process parameters and instantiate formatters for in preparation for 

88 retrieving an artifact and converting it to a Python object. 

89 

90 Parameters 

91 ---------- 

92 fileLocations : `list`[`DatasetLocationInformation`] 

93 List of file locations for this artifact and their associated datastore 

94 records. 

95 ref : `DatasetRef` 

96 The registry information associated with this artifact. 

97 parameters : `Mapping`[`str`, `Any`] 

98 `StorageClass` and `Formatter` parameters. 

99 readStorageClass : `StorageClass` | `None`, optional 

100 The StorageClass to use when ultimately returning the resulting object 

101 from the get. Defaults to the `StorageClass` specified by ``ref``. 

102 

103 Returns 

104 ------- 

105 getInfo : `list` [`DatastoreFileGetInformation`] 

106 The parameters needed to retrieve each file. 
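
    Examples
    --------
    A minimal sketch of typical use; the ``location``, ``stored_file_info``
    and ``ref`` objects are assumed to already exist and are not constructed
    here:

    .. code-block:: python

        get_info = generate_datastore_get_information(
            [(location, stored_file_info)],
            ref=ref,
            parameters=None,
        )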

107 """ 

108 if readStorageClass is None: 

109 readStorageClass = ref.datasetType.storageClass 

110 

111 # Is this a component request? 

112 refComponent = ref.datasetType.component() 

113 

114 disassembled = len(fileLocations) > 1 

115 fileGetInfo = [] 

116 for location, storedFileInfo in fileLocations: 

117 # The storage class used to write the file 

118 writeStorageClass = storedFileInfo.storageClass 

119 

120 # If this has been disassembled we need read to match the write 

121 if disassembled: 

122 readStorageClass = writeStorageClass 

123 

124 formatter = get_instance_of( 

125 storedFileInfo.formatter, 

126 FileDescriptor( 

127 location, 

128 readStorageClass=readStorageClass, 

129 storageClass=writeStorageClass, 

130 parameters=parameters, 

131 ), 

132 ref.dataId, 

133 ) 

134 

135 formatterParams, notFormatterParams = formatter.segregateParameters() 

136 

137 # Of the remaining parameters, extract the ones supported by 

138 # this StorageClass (for components not all will be handled) 

139 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

140 

141 # The ref itself could be a component if the dataset was 

142 # disassembled by butler, or we disassembled in datastore and 

143 # components came from the datastore records 

144 component = storedFileInfo.component if storedFileInfo.component else refComponent 

145 

146 fileGetInfo.append( 

147 DatastoreFileGetInformation( 

148 location, 

149 formatter, 

150 storedFileInfo, 

151 assemblerParams, 

152 formatterParams, 

153 component, 

154 readStorageClass, 

155 ) 

156 ) 

157 

158 return fileGetInfo 

159 

160 

161def _read_artifact_into_memory( 

162 getInfo: DatastoreFileGetInformation, 

163 ref: DatasetRef, 

164 cache_manager: AbstractDatastoreCacheManager, 

165 isComponent: bool = False, 

166 cache_ref: DatasetRef | None = None, 

167) -> Any: 

168 """Read the artifact from datastore into in memory object. 

169 

170 Parameters 

171 ---------- 

172 getInfo : `DatastoreFileGetInformation` 

173 Information about the artifact within the datastore. 

174 ref : `DatasetRef` 

175 The registry information associated with this artifact. 

176 isComponent : `bool` 

177 Flag to indicate if a component is being read from this artifact. 

178 cache_manager : `AbstractDatastoreCacheManager` 

179 The cache manager to use for caching retrieved files 

180 cache_ref : `DatasetRef`, optional 

181 The DatasetRef to use when looking up the file in the cache. 

182 This ref must have the same ID as the supplied ref but can 

183 be a parent ref or component ref to indicate to the cache whether 

184 a composite file is being requested from the cache or a component 

185 file. Without this the cache will default to the supplied ref but 

186 it can get confused with read-only derived components for 

187 disassembled composites. 

188 

189 Returns 

190 ------- 

191 inMemoryDataset : `object` 

192 The artifact as a python object. 
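
    Examples
    --------
    A sketch of a component read from an assembled composite, where the
    cache is keyed on the composite ref; ``get_info``, ``ref`` and
    ``cache_manager`` are assumed to already exist:

    .. code-block:: python

        component_object = _read_artifact_into_memory(
            get_info,
            ref,
            cache_manager,
            isComponent=True,
            cache_ref=ref.makeCompositeRef(),
        )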

193 """ 

194 location = getInfo.location 

195 uri = location.uri 

196 log.debug("Accessing data from %s", uri) 

197 

198 if cache_ref is None: 

199 cache_ref = ref 

200 if cache_ref.id != ref.id: 

201 raise ValueError( 

202 "The supplied cache dataset ref refers to a different dataset than expected:" 

203 f" {ref.id} != {cache_ref.id}" 

204 ) 

205 

206 # Cannot recalculate checksum but can compare size as a quick check 

207 # Do not do this if the size is negative since that indicates 

208 # we do not know. 

209 recorded_size = getInfo.info.file_size 

210 

211 def check_resource_size(resource_size: int) -> None: 

212 if recorded_size >= 0 and resource_size != recorded_size: 

213 raise RuntimeError( 

214 "Integrity failure in Datastore. " 

215 f"Size of file {uri} ({resource_size}) " 

216 f"does not match size recorded in registry of {recorded_size}" 

217 ) 

218 

219 # For the general case we have choices for how to proceed. 

220 # 1. Always use a local file (downloading the remote resource to a 

221 # temporary file if needed). 

222 # 2. Use a threshold size and read into memory and use bytes. 

223 # Use both for now with an arbitrary hand off size. 

224 # This allows small datasets to be downloaded from remote object 

225 # stores without requiring a temporary file. 

226 

227 formatter = getInfo.formatter 

228 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

229 if recorded_size >= 0 and recorded_size <= nbytes_max and formatter.can_read_bytes(): 

230 with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

231 if cached_file is not None: 

232 desired_uri = cached_file 

233 msg = f" (cached version of {uri})" 

234 else: 

235 desired_uri = uri 

236 msg = "" 

237 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

238 serializedDataset = desired_uri.read() 

239 check_resource_size(len(serializedDataset)) 

240 log.debug( 

241 "Deserializing %s from %d bytes from location %s with formatter %s", 

242 f"component {getInfo.component}" if isComponent else "", 

243 len(serializedDataset), 

244 uri, 

245 formatter.name(), 

246 ) 

247 try: 

248 result = formatter.fromBytes( 

249 serializedDataset, component=getInfo.component if isComponent else None 

250 ) 

251 except Exception as e: 

252 raise ValueError( 

253 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

254 f" ({ref.datasetType.name} from {uri}): {e}" 

255 ) from e 

256 else: 

257 # Read from file. 

258 

259 # Have to update the Location associated with the formatter 

260 # because formatter.read does not allow an override. 

261 # This could be improved. 

262 location_updated = False 

263 msg = "" 

264 

265 # First check in cache for local version. 

266 # The cache will only be relevant for remote resources but 

267 # no harm in always asking. Context manager ensures that cache 

268 # file is not deleted during cache expiration. 

269 with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

270 if cached_file is not None: 

271 msg = f"(via cache read of remote file {uri})" 

272 uri = cached_file 

273 location_updated = True 

274 

275 with uri.as_local() as local_uri: 

276 check_resource_size(local_uri.size()) 

277 can_be_cached = False 

278 if uri != local_uri: 

279 # URI was remote and file was downloaded 

280 cache_msg = "" 

281 location_updated = True 

282 

283 if cache_manager.should_be_cached(cache_ref): 

284 # In this scenario we want to ask if the downloaded 

285 # file should be cached but we should not cache 

286 # it until after we've used it (to ensure it can't 

287 # be expired whilst we are using it). 

288 can_be_cached = True 

289 

290 # Say that it is "likely" to be cached because 

291 # if the formatter read fails we will not be 

292 # caching this file. 

293 cache_msg = " and likely cached" 

294 

295 msg = f"(via download to local file{cache_msg})" 

296 

297 # Calculate the (possibly) new location for the formatter 

298 # to use. 

299 newLocation = Location(*local_uri.split()) if location_updated else None 

300 

301 log.debug( 

302 "Reading%s from location %s %s with formatter %s", 

303 f" component {getInfo.component}" if isComponent else "", 

304 uri, 

305 msg, 

306 formatter.name(), 

307 ) 

308 try: 

309 with ( 

310 formatter._updateLocation(newLocation), 

311 time_this( 

312 log, 

313 msg="Reading%s from location %s %s with formatter %s", 

314 args=( 

315 f" component {getInfo.component}" if isComponent else "", 

316 uri, 

317 msg, 

318 formatter.name(), 

319 ), 

320 ), 

321 ): 

322 result = formatter.read(component=getInfo.component if isComponent else None) 

323 except Exception as e: 

324 raise ValueError( 

325 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

326 f" ({ref.datasetType.name} from {uri}): {e}" 

327 ) from e 

328 

329 # File was read successfully so can move to cache 

330 if can_be_cached: 

331 cache_manager.move_to_cache(local_uri, cache_ref) 

332 

333 return post_process_get( 

334 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

335 ) 

336 

337 

def get_dataset_as_python_object_from_get_info(
    allGetInfo: list[DatastoreFileGetInformation],
    *,
    ref: DatasetRef,
    parameters: Mapping[str, Any] | None,
    cache_manager: AbstractDatastoreCacheManager,
) -> Any:
345 """Retrieve an artifact from storage and return it as a Python object. 

346 

347 Parameters 

348 ---------- 

349 allGetInfo : `list`[`DatastoreFileGetInformation`] 

350 Pre-processed information about each file associated with this 

351 artifact. 

352 ref : `DatasetRef` 

353 The registry information associated with this artifact. 

354 parameters : `Mapping`[`str`, `Any`] 

355 `StorageClass` and `Formatter` parameters. 

356 cache_manager : `AbstractDatastoreCacheManager` 

357 The cache manager to use for caching retrieved files. 

358 

359 Returns 

360 ------- 

361 python_object : `typing.Any` 

362 The retrieved artifact, converted to a Python object according to the 

363 `StorageClass` specified in ``ref``. 
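
    Examples
    --------
    A sketch of the full retrieval flow, pairing this function with
    `generate_datastore_get_information`; ``file_locations``, ``ref`` and
    ``cache_manager`` are assumed to already exist:

    .. code-block:: python

        all_get_info = generate_datastore_get_information(
            file_locations,
            ref=ref,
            parameters=None,
        )
        python_object = get_dataset_as_python_object_from_get_info(
            all_get_info,
            ref=ref,
            parameters=None,
            cache_manager=cache_manager,
        )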

364 """ 

365 refStorageClass = ref.datasetType.storageClass 

366 refComponent = ref.datasetType.component() 

367 # Create mapping from component name to related info 

368 allComponents = {i.component: i for i in allGetInfo} 

369 

370 # By definition the dataset is disassembled if we have more 

371 # than one record for it. 

372 isDisassembled = len(allGetInfo) > 1 

373 

374 # Look for the special case where we are disassembled but the 

375 # component is a derived component that was not written during 

376 # disassembly. For this scenario we need to check that the 

377 # component requested is listed as a derived component for the 

378 # composite storage class 

379 isDisassembledReadOnlyComponent = False 

380 if isDisassembled and refComponent: 

381 # The composite storage class should be accessible through 

382 # the component dataset type 

383 compositeStorageClass = ref.datasetType.parentStorageClass 

384 

385 # In the unlikely scenario where the composite storage 

386 # class is not known, we can only assume that this is a 

387 # normal component. If that assumption is wrong then the 

388 # branch below that reads a persisted component will fail 

389 # so there is no need to complain here. 

390 if compositeStorageClass is not None: 

391 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

392 

393 if isDisassembled and not refComponent: 

394 # This was a disassembled dataset spread over multiple files 

395 # and we need to put them all back together again. 

396 # Read into memory and then assemble 

397 

398 # Check that the supplied parameters are suitable for the type read 

399 refStorageClass.validateParameters(parameters) 

400 

401 # We want to keep track of all the parameters that were not used 

402 # by formatters. We assume that if any of the component formatters 

403 # use a parameter that we do not need to apply it again in the 

404 # assembler. 

405 usedParams = set() 

406 

407 components: dict[str, Any] = {} 

408 for getInfo in allGetInfo: 

409 # assemblerParams are parameters not understood by the 

410 # associated formatter. 

411 usedParams.update(set(getInfo.formatterParams)) 

412 

413 component = getInfo.component 

414 

415 if component is None: 

416 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

417 

418 # We do not want the formatter to think it's reading 

419 # a component though because it is really reading a 

420 # standalone dataset -- always tell reader it is not a 

421 # component. 

422 components[component] = _read_artifact_into_memory( 

423 getInfo, ref.makeComponentRef(component), cache_manager, isComponent=False 

424 ) 

425 

426 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

427 

428 # Any unused parameters will have to be passed to the assembler 

429 if parameters: 

430 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

431 else: 

432 unusedParams = {} 

433 

434 # Process parameters 

435 return ref.datasetType.storageClass.delegate().handleParameters( 

436 inMemoryDataset, parameters=unusedParams 

437 ) 

438 

    elif isDisassembledReadOnlyComponent:
        compositeStorageClass = ref.datasetType.parentStorageClass
        if compositeStorageClass is None:
            raise RuntimeError(
                f"Unable to retrieve derived component '{refComponent}' since"
                " no composite storage class is available."
            )

        if refComponent is None:
            # Mainly for mypy
            raise RuntimeError("Internal error in datastore: component can not be None here")

        # Assume that every derived component can be calculated by
        # forwarding the request to a single read/write component.
        # Rather than guessing which rw component is the right one by
        # scanning each for a derived component of the same name,
        # we ask the storage class delegate directly which one is best to
        # use.
        compositeDelegate = compositeStorageClass.delegate()
        forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, set(allComponents))

        # Select the relevant component
        rwInfo = allComponents[forwardedComponent]

        # For now assume that read parameters are validated against
        # the real component and not the requested component
        forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass
        forwardedStorageClass.validateParameters(parameters)

        # The reference to use for the caching must refer to the forwarded
        # component and not the derived component.
        cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent)

        # Unfortunately the FileDescriptor inside the formatter will have
        # the wrong write storage class so we need to create a new one
        # given the immutability constraint.
        writeStorageClass = rwInfo.info.storageClass

        # We may need to put some thought into parameters for read
        # components but for now forward them on as is
        readFormatter = type(rwInfo.formatter)(
            FileDescriptor(
                rwInfo.location,
                readStorageClass=refStorageClass,
                storageClass=writeStorageClass,
                parameters=parameters,
            ),
            ref.dataId,
        )

        # The assembler can not receive any parameter requests for a
        # derived component at this time since the assembler will
        # see the storage class of the derived component and those
        # parameters will have to be handled by the formatter on the
        # forwarded storage class.
        assemblerParams: dict[str, Any] = {}

        # Need to create a new info that specifies the derived
        # component and associated storage class
        readInfo = DatastoreFileGetInformation(
            rwInfo.location,
            readFormatter,
            rwInfo.info,
            assemblerParams,
            {},
            refComponent,
            refStorageClass,
        )

        return _read_artifact_into_memory(readInfo, ref, cache_manager, isComponent=True, cache_ref=cache_ref)

    else:
        # Single file request or component from that composite file
        for lookup in (refComponent, None):
            if lookup in allComponents:
                getInfo = allComponents[lookup]
                break
        else:
            raise FileNotFoundError(f"Component {refComponent} not found for ref {ref} in datastore")

        # Do not need the component itself if already disassembled
        if isDisassembled:
            isComponent = False
        else:
            isComponent = getInfo.component is not None

        # For a component read of a composite we want the cache to
        # be looking at the composite ref itself.
        cache_ref = ref.makeCompositeRef() if isComponent else ref

        # For a disassembled component we can validate parameters against
        # the component storage class directly
        if isDisassembled:
            refStorageClass.validateParameters(parameters)
        else:
            # For an assembled composite this could be a derived
            # component derived from a real component. The validity
            # of the parameters is not clear. For now validate against
            # the composite storage class
            getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters)

        return _read_artifact_into_memory(
            getInfo, ref, cache_manager, isComponent=isComponent, cache_ref=cache_ref
        )