Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 86%

177 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""In-memory datastore.""" 

25 

26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore") 

27 

28import logging 

29import time 

30from dataclasses import dataclass 

31from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union 

32from urllib.parse import urlencode 

33 

34from lsst.daf.butler import DatasetId, DatasetRef, StorageClass, StoredDatastoreItemInfo 

35from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge 

36from lsst.resources import ResourcePath 

37 

38from .genericDatastore import GenericBaseDatastore 

39 

40if TYPE_CHECKING:  [coverage: 40 ↛ 41, condition on line 40 was never true]

41 from lsst.daf.butler import Config, DatasetType, LookupKey 

42 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

43 

44log = logging.getLogger(__name__) 

45 

46 

47@dataclass(frozen=True) 

48class StoredMemoryItemInfo(StoredDatastoreItemInfo): 

49 """Internal InMemoryDatastore Metadata associated with a stored 

50 DatasetRef. 

51 """ 

52 

53 __slots__ = {"timestamp", "storageClass", "parentID"} 

54 

55 timestamp: float 

56 """Unix timestamp indicating the time the dataset was stored.""" 

57 

58 storageClass: StorageClass 

59 """StorageClass associated with the dataset.""" 

60 

61 parentID: DatasetId 

62 """ID of the parent `DatasetRef` if this entry is a concrete 

63 composite. Not used if the dataset being stored is not a 

64 virtual component of a composite. 

65 """ 

66 

67 

68class InMemoryDatastore(GenericBaseDatastore): 

69 """Basic Datastore for writing to an in memory cache. 

70 

71 This datastore is ephemeral in that the contents of the datastore 

72 disappear when the Python process completes. This also means that 

73 other processes cannot access this datastore. 

74 

75 Parameters 

76 ---------- 

77 config : `DatastoreConfig` or `str` 

78 Configuration. 

79 bridgeManager : `DatastoreRegistryBridgeManager` 

80 Object that manages the interface between `Registry` and datastores. 

81 butlerRoot : `str`, optional 

82 Unused parameter. 

83 

84 Notes 

85 ----- 

86 InMemoryDatastore does not support any file-based ingest. 

87 """ 

88 

89 defaultConfigFile = "datastores/inMemoryDatastore.yaml" 

90 """Path to configuration defaults. Accessed within the ``configs`` resource 

91 or relative to a search path. Can be None if no defaults are specified. 

92 """ 

93 

94 isEphemeral = True 

95 """A new datastore is created every time and datasets disappear when 

96 the process shuts down.""" 

97 

98 datasets: Dict[DatasetId, Any] 

99 """Internal storage of datasets indexed by dataset ID.""" 

100 

101 records: Dict[DatasetId, StoredMemoryItemInfo] 

102 """Internal records about stored datasets.""" 

103 

104 def __init__( 

105 self, 

106 config: Union[Config, str], 

107 bridgeManager: DatastoreRegistryBridgeManager, 

108 butlerRoot: Optional[str] = None, 

109 ): 

110 super().__init__(config, bridgeManager) 

111 

112 # Name ourselves with the timestamp the datastore 

113 # was created. 

114 self.name = "{}@{}".format(type(self).__name__, time.time()) 

115 log.debug("Creating datastore %s", self.name) 

116 

117 # Storage of datasets, keyed by dataset_id 

118 self.datasets: Dict[DatasetId, Any] = {} 

119 

120 # Records are kept separate in order to track concrete composite components 

121 # where we register multiple components for a single dataset. 

122 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {} 

123 

124 # Related records that share the same parent 

125 self.related: Dict[DatasetId, Set[DatasetId]] = {} 

126 

127 self._bridge = bridgeManager.register(self.name, ephemeral=True) 

128 
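Illustrative aside (not part of the listed file): the constructor names each instance after its class and creation time (line 114), so every in-memory datastore created in a process gets a distinct, human-readable name. A minimal standard-library sketch of that naming scheme:

import time

class SketchDatastore:
    def __init__(self) -> None:
        # Mirrors the "<class name>@<unix timestamp>" pattern used above.
        self.name = "{}@{}".format(type(self).__name__, time.time())

print(SketchDatastore().name)  # e.g. "SketchDatastore@1700000000.123456"
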

129 @classmethod 

130 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

131 """Set any filesystem-dependent config options for this Datastore to 

132 be appropriate for a new empty repository with the given root. 

133 

134 Does nothing in this implementation. 

135 

136 Parameters 

137 ---------- 

138 root : `str` 

139 Filesystem path to the root of the data repository. 

140 config : `Config` 

141 A `Config` to update. Only the subset understood by 

142 this component will be updated. Will not expand 

143 defaults. 

144 full : `Config` 

145 A complete config with all defaults expanded that can be 

146 converted to a `DatastoreConfig`. Read-only and will not be 

147 modified by this method. 

148 Repository-specific options that should not be obtained 

149 from defaults when Butler instances are constructed 

150 should be copied from ``full`` to ``config``. 

151 overwrite : `bool`, optional 

152 If `False`, do not modify a value in ``config`` if the value 

153 already exists. Default is always to overwrite with the provided 

154 ``root``. 

155 

156 Notes 

157 ----- 

158 If a keyword is explicitly defined in the supplied ``config`` it 

159 will not be overridden by this method if ``overwrite`` is `False`. 

160 This allows explicit values set in external configs to be retained. 

161 """ 

162 return 

163 

164 @property 

165 def bridge(self) -> DatastoreRegistryBridge: 

166 # Docstring inherited from GenericBaseDatastore. 

167 return self._bridge 

168 

169 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None: 

170 # Docstring inherited from GenericBaseDatastore. 

171 for ref, info in zip(refs, infos): 

172 if ref.id is None:  [coverage: 172 ↛ 173, condition on line 172 was never true]

173 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}") 

174 self.records[ref.id] = info 

175 self.related.setdefault(info.parentID, set()).add(ref.id) 

176 
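Illustrative aside (not part of the listed file): addStoredItemInfo keeps two mappings in step, one from each dataset ID to its record and one grouping dataset IDs by their parent ID. A standard-library sketch of that bookkeeping, with plain integers standing in for dataset IDs:

from typing import Dict, Set

records: Dict[int, dict] = {}
related: Dict[int, Set[int]] = {}

def add_record(dataset_id: int, parent_id: int) -> None:
    # Store the per-dataset record and index it under its parent,
    # as lines 174-175 do above.
    records[dataset_id] = {"parentID": parent_id}
    related.setdefault(parent_id, set()).add(dataset_id)

add_record(11, parent_id=10)
add_record(12, parent_id=10)
assert related[10] == {11, 12}
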

177 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo: 

178 # Docstring inherited from GenericBaseDatastore. 

179 if ref.id is None:  [coverage: 179 ↛ 180, condition on line 179 was never true]

180 raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}") 

181 return self.records[ref.id] 

182 

183 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]: 

184 # Docstring inherited from GenericBaseDatastore. 

185 return [self.getStoredItemInfo(ref)] 

186 

187 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

188 # Docstring inherited from GenericBaseDatastore. 

189 # If a component has been removed previously then we can sometimes 

190 # be asked to remove it again. Other datastores ignore this 

191 # so we also ignore it here. 

192 if ref.id is None:  [coverage: 192 ↛ 193, condition on line 192 was never true]

193 raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}") 

194 if ref.id not in self.records: 

195 return 

196 record = self.records[ref.id] 

197 del self.records[ref.id] 

198 self.related[record.parentID].remove(ref.id) 

199 

200 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]: 

201 """Check that the dataset is present and return the real ID and 

202 associated information. 

203 

204 Parameters 

205 ---------- 

206 ref : `DatasetRef` 

207 Target `DatasetRef`. 

208 

209 Returns 

210 ------- 

211 realID : `int` 

212 The dataset ID associated with this ref that should be used. This 

213 could either be the ID of the supplied `DatasetRef` or the parent. 

214 storageInfo : `StoredMemoryItemInfo` 

215 Associated storage information. 

216 

217 Raises 

218 ------ 

219 FileNotFoundError 

220 Raised if the dataset is not present in this datastore. 

221 """ 

222 try: 

223 storedItemInfo = self.getStoredItemInfo(ref) 

224 except KeyError: 

225 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None 

226 realID = ref.id 

227 if storedItemInfo.parentID is not None:  [coverage: 227 ↛ 230, condition on line 227 was never false]

228 realID = storedItemInfo.parentID 

229 

230 if realID not in self.datasets:  [coverage: 230 ↛ 231, condition on line 230 was never true]

231 raise FileNotFoundError(f"No such file dataset in memory: {ref}") 

232 

233 return realID, storedItemInfo 

234 
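Illustrative aside (not part of the listed file): _get_dataset_info resolves a ref to the dataset ID under which the payload was actually stored, preferring the record's parentID when it is set. A plain-dict sketch of that resolution (the IDs are made up):

datasets = {10: {"image": [1, 2, 3]}}   # payload stored under the parent ID
records = {11: {"parentID": 10}}        # a component ref pointing at that parent

ref_id = 11
info = records[ref_id]
real_id = info["parentID"] if info["parentID"] is not None else ref_id
assert real_id in datasets              # otherwise a FileNotFoundError is raised
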

235 def knows(self, ref: DatasetRef) -> bool: 

236 """Check if the dataset is known to the datastore. 

237 

238 This datastore does not distinguish dataset existence from knowledge 

239 of a dataset. 

240 

241 Parameters 

242 ---------- 

243 ref : `DatasetRef` 

244 Reference to the required dataset. 

245 

246 Returns 

247 ------- 

248 exists : `bool` 

249 `True` if the dataset is known to the datastore. 

250 """ 

251 return self.exists(ref) 

252 

253 def exists(self, ref: DatasetRef) -> bool: 

254 """Check if the dataset exists in the datastore. 

255 

256 Parameters 

257 ---------- 

258 ref : `DatasetRef` 

259 Reference to the required dataset. 

260 

261 Returns 

262 ------- 

263 exists : `bool` 

264 `True` if the entity exists in the `Datastore`. 

265 """ 

266 try: 

267 self._get_dataset_info(ref) 

268 except FileNotFoundError: 

269 return False 

270 return True 

271 

272 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

273 """Load an InMemoryDataset from the store. 

274 

275 Parameters 

276 ---------- 

277 ref : `DatasetRef` 

278 Reference to the required Dataset. 

279 parameters : `dict` 

280 `StorageClass`-specific parameters that specify, for example, 

281 a slice of the dataset to be loaded. 

282 

283 Returns 

284 ------- 

285 inMemoryDataset : `object` 

286 Requested dataset or slice thereof as an InMemoryDataset. 

287 

288 Raises 

289 ------ 

290 FileNotFoundError 

291 Requested dataset cannot be retrieved. 

292 TypeError 

293 Return value from formatter has unexpected type. 

294 ValueError 

295 Formatter failed to process the dataset. 

296 """ 

297 

298 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

299 

300 realID, storedItemInfo = self._get_dataset_info(ref) 

301 

302 # We have a write storage class and a read storage class and they 

303 # can be different for concrete composites. 

304 readStorageClass = ref.datasetType.storageClass 

305 writeStorageClass = storedItemInfo.storageClass 

306 

307 component = ref.datasetType.component() 

308 

309 # Check that the supplied parameters are suitable for the type read 

310 # If this is a derived component we validate against the composite 

311 isDerivedComponent = False 

312 if component in writeStorageClass.derivedComponents: 

313 writeStorageClass.validateParameters(parameters) 

314 isDerivedComponent = True 

315 else: 

316 readStorageClass.validateParameters(parameters) 

317 

318 inMemoryDataset = self.datasets[realID] 

319 

320 # If this is a derived component we need to apply parameters 

321 # before we retrieve the component. We assume that the parameters 

322 # will affect the data globally, before the derived component 

323 # is selected. 

324 if isDerivedComponent: 

325 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters) 

326 # Then disable parameters for later 

327 parameters = {} 

328 

329 # Differing storage classes imply a component request 

330 if readStorageClass != writeStorageClass: 

331 

332 if component is None:  [coverage: 332 ↛ 333, condition on line 332 was never true]

333 raise ValueError( 

334 "Storage class inconsistency ({} vs {}) but no" 

335 " component requested".format(readStorageClass.name, writeStorageClass.name) 

336 ) 

337 

338 # Concrete composite written as a single object (we hope) 

339 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component) 

340 

341 # Since there is no formatter to process parameters, they all must be 

342 # passed to the assembler. 

343 return self._post_process_get( 

344 inMemoryDataset, readStorageClass, parameters, isComponent=component is not None 

345 ) 

346 
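Illustrative aside (not part of the listed file): for a derived component, get applies the parameters to the whole stored object first and only afterwards extracts the component (the real work goes through the StorageClass delegate). A dict-based sketch of that ordering, using a made-up "rows" parameter:

composite = {"data": [1, 2, 3, 4], "mask": [0, 0, 1, 0]}
parameters = {"rows": slice(0, 2)}

# Parameters act on the composite as a whole first ...
trimmed = {key: values[parameters["rows"]] for key, values in composite.items()}

# ... and only then is the derived component selected.
component = trimmed["mask"]
assert component == [0, 0]
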

347 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

348 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

349 

350 Parameters 

351 ---------- 

352 inMemoryDataset : `object` 

353 The dataset to store. 

354 ref : `DatasetRef` 

355 Reference to the associated Dataset. 

356 

357 Raises 

358 ------ 

359 TypeError 

360 Supplied object and storage class are inconsistent. 

361 DatasetTypeNotSupportedError 

362 The associated `DatasetType` is not handled by this datastore. 

363 

364 Notes 

365 ----- 

366 If the datastore is configured to reject certain dataset types it 

367 is possible that the put will fail and raise a 

368 `DatasetTypeNotSupportedError`. The main use case for this is to 

369 allow `ChainedDatastore` to put to multiple datastores without 

370 requiring that every datastore accepts the dataset. 

371 """ 

372 

373 if ref.id is None:  [coverage: 373 ↛ 374, condition on line 373 was never true]

374 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}") 

375 

376 # May need to coerce the in memory dataset to the correct 

377 # python type, otherwise parameters may not work. 

378 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

379 

380 self._validate_put_parameters(inMemoryDataset, ref) 

381 

382 self.datasets[ref.id] = inMemoryDataset 

383 log.debug("Store %s in %s", ref, self.name) 

384 

385 # Store time we received this content, to allow us to optionally 

386 # expire it. Instead of storing a filename here, we include the 

387 # ID of this datasetRef so we can find it from components. 

388 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass, parentID=ref.id) 

389 

390 # We have to register this content with registry. 

391 # Currently this assumes we have a file so we need to use stub entries 

392 # TODO: Add to ephemeral part of registry 

393 self._register_datasets([(ref, itemInfo)]) 

394 

395 if self._transaction is not None: 

396 self._transaction.registerUndo("put", self.remove, ref) 

397 
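Illustrative aside (not part of the listed file): when a transaction is active, put registers an undo callback so that rolling back removes the freshly stored dataset again. A standard-library sketch of that registerUndo pattern (all names here are hypothetical):

from typing import Any, Callable, List, Tuple

undo_stack: List[Tuple[str, Callable[..., Any], tuple]] = []

def register_undo(name: str, func: Callable[..., Any], *args: Any) -> None:
    undo_stack.append((name, func, args))

store: dict = {}
store["dataset-1"] = object()                  # the "put"
register_undo("put", store.pop, "dataset-1")   # rolling back removes the entry

for _, func, args in reversed(undo_stack):     # a rollback replays the stack
    func(*args)
assert "dataset-1" not in store
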

398 def getURIs( 

399 self, ref: DatasetRef, predict: bool = False 

400 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

401 """Return URIs associated with dataset. 

402 

403 Parameters 

404 ---------- 

405 ref : `DatasetRef` 

406 Reference to the required dataset. 

407 predict : `bool`, optional 

408 If the datastore does not know about the dataset, should it 

409 return a predicted URI or not? 

410 

411 Returns 

412 ------- 

413 primary : `lsst.resources.ResourcePath` 

414 The URI to the primary artifact associated with this dataset. 

415 If the dataset was disassembled within the datastore this 

416 may be `None`. 

417 components : `dict` 

418 URIs to any components associated with the dataset artifact. 

419 Can be empty if there are no components. 

420 

421 Notes 

422 ----- 

423 The URIs returned for in-memory datastores are not usable but 

424 provide an indication of the associated dataset. 

425 """ 

426 

427 # Include the dataID as a URI query 

428 query = urlencode(ref.dataId) 

429 

430 # if this has never been written then we have to guess 

431 if not self.exists(ref): 

432 if not predict: 

433 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

434 name = f"{ref.datasetType.name}" 

435 fragment = "#predicted" 

436 else: 

437 realID, _ = self._get_dataset_info(ref) 

438 name = f"{id(self.datasets[realID])}?{query}" 

439 fragment = "" 

440 

441 return ResourcePath(f"mem://{name}?{query}{fragment}"), {} 

442 
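Illustrative aside (not part of the listed file): the URIs built above are descriptive only, combining a mem:// scheme, a name, the urlencoded data ID, and a "#predicted" fragment when the dataset has not been written. A stdlib sketch of the predicted form, with a made-up dataset type name and data ID:

from urllib.parse import urlencode

data_id = {"instrument": "HSC", "visit": 903334}     # hypothetical data ID
query = urlencode(data_id)
predicted_uri = f"mem://calexp?{query}#predicted"    # "calexp" is a made-up name
assert predicted_uri == "mem://calexp?instrument=HSC&visit=903334#predicted"
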

443 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

444 """URI to the Dataset. 

445 

446 Always uses "mem://" URI prefix. 

447 

448 Parameters 

449 ---------- 

450 ref : `DatasetRef` 

451 Reference to the required Dataset. 

452 predict : `bool` 

453 If `True`, allow URIs to be returned of datasets that have not 

454 been written. 

455 

456 Returns 

457 ------- 

458 uri : `lsst.resources.ResourcePath` 

459 URI pointing to the dataset within the datastore. If the 

460 dataset does not exist in the datastore, and if ``predict`` is 

461 `True`, the URI will be a prediction and will include a URI 

462 fragment "#predicted". 

463 If the datastore does not have entities that relate well 

464 to the concept of a URI the returned URI string will be 

465 descriptive. The returned URI is not guaranteed to be obtainable. 

466 

467 Raises 

468 ------ 

469 FileNotFoundError 

470 A URI has been requested for a dataset that does not exist and 

471 guessing is not allowed. 

472 AssertionError 

473 Raised if an internal error occurs. 

474 """ 

475 primary, _ = self.getURIs(ref, predict) 

476 if primary is None:  [coverage: 476 ↛ 479, condition on line 476 was never true]

477 # This should be impossible since this datastore does 

478 # not disassemble. This check also helps mypy. 

479 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}") 

480 return primary 

481 

482 def retrieveArtifacts( 

483 self, 

484 refs: Iterable[DatasetRef], 

485 destination: ResourcePath, 

486 transfer: str = "auto", 

487 preserve_path: bool = True, 

488 overwrite: Optional[bool] = False, 

489 ) -> List[ResourcePath]: 

490 """Retrieve the file artifacts associated with the supplied refs. 

491 

492 Notes 

493 ----- 

494 Not implemented by this datastore. 

495 """ 

496 # Could conceivably launch a FileDatastore to use formatters to write 

497 # the data but this is fraught with problems. 

498 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.") 

499 

500 def forget(self, refs: Iterable[DatasetRef]) -> None: 

501 # Docstring inherited. 

502 refs = list(refs) 

503 self._bridge.forget(refs) 

504 for ref in refs: 

505 self.removeStoredItemInfo(ref) 

506 

507 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None: 

508 """Indicate to the Datastore that a dataset can be removed. 

509 

510 Parameters 

511 ---------- 

512 ref : `DatasetRef` or iterable thereof 

513 Reference to the required Dataset(s). 

514 ignore_errors : `bool`, optional 

515 Indicate that errors should be ignored. 

516 

517 Raises 

518 ------ 

519 FileNotFoundError 

520 Attempt to remove a dataset that does not exist. Only relevant 

521 if a single dataset ref is given. 

522 

523 Notes 

524 ----- 

525 Concurrency should not normally be an issue for the in-memory datastore 

526 since all internal changes are isolated to this process and the 

527 registry only changes rows associated with this process. 

528 """ 

529 if not isinstance(ref, DatasetRef): 

530 log.debug("Bulk trashing of datasets in datastore %s", self.name) 

531 self.bridge.moveToTrash(ref) 

532 return 

533 

534 log.debug("Trash %s in datastore %s", ref, self.name) 

535 

536 # Check that this dataset is known to datastore 

537 try: 

538 self._get_dataset_info(ref) 

539 

540 # Move datasets to trash table 

541 self.bridge.moveToTrash([ref]) 

542 except Exception as e: 

543 if ignore_errors:  [coverage: 543 ↛ 544, condition on line 543 was never true]

544 log.warning( 

545 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e 

546 ) 

547 else: 

548 raise 

549 

550 def emptyTrash(self, ignore_errors: bool = False) -> None: 

551 """Remove all datasets from the trash. 

552 

553 Parameters 

554 ---------- 

555 ignore_errors : `bool`, optional 

556 Ignore errors. 

557 

558 Notes 

559 ----- 

560 The internal tracking of datasets is affected by this method and 

561 transaction handling is not supported if there is a problem before 

562 the datasets themselves are deleted. 

563 

564 Concurrency should not normally be an issue for the in-memory datastore 

565 since all internal changes are isolated to this process and the 

566 registry only changes rows associated with this process. 

567 """ 

568 log.debug("Emptying trash in datastore %s", self.name) 

569 with self._bridge.emptyTrash() as trash_data: 

570 trashed, _ = trash_data 

571 for ref, _ in trashed: 

572 try: 

573 realID, _ = self._get_dataset_info(ref) 

574 except FileNotFoundError:  [coverage: 574 ↛ 577, line 574 never jumped to line 577]

575 # Dataset already removed so ignore it 

576 continue 

577 except Exception as e: 

578 if ignore_errors: 

579 log.warning( 

580 "Emptying trash in datastore %s but encountered an error with dataset %s: %s", 

581 self.name, 

582 ref.id, 

583 e, 

584 ) 

585 continue 

586 else: 

587 raise 

588 

589 # Determine whether all references to this dataset have been 

590 # removed and we can delete the dataset itself 

591 allRefs = self.related[realID] 

592 remainingRefs = allRefs - {ref.id} 

593 if not remainingRefs:  [coverage: 593 ↛ 598, condition on line 593 was never false]

594 log.debug("Removing artifact %s from datastore %s", realID, self.name) 

595 del self.datasets[realID] 

596 

597 # Remove this entry 

598 self.removeStoredItemInfo(ref) 

599 
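Illustrative aside (not part of the listed file): emptyTrash only deletes the stored Python object once no other dataset ID in the related index still shares the same parent. A plain sketch of that reference counting (IDs are made up):

datasets = {10: object()}                     # one stored payload
related = {10: {10, 11}}                      # two dataset IDs share it
records = {10: {"parentID": 10}, 11: {"parentID": 10}}

def empty_one(trashed_id: int) -> None:
    parent = records[trashed_id]["parentID"]
    remaining = related[parent] - {trashed_id}
    if not remaining:                         # last reference gone: drop the payload
        del datasets[parent]
    del records[trashed_id]
    related[parent].discard(trashed_id)

empty_one(11)
assert 10 in datasets                         # still referenced by dataset ID 10
empty_one(10)
assert 10 not in datasets                     # payload removed with its last reference
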

600 def validateConfiguration( 

601 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

602 ) -> None: 

603 """Validate some of the configuration for this datastore. 

604 

605 Parameters 

606 ---------- 

607 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

608 Entities to test against this configuration. Can be differing 

609 types. 

610 logFailures : `bool`, optional 

611 If `True`, output a log message for every validation error 

612 detected. 

613 

614 Raises 

615 ------ 

616 DatastoreValidationError 

617 Raised if there is a validation problem with a configuration. 

618 All the problems are reported in a single exception. 

619 

620 Notes 

621 ----- 

622 This method is a no-op. 

623 """ 

624 return 

625 

626 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]: 

627 # Docstring is inherited from base class 

628 return transfer 

629 

630 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

631 # Docstring is inherited from base class 

632 return 

633 

634 def getLookupKeys(self) -> Set[LookupKey]: 

635 # Docstring is inherited from base class 

636 return self.constraints.getLookupKeys() 

637 

638 def needs_expanded_data_ids( 

639 self, 

640 transfer: Optional[str], 

641 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

642 ) -> bool: 

643 # Docstring inherited. 

644 return False