Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 86%


175 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""In-memory datastore.""" 

25 

26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore") 

27 

28import time 

29import logging 

30from dataclasses import dataclass 

31from urllib.parse import urlencode 

32from typing import ( 

33 TYPE_CHECKING, 

34 Any, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Tuple, 

42 Union, 

43) 

44 

45from lsst.daf.butler import DatasetId, DatasetRef, StoredDatastoreItemInfo, StorageClass, ButlerURI 

46from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge 

47from .genericDatastore import GenericBaseDatastore 

48 

49if TYPE_CHECKING:  # coverage: 49 ↛ 50 (condition on line 49 was never true)

50 from lsst.daf.butler import (Config, DatasetType, 

51 LookupKey) 

52 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

53 

54log = logging.getLogger(__name__) 

55 

56 

57@dataclass(frozen=True) 

58class StoredMemoryItemInfo(StoredDatastoreItemInfo): 

59 """Internal InMemoryDatastore Metadata associated with a stored 

60 DatasetRef. 

61 """ 

62 __slots__ = {"timestamp", "storageClass", "parentID"} 

63 

64 timestamp: float 

65 """Unix timestamp indicating the time the dataset was stored.""" 

66 

67 storageClass: StorageClass 

68 """StorageClass associated with the dataset.""" 

69 

70 parentID: DatasetId 

71 """ID of the parent `DatasetRef` if this entry is a concrete 

72 composite. Not used if the dataset being stored is not a 

73 virtual component of a composite. 

74 """ 

75 

76 
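# Illustrative sketch (not part of the module): ``put`` below builds one of
# these records for every stored dataset, using the current time, the
# dataset's write storage class, and the ref's own ID as the parent.
# Here ``ref`` stands for an already-resolved `DatasetRef` supplied by the
# caller.
#
#     info = StoredMemoryItemInfo(time.time(),
#                                 ref.datasetType.storageClass,
#                                 parentID=ref.id)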

77class InMemoryDatastore(GenericBaseDatastore): 

78 """Basic Datastore for writing to an in memory cache. 

79 

80 This datastore is ephemeral in that the contents of the datastore 

81 disappear when the Python process completes. This also means that 

82 other processes cannot access this datastore. 

83 

84 Parameters 

85 ---------- 

86 config : `DatastoreConfig` or `str` 

87 Configuration. 

88 bridgeManager : `DatastoreRegistryBridgeManager` 

89 Object that manages the interface between `Registry` and datastores. 

90 butlerRoot : `str`, optional 

91 Unused parameter. 

92 

93 Notes 

94 ----- 

95 InMemoryDatastore does not support any file-based ingest. 

96 """ 

97 

98 defaultConfigFile = "datastores/inMemoryDatastore.yaml" 

99 """Path to configuration defaults. Accessed within the ``configs`` resource 

100 or relative to a search path. Can be None if no defaults are specified. 

101 """ 

102 

103 isEphemeral = True 

104 """A new datastore is created every time and datasets disappear when 

105 the process shuts down.""" 

106 

107 datasets: Dict[DatasetId, Any] 

108 """Internal storage of datasets indexed by dataset ID.""" 

109 

110 records: Dict[DatasetId, StoredMemoryItemInfo] 

111 """Internal records about stored datasets.""" 

112 

113 def __init__(self, config: Union[Config, str], 

114 bridgeManager: DatastoreRegistryBridgeManager, 

115 butlerRoot: Optional[str] = None): 

116 super().__init__(config, bridgeManager) 

117 

118 # Name ourselves with the timestamp the datastore 

119 # was created. 

120 self.name = "{}@{}".format(type(self).__name__, time.time()) 

121 log.debug("Creating datastore %s", self.name) 

122 

123 # Storage of datasets, keyed by dataset_id 

124 self.datasets: Dict[DatasetId, Any] = {} 

125 

126 # Records is distinct in order to track concrete composite components 

127 # where we register multiple components for a single dataset. 

128 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {} 

129 

130 # Related records that share the same parent 

131 self.related: Dict[DatasetId, Set[DatasetId]] = {} 

132 

133 self._bridge = bridgeManager.register(self.name, ephemeral=True) 

134 
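    # Illustrative sketch (not part of the module): after a successful
    # ``put`` of an object ``obj`` with a resolved ref ``ref``, the three
    # mappings initialised above hold, roughly:
    #
    #     self.datasets[ref.id] is obj              # the object itself
    #     self.records[ref.id].parentID == ref.id   # stored metadata
    #     ref.id in self.related[ref.id]            # refs grouped by parent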

135 @classmethod 

136 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

137 """Set any filesystem-dependent config options for this Datastore to 

138 be appropriate for a new empty repository with the given root. 

139 

140 Does nothing in this implementation. 

141 

142 Parameters 

143 ---------- 

144 root : `str` 

145 Filesystem path to the root of the data repository. 

146 config : `Config` 

147 A `Config` to update. Only the subset understood by 

148 this component will be updated. Will not expand 

149 defaults. 

150 full : `Config` 

151 A complete config with all defaults expanded that can be 

152 converted to a `DatastoreConfig`. Read-only and will not be 

153 modified by this method. 

154 Repository-specific options that should not be obtained 

155 from defaults when Butler instances are constructed 

156 should be copied from ``full`` to ``config``. 

157 overwrite : `bool`, optional 

158 If `False`, do not modify a value in ``config`` if the value 

159 already exists. Default is always to overwrite with the provided 

160 ``root``. 

161 

162 Notes 

163 ----- 

164 If a keyword is explicitly defined in the supplied ``config`` it 

165 will not be overridden by this method if ``overwrite`` is `False`. 

166 This allows explicit values set in external configs to be retained. 

167 """ 

168 return 

169 

170 @property 

171 def bridge(self) -> DatastoreRegistryBridge: 

172 # Docstring inherited from GenericBaseDatastore. 

173 return self._bridge 

174 

175 def addStoredItemInfo(self, refs: Iterable[DatasetRef], 

176 infos: Iterable[StoredMemoryItemInfo]) -> None: 

177 # Docstring inherited from GenericBaseDatastore. 

178 for ref, info in zip(refs, infos): 

179 if ref.id is None:  # coverage: 179 ↛ 180 (condition on line 179 was never true)

180 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}") 

181 self.records[ref.id] = info 

182 self.related.setdefault(info.parentID, set()).add(ref.id) 

183 

184 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo: 

185 # Docstring inherited from GenericBaseDatastore. 

186 if ref.id is None:  # coverage: 186 ↛ 187 (condition on line 186 was never true)

187 raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}") 

188 return self.records[ref.id] 

189 

190 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]: 

191 # Docstring inherited from GenericBaseDatastore. 

192 return [self.getStoredItemInfo(ref)] 

193 

194 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

195 # Docstring inherited from GenericBaseDatastore. 

196 # If a component has been removed previously then we can sometimes 

197 # be asked to remove it again. Other datastores ignore this 

198 # so we also ignore it here. 

199 if ref.id is None:  # coverage: 199 ↛ 200 (condition on line 199 was never true)

200 raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}") 

201 if ref.id not in self.records: 

202 return 

203 record = self.records[ref.id] 

204 del self.records[ref.id] 

205 self.related[record.parentID].remove(ref.id) 

206 

207 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]: 

208 """Check that the dataset is present and return the real ID and 

209 associated information. 

210 

211 Parameters 

212 ---------- 

213 ref : `DatasetRef` 

214 Target `DatasetRef`. 

215 

216 Returns 

217 ------- 

218 realID : `DatasetId` 

219 The dataset ID associated with this ref that should be used. This 

220 could either be the ID of the supplied `DatasetRef` or the parent. 

221 storageInfo : `StoredMemoryItemInfo` 

222 Associated storage information. 

223 

224 Raises 

225 ------ 

226 FileNotFoundError 

227 Raised if the dataset is not present in this datastore. 

228 """ 

229 try: 

230 storedItemInfo = self.getStoredItemInfo(ref) 

231 except KeyError: 

232 raise FileNotFoundError(f"No such dataset in memory: {ref}") from None 

233 realID = ref.id 

234 if storedItemInfo.parentID is not None:  # coverage: 234 ↛ 237 (condition on line 234 was never false)

235 realID = storedItemInfo.parentID 

236 

237 if realID not in self.datasets:  # coverage: 237 ↛ 238 (condition on line 237 was never true)

238 raise FileNotFoundError(f"No such dataset in memory: {ref}") 

239 

240 return realID, storedItemInfo 

241 

242 def knows(self, ref: DatasetRef) -> bool: 

243 """Check if the dataset is known to the datastore. 

244 

245 This datastore does not distinguish dataset existence from knowledge 

246 of a dataset. 

247 

248 Parameters 

249 ---------- 

250 ref : `DatasetRef` 

251 Reference to the required dataset. 

252 

253 Returns 

254 ------- 

255 exists : `bool` 

256 `True` if the dataset is known to the datastore. 

257 """ 

258 return self.exists(ref) 

259 

260 def exists(self, ref: DatasetRef) -> bool: 

261 """Check if the dataset exists in the datastore. 

262 

263 Parameters 

264 ---------- 

265 ref : `DatasetRef` 

266 Reference to the required dataset. 

267 

268 Returns 

269 ------- 

270 exists : `bool` 

271 `True` if the entity exists in the `Datastore`. 

272 """ 

273 try: 

274 self._get_dataset_info(ref) 

275 except FileNotFoundError: 

276 return False 

277 return True 

278 
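    # Illustrative sketch (not part of the module): for this datastore
    # ``knows`` simply delegates to ``exists``, so for any resolved ref
    #
    #     datastore.knows(ref) == datastore.exists(ref)
    #
    # where ``datastore`` stands for an InMemoryDatastore instance.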

279 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

280 """Load an InMemoryDataset from the store. 

281 

282 Parameters 

283 ---------- 

284 ref : `DatasetRef` 

285 Reference to the required Dataset. 

286 parameters : `dict` 

287 `StorageClass`-specific parameters that specify, for example, 

288 a slice of the dataset to be loaded. 

289 

290 Returns 

291 ------- 

292 inMemoryDataset : `object` 

293 Requested dataset or slice thereof as an InMemoryDataset. 

294 

295 Raises 

296 ------ 

297 FileNotFoundError 

298 Requested dataset cannot be retrieved. 

299 TypeError 

300 Return value from formatter has unexpected type. 

301 ValueError 

302 Formatter failed to process the dataset. 

303 """ 

304 

305 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

306 

307 realID, storedItemInfo = self._get_dataset_info(ref) 

308 

309 # We have a write storage class and a read storage class and they 

310 # can be different for concrete composites. 

311 readStorageClass = ref.datasetType.storageClass 

312 writeStorageClass = storedItemInfo.storageClass 

313 

314 component = ref.datasetType.component() 

315 

316 # Check that the supplied parameters are suitable for the type read 

317 # If this is a derived component we validate against the composite 

318 isDerivedComponent = False 

319 if component in writeStorageClass.derivedComponents: 

320 writeStorageClass.validateParameters(parameters) 

321 isDerivedComponent = True 

322 else: 

323 readStorageClass.validateParameters(parameters) 

324 

325 inMemoryDataset = self.datasets[realID] 

326 

327 # If this is a derived component we need to apply parameters 

328 # before we retrieve the component. We assume that the parameters 

329 # will affect the data globally, before the derived component 

330 # is selected. 

331 if isDerivedComponent: 

332 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters) 

333 # Then disable parameters for later 

334 parameters = {} 

335 

336 # Different storage classes implies a component request 

337 if readStorageClass != writeStorageClass: 

338 

339 if component is None:  # coverage: 339 ↛ 340 (condition on line 339 was never true)

340 raise ValueError("Storage class inconsistency ({} vs {}) but no" 

341 " component requested".format(readStorageClass.name, 

342 writeStorageClass.name)) 

343 

344 # Concrete composite written as a single object (we hope) 

345 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component) 

346 

347 # Since there is no formatter to process parameters, they all must be 

348 # passed to the assembler. 

349 return self._post_process_get(inMemoryDataset, readStorageClass, parameters, 

350 isComponent=component is not None) 

351 

352 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

353 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

354 

355 Parameters 

356 ---------- 

357 inMemoryDataset : `object` 

358 The dataset to store. 

359 ref : `DatasetRef` 

360 Reference to the associated Dataset. 

361 

362 Raises 

363 ------ 

364 TypeError 

365 Supplied object and storage class are inconsistent. 

366 DatasetTypeNotSupportedError 

367 The associated `DatasetType` is not handled by this datastore. 

368 

369 Notes 

370 ----- 

371 If the datastore is configured to reject certain dataset types it 

372 is possible that the put will fail and raise a 

373 `DatasetTypeNotSupportedError`. The main use case for this is to 

374 allow `ChainedDatastore` to put to multiple datastores without 

375 requiring that every datastore accepts the dataset. 

376 """ 

377 

378 if ref.id is None:  # coverage: 378 ↛ 379 (condition on line 378 was never true)

379 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}") 

380 

381 self._validate_put_parameters(inMemoryDataset, ref) 

382 

383 self.datasets[ref.id] = inMemoryDataset 

384 log.debug("Store %s in %s", ref, self.name) 

385 

386 # Store the time we received this content, to allow us to optionally 

387 # expire it. Instead of storing a filename here, we include the 

388 # ID of this datasetRef so we can find it from components. 

389 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass, 

390 parentID=ref.id) 

391 

392 # We have to register this content with the registry. 

393 # Currently this assumes we have a file, so we need to use stub entries. 

394 # TODO: Add to ephemeral part of registry 

395 self._register_datasets([(ref, itemInfo)]) 

396 

397 if self._transaction is not None: 

398 self._transaction.registerUndo("put", self.remove, ref) 

399 
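    # Illustrative sketch (not part of the module): a minimal put/get round
    # trip, assuming ``datastore`` is an InMemoryDatastore and ``ref`` is a
    # resolved DatasetRef whose storage class matches ``obj``.
    #
    #     datastore.put(obj, ref)
    #     assert datastore.exists(ref)
    #     retrieved = datastore.get(ref)   # the stored dataset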

400 def getURIs(self, ref: DatasetRef, 

401 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

402 """Return URIs associated with dataset. 

403 

404 Parameters 

405 ---------- 

406 ref : `DatasetRef` 

407 Reference to the required dataset. 

408 predict : `bool`, optional 

409 If the datastore does not know about the dataset, should it 

410 return a predicted URI or not? 

411 

412 Returns 

413 ------- 

414 primary : `ButlerURI` 

415 The URI to the primary artifact associated with this dataset. 

416 If the dataset was disassembled within the datastore this 

417 may be `None`. 

418 components : `dict` 

419 URIs to any components associated with the dataset artifact. 

420 Can be empty if there are no components. 

421 

422 Notes 

423 ----- 

424 The URIs returned for in-memory datastores are not usable but 

425 provide an indication of the associated dataset. 

426 """ 

427 

428 # Include the dataID as a URI query 

429 query = urlencode(ref.dataId) 

430 

431 # if this has never been written then we have to guess 

432 if not self.exists(ref): 

433 if not predict: 

434 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

435 name = f"{ref.datasetType.name}" 

436 fragment = "#predicted" 

437 else: 

438 realID, _ = self._get_dataset_info(ref) 

439 name = f"{id(self.datasets[realID])}" 

440 fragment = "" 

441 

442 return ButlerURI(f"mem://{name}?{query}{fragment}"), {} 

443 
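    # Illustrative sketch (not part of the module): the primary URI returned
    # above is purely descriptive and cannot be dereferenced.  For a stored
    # dataset it has the form
    #
    #     mem://<id(object)>?<urlencoded dataId>
    #
    # while a predicted (unstored) dataset yields
    #
    #     mem://<dataset type name>?<urlencoded dataId>#predicted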

444 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

445 """URI to the Dataset. 

446 

447 Always uses "mem://" URI prefix. 

448 

449 Parameters 

450 ---------- 

451 ref : `DatasetRef` 

452 Reference to the required Dataset. 

453 predict : `bool` 

454 If `True`, allow URIs to be returned for datasets that have not 

455 been written. 

456 

457 Returns 

458 ------- 

459 uri : `ButlerURI` 

460 URI pointing to the dataset within the datastore. If the 

461 dataset does not exist in the datastore, and if ``predict`` is 

462 `True`, the URI will be a prediction and will include a URI 

463 fragment "#predicted". 

464 If the datastore does not have entities that relate well 

465 to the concept of a URI the returned URI string will be 

466 descriptive. The returned URI is not guaranteed to be obtainable. 

467 

468 Raises 

469 ------ 

470 FileNotFoundError 

471 A URI has been requested for a dataset that does not exist and 

472 guessing is not allowed. 

473 AssertionError 

474 Raised if an internal error occurs. 

475 """ 

476 primary, _ = self.getURIs(ref, predict) 

477 if primary is None:  # coverage: 477 ↛ 480 (condition on line 477 was never true)

478 # This should be impossible since this datastore does 

479 # not disassemble. This check also helps mypy. 

480 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}") 

481 return primary 

482 

483 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

484 destination: ButlerURI, transfer: str = "auto", 

485 preserve_path: bool = True, 

486 overwrite: Optional[bool] = False) -> List[ButlerURI]: 

487 """Retrieve the file artifacts associated with the supplied refs. 

488 

489 Notes 

490 ----- 

491 Not implemented by this datastore. 

492 """ 

493 # Could conceivably launch a FileDatastore to use formatters to write 

494 # the data but this is fraught with problems. 

495 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.") 

496 

497 def forget(self, refs: Iterable[DatasetRef]) -> None: 

498 # Docstring inherited. 

499 refs = list(refs) 

500 self._bridge.forget(refs) 

501 for ref in refs: 

502 self.removeStoredItemInfo(ref) 

503 

504 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None: 

505 """Indicate to the Datastore that a dataset can be removed. 

506 

507 Parameters 

508 ---------- 

509 ref : `DatasetRef` or iterable thereof 

510 Reference to the required Dataset(s). 

511 ignore_errors : `bool`, optional 

512 Indicate that errors should be ignored. 

513 

514 Raises 

515 ------ 

516 FileNotFoundError 

517 Attempt to remove a dataset that does not exist. Only relevant 

518 if a single dataset ref is given. 

519 

520 Notes 

521 ----- 

522 Concurrency should not normally be an issue for the in-memory datastore 

523 since all internal changes are isolated solely to this process and 

524 the registry only changes rows associated with this process. 

525 """ 

526 if not isinstance(ref, DatasetRef): 

527 log.debug("Bulk trashing of datasets in datastore %s", self.name) 

528 self.bridge.moveToTrash(ref) 

529 return 

530 

531 log.debug("Trash %s in datastore %s", ref, self.name) 

532 

533 # Check that this dataset is known to datastore 

534 try: 

535 self._get_dataset_info(ref) 

536 

537 # Move datasets to trash table 

538 self.bridge.moveToTrash([ref]) 

539 except Exception as e: 

540 if ignore_errors:  # coverage: 540 ↛ 541 (condition on line 540 was never true)

541 log.warning("Error encountered moving dataset %s to trash in datastore %s: %s", 

542 ref, self.name, e) 

543 else: 

544 raise 

545 

546 def emptyTrash(self, ignore_errors: bool = False) -> None: 

547 """Remove all datasets from the trash. 

548 

549 Parameters 

550 ---------- 

551 ignore_errors : `bool`, optional 

552 Ignore errors. 

553 

554 Notes 

555 ----- 

556 This method modifies the internal tracking of datasets, and 

557 transaction handling is not supported if a problem occurs before 

558 the datasets themselves are deleted. 

559 

560 Concurrency should not normally be an issue for the in-memory datastore 

561 since all internal changes are isolated solely to this process and 

562 the registry only changes rows associated with this process. 

563 """ 

564 log.debug("Emptying trash in datastore %s", self.name) 

565 with self._bridge.emptyTrash() as trash_data: 

566 trashed, _ = trash_data 

567 for ref, _ in trashed: 

568 try: 

569 realID, _ = self._get_dataset_info(ref) 

570 except FileNotFoundError:  # coverage: 570 ↛ 573 (line 570 didn't jump to line 573)

571 # Dataset already removed so ignore it 

572 continue 

573 except Exception as e: 

574 if ignore_errors: 

575 log.warning("Emptying trash in datastore %s but encountered an " 

576 "error with dataset %s: %s", 

577 self.name, ref.id, e) 

578 continue 

579 else: 

580 raise 

581 

582 # Determine whether all references to this dataset have been 

583 # removed and we can delete the dataset itself 

584 allRefs = self.related[realID] 

585 remainingRefs = allRefs - {ref.id} 

586 if not remainingRefs:  # coverage: 586 ↛ 591 (condition on line 586 was never false)

587 log.debug("Removing artifact %s from datastore %s", realID, self.name) 

588 del self.datasets[realID] 

589 

590 # Remove this entry 

591 self.removeStoredItemInfo(ref) 

592 
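    # Illustrative sketch (not part of the module): the usual two-step
    # removal, assuming ``datastore`` holds a dataset for the resolved ref
    # ``ref``.
    #
    #     datastore.trash(ref)     # register the dataset as trashed
    #     datastore.emptyTrash()   # delete it from the internal mappings
    #     assert not datastore.exists(ref)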

593 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

594 logFailures: bool = False) -> None: 

595 """Validate some of the configuration for this datastore. 

596 

597 Parameters 

598 ---------- 

599 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

600 Entities to test against this configuration. Can be of differing 

601 types. 

602 logFailures : `bool`, optional 

603 If `True`, output a log message for every validation error 

604 detected. 

605 

606 Raises 

607 ------ 

608 DatastoreValidationError 

609 Raised if there is a validation problem with a configuration. 

610 All the problems are reported in a single exception. 

611 

612 Notes 

613 ----- 

614 This method is a no-op. 

615 """ 

616 return 

617 

618 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]: 

619 # Docstring is inherited from base class 

620 return transfer 

621 

622 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

623 # Docstring is inherited from base class 

624 return 

625 

626 def getLookupKeys(self) -> Set[LookupKey]: 

627 # Docstring is inherited from base class 

628 return self.constraints.getLookupKeys() 

629 

630 def needs_expanded_data_ids( 

631 self, 

632 transfer: Optional[str], 

633 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

634 ) -> bool: 

635 # Docstring inherited. 

636 return False