Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 86%


175 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""In-memory datastore.""" 

25 

26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore") 

27 

28import logging 

29import time 

30from dataclasses import dataclass 

31from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union 

32from urllib.parse import urlencode 

33 

34from lsst.daf.butler import ButlerURI, DatasetId, DatasetRef, StorageClass, StoredDatastoreItemInfo 

35from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge 

36 

37from .genericDatastore import GenericBaseDatastore 

38 

39 if TYPE_CHECKING:  # coverage: 39 ↛ 40 (line 39 didn't jump to line 40 because the condition on line 39 was never true)

40 from lsst.daf.butler import Config, DatasetType, LookupKey 

41 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

42 

43log = logging.getLogger(__name__) 

44 

45 

46@dataclass(frozen=True) 

47class StoredMemoryItemInfo(StoredDatastoreItemInfo): 

48 """Internal InMemoryDatastore Metadata associated with a stored 

49 DatasetRef. 

50 """ 

51 

52 __slots__ = {"timestamp", "storageClass", "parentID"} 

53 

54 timestamp: float 

55 """Unix timestamp indicating the time the dataset was stored.""" 

56 

57 storageClass: StorageClass 

58 """StorageClass associated with the dataset.""" 

59 

60 parentID: DatasetId 

61 """ID of the parent `DatasetRef` if this entry is a concrete 

62 composite. Not used if the dataset being stored is not a 

63 virtual component of a composite. 

64 """ 

65 

66 

67class InMemoryDatastore(GenericBaseDatastore): 

68 """Basic Datastore for writing to an in memory cache. 

69 

70 This datastore is ephemeral in that the contents of the datastore 

71 disappear when the Python process completes. This also means that 

72 other processes can not access this datastore. 

73 

74 Parameters 

75 ---------- 

76 config : `DatastoreConfig` or `str` 

77 Configuration. 

78 bridgeManager : `DatastoreRegistryBridgeManager` 

79 Object that manages the interface between `Registry` and datastores. 

80 butlerRoot : `str`, optional 

81 Unused parameter. 

82 

83 Notes 

84 ----- 

85 InMemoryDatastore does not support any file-based ingest. 

86 """ 

87 

88 defaultConfigFile = "datastores/inMemoryDatastore.yaml" 

89 """Path to configuration defaults. Accessed within the ``configs`` resource 

90 or relative to a search path. Can be `None` if no defaults are specified. 

91 """ 

92 

93 isEphemeral = True 

94 """A new datastore is created every time and datasets disappear when 

95 the process shuts down.""" 

96 

97 datasets: Dict[DatasetId, Any] 

98 """Internal storage of datasets indexed by dataset ID.""" 

99 

100 records: Dict[DatasetId, StoredMemoryItemInfo] 

101 """Internal records about stored datasets.""" 

102 

103 def __init__( 

104 self, 

105 config: Union[Config, str], 

106 bridgeManager: DatastoreRegistryBridgeManager, 

107 butlerRoot: Optional[str] = None, 

108 ): 

109 super().__init__(config, bridgeManager) 

110 

111 # Name ourselves with the timestamp the datastore 

112 # was created. 

113 self.name = "{}@{}".format(type(self).__name__, time.time()) 

114 log.debug("Creating datastore %s", self.name) 

115 

116 # Storage of datasets, keyed by dataset_id 

117 self.datasets: Dict[DatasetId, Any] = {} 

118 

119 # Records is distinct in order to track concrete composite components 

120 # where we register multiple components for a single dataset. 

121 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {} 

122 

123 # Related records that share the same parent 

124 self.related: Dict[DatasetId, Set[DatasetId]] = {} 

125 

126 self._bridge = bridgeManager.register(self.name, ephemeral=True) 

127 
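 # Illustrative relationship between the three mappings (dataset IDs here
 # are hypothetical): after a put() of dataset 100, ``datasets[100]`` holds
 # the Python object, ``records[100]`` holds its StoredMemoryItemInfo (with
 # parentID=100), and ``related[100]`` is the set of record IDs sharing that
 # parent. A runnable mock of this bookkeeping appears at the end of this
 # listing.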

128 @classmethod 

129 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

130 """Set any filesystem-dependent config options for this Datastore to 

131 be appropriate for a new empty repository with the given root. 

132 

133 Does nothing in this implementation. 

134 

135 Parameters 

136 ---------- 

137 root : `str` 

138 Filesystem path to the root of the data repository. 

139 config : `Config` 

140 A `Config` to update. Only the subset understood by 

141 this component will be updated. Will not expand 

142 defaults. 

143 full : `Config` 

144 A complete config with all defaults expanded that can be 

145 converted to a `DatastoreConfig`. Read-only and will not be 

146 modified by this method. 

147 Repository-specific options that should not be obtained 

148 from defaults when Butler instances are constructed 

149 should be copied from ``full`` to ``config``. 

150 overwrite : `bool`, optional 

151 If `False`, do not modify a value in ``config`` if the value 

152 already exists. Default is always to overwrite with the provided 

153 ``root``. 

154 

155 Notes 

156 ----- 

157 If a keyword is explicitly defined in the supplied ``config`` it 

158 will not be overridden by this method if ``overwrite`` is `False`. 

159 This allows explicit values set in external configs to be retained. 

160 """ 

161 return 

162 

163 @property 

164 def bridge(self) -> DatastoreRegistryBridge: 

165 # Docstring inherited from GenericBaseDatastore. 

166 return self._bridge 

167 

168 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None: 

169 # Docstring inherited from GenericBaseDatastore. 

170 for ref, info in zip(refs, infos): 

171 if ref.id is None:  # coverage: 171 ↛ 172 (line 171 didn't jump to line 172 because the condition on line 171 was never true)

172 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}") 

173 self.records[ref.id] = info 

174 self.related.setdefault(info.parentID, set()).add(ref.id) 

175 

176 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo: 

177 # Docstring inherited from GenericBaseDatastore. 

178 if ref.id is None:  # coverage: 178 ↛ 179 (line 178 didn't jump to line 179 because the condition on line 178 was never true)

179 raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}") 

180 return self.records[ref.id] 

181 

182 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]: 

183 # Docstring inherited from GenericBaseDatastore. 

184 return [self.getStoredItemInfo(ref)] 

185 

186 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

187 # Docstring inherited from GenericBaseDatastore. 

188 # If a component has been removed previously then we can sometimes 

189 # be asked to remove it again. Other datastores ignore this 

190 # so also ignore here 

191 if ref.id is None:  # coverage: 191 ↛ 192 (line 191 didn't jump to line 192 because the condition on line 191 was never true)

192 raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}") 

193 if ref.id not in self.records: 

194 return 

195 record = self.records[ref.id] 

196 del self.records[ref.id] 

197 self.related[record.parentID].remove(ref.id) 

198 

199 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]: 

200 """Check that the dataset is present and return the real ID and 

201 associated information. 

202 

203 Parameters 

204 ---------- 

205 ref : `DatasetRef` 

206 Target `DatasetRef`. 

207 

208 Returns 

209 ------- 

210 realID : `DatasetId` 

211 The dataset ID associated with this ref that should be used. This 

212 could either be the ID of the supplied `DatasetRef` or the parent. 

213 storageInfo : `StoredMemoryItemInfo` 

214 Associated storage information. 

215 

216 Raises 

217 ------ 

218 FileNotFoundError 

219 Raised if the dataset is not present in this datastore. 

220 """ 

221 try: 

222 storedItemInfo = self.getStoredItemInfo(ref) 

223 except KeyError: 

224 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None 

225 realID = ref.id 

226 if storedItemInfo.parentID is not None:  # coverage: 226 ↛ 229 (line 226 didn't jump to line 229 because the condition on line 226 was never false)

227 realID = storedItemInfo.parentID 

228 

229 if realID not in self.datasets:  # coverage: 229 ↛ 230 (line 229 didn't jump to line 230 because the condition on line 229 was never true)

230 raise FileNotFoundError(f"No such file dataset in memory: {ref}") 

231 

232 return realID, storedItemInfo 

233 
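 # Illustrative note on _get_dataset_info (dataset type names hypothetical):
 # for a component ref such as "calexp.wcs", the stored info's parentID is
 # the ID of the composite "calexp" dataset, so ``realID`` resolves to the
 # composite's ID and the component is later extracted from that stored
 # object in get().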

234 def knows(self, ref: DatasetRef) -> bool: 

235 """Check if the dataset is known to the datastore. 

236 

237 This datastore does not distinguish dataset existence from knowledge 

238 of a dataset. 

239 

240 Parameters 

241 ---------- 

242 ref : `DatasetRef` 

243 Reference to the required dataset. 

244 

245 Returns 

246 ------- 

247 exists : `bool` 

248 `True` if the dataset is known to the datastore. 

249 """ 

250 return self.exists(ref) 

251 

252 def exists(self, ref: DatasetRef) -> bool: 

253 """Check if the dataset exists in the datastore. 

254 

255 Parameters 

256 ---------- 

257 ref : `DatasetRef` 

258 Reference to the required dataset. 

259 

260 Returns 

261 ------- 

262 exists : `bool` 

263 `True` if the entity exists in the `Datastore`. 

264 """ 

265 try: 

266 self._get_dataset_info(ref) 

267 except FileNotFoundError: 

268 return False 

269 return True 

270 

271 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

272 """Load an InMemoryDataset from the store. 

273 

274 Parameters 

275 ---------- 

276 ref : `DatasetRef` 

277 Reference to the required Dataset. 

278 parameters : `dict` 

279 `StorageClass`-specific parameters that specify, for example, 

280 a slice of the dataset to be loaded. 

281 

282 Returns 

283 ------- 

284 inMemoryDataset : `object` 

285 Requested dataset or slice thereof as an InMemoryDataset. 

286 

287 Raises 

288 ------ 

289 FileNotFoundError 

290 Requested dataset can not be retrieved. 

291 TypeError 

292 Return value from formatter has unexpected type. 

293 ValueError 

294 Formatter failed to process the dataset. 

295 """ 

296 

297 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

298 

299 realID, storedItemInfo = self._get_dataset_info(ref) 

300 

301 # We have a write storage class and a read storage class and they 

302 # can be different for concrete composites. 

303 readStorageClass = ref.datasetType.storageClass 

304 writeStorageClass = storedItemInfo.storageClass 

305 

306 component = ref.datasetType.component() 

307 

308 # Check that the supplied parameters are suitable for the type read 

309 # If this is a derived component we validate against the composite 

310 isDerivedComponent = False 

311 if component in writeStorageClass.derivedComponents: 

312 writeStorageClass.validateParameters(parameters) 

313 isDerivedComponent = True 

314 else: 

315 readStorageClass.validateParameters(parameters) 

316 

317 inMemoryDataset = self.datasets[realID] 

318 

319 # If this is a derived (read-only) component we need to apply parameters 

320 # before we retrieve the component. We assume that the parameters 

321 # will affect the data globally, before the derived component 

322 # is selected. 

323 if isDerivedComponent: 

324 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters) 

325 # Then disable parameters for later 

326 parameters = {} 

327 

328 # Different storage classes imply a component request 

329 if readStorageClass != writeStorageClass: 

330 

331 if component is None:  # coverage: 331 ↛ 332 (line 331 didn't jump to line 332 because the condition on line 331 was never true)

332 raise ValueError( 

333 "Storage class inconsistency ({} vs {}) but no" 

334 " component requested".format(readStorageClass.name, writeStorageClass.name) 

335 ) 

336 

337 # Concrete composite written as a single object (we hope) 

338 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component) 

339 

340 # Since there is no formatter to process parameters, they all must be 

341 # passed to the assembler. 

342 return self._post_process_get( 

343 inMemoryDataset, readStorageClass, parameters, isComponent=component is not None 

344 ) 

345 

346 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

347 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

348 

349 Parameters 

350 ---------- 

351 inMemoryDataset : `object` 

352 The dataset to store. 

353 ref : `DatasetRef` 

354 Reference to the associated Dataset. 

355 

356 Raises 

357 ------ 

358 TypeError 

359 Supplied object and storage class are inconsistent. 

360 DatasetTypeNotSupportedError 

361 The associated `DatasetType` is not handled by this datastore. 

362 

363 Notes 

364 ----- 

365 If the datastore is configured to reject certain dataset types it 

366 is possible that the put will fail and raise a 

367 `DatasetTypeNotSupportedError`. The main use case for this is to 

368 allow `ChainedDatastore` to put to multiple datastores without 

369 requiring that every datastore accepts the dataset. 

370 """ 

371 

372 if ref.id is None:  # coverage: 372 ↛ 373 (line 372 didn't jump to line 373 because the condition on line 372 was never true)

373 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}") 

374 

375 self._validate_put_parameters(inMemoryDataset, ref) 

376 

377 self.datasets[ref.id] = inMemoryDataset 

378 log.debug("Store %s in %s", ref, self.name) 

379 

380 # Store time we received this content, to allow us to optionally 

381 # expire it. Instead of storing a filename here, we include the 

382 # ID of this datasetRef so we can find it from components. 

383 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass, parentID=ref.id) 

384 

385 # We have to register this content with registry. 

386 # Currently this assumes we have a file so we need to use stub entries 

387 # TODO: Add to ephemeral part of registry 

388 self._register_datasets([(ref, itemInfo)]) 

389 

390 if self._transaction is not None: 

391 self._transaction.registerUndo("put", self.remove, ref) 

392 

393 def getURIs( 

394 self, ref: DatasetRef, predict: bool = False 

395 ) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

396 """Return URIs associated with dataset. 

397 

398 Parameters 

399 ---------- 

400 ref : `DatasetRef` 

401 Reference to the required dataset. 

402 predict : `bool`, optional 

403 If the datastore does not know about the dataset, should it 

404 return a predicted URI or not? 

405 

406 Returns 

407 ------- 

408 primary : `ButlerURI` 

409 The URI to the primary artifact associated with this dataset. 

410 If the dataset was disassembled within the datastore this 

411 may be `None`. 

412 components : `dict` 

413 URIs to any components associated with the dataset artifact. 

414 Can be empty if there are no components. 

415 

416 Notes 

417 ----- 

418 The URIs returned for in-memory datastores are not usable but 

419 provide an indication of the associated dataset. 

420 """ 

421 

422 # Include the dataID as a URI query 

423 query = urlencode(ref.dataId) 

424 

425 # if this has never been written then we have to guess 

426 if not self.exists(ref): 

427 if not predict: 

428 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

429 name = f"{ref.datasetType.name}" 

430 fragment = "#predicted" 

431 else: 

432 realID, _ = self._get_dataset_info(ref) 

433 name = f"{id(self.datasets[realID])}?{query}" 

434 fragment = "" 

435 

436 return ButlerURI(f"mem://{name}?{query}{fragment}"), {} 

437 
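 # Example of the returned URI (illustrative; the object id and data ID
 # values are hypothetical): mem://140234882391040?instrument=HSC&detector=50
 # A predicted URI for a dataset not yet stored would instead look like:
 # mem://calexp?instrument=HSC&detector=50#predicted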

438 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

439 """URI to the Dataset. 

440 

441 Always uses "mem://" URI prefix. 

442 

443 Parameters 

444 ---------- 

445 ref : `DatasetRef` 

446 Reference to the required Dataset. 

447 predict : `bool` 

448 If `True`, allow URIs to be returned of datasets that have not 

449 been written. 

450 

451 Returns 

452 ------- 

453 uri : `ButlerURI` 

454 URI pointing to the dataset within the datastore. If the 

455 dataset does not exist in the datastore, and if ``predict`` is 

456 `True`, the URI will be a prediction and will include a URI 

457 fragment "#predicted". 

458 If the datastore does not have entities that relate well 

459 to the concept of a URI, the returned URI string will be 

460 descriptive. The returned URI is not guaranteed to be obtainable. 

461 

462 Raises 

463 ------ 

464 FileNotFoundError 

465 A URI has been requested for a dataset that does not exist and 

466 guessing is not allowed. 

467 AssertionError 

468 Raised if an internal error occurs. 

469 """ 

470 primary, _ = self.getURIs(ref, predict) 

471 if primary is None:  # coverage: 471 ↛ 474 (line 471 didn't jump to line 474 because the condition on line 471 was never true)

472 # This should be impossible since this datastore does 

473 # not disassemble. This check also helps mypy. 

474 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}") 

475 return primary 

476 

477 def retrieveArtifacts( 

478 self, 

479 refs: Iterable[DatasetRef], 

480 destination: ButlerURI, 

481 transfer: str = "auto", 

482 preserve_path: bool = True, 

483 overwrite: Optional[bool] = False, 

484 ) -> List[ButlerURI]: 

485 """Retrieve the file artifacts associated with the supplied refs. 

486 

487 Notes 

488 ----- 

489 Not implemented by this datastore. 

490 """ 

491 # Could conceivably launch a FileDatastore to use formatters to write 

492 # the data but this is fraught with problems. 

493 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.") 

494 

495 def forget(self, refs: Iterable[DatasetRef]) -> None: 

496 # Docstring inherited. 

497 refs = list(refs) 

498 self._bridge.forget(refs) 

499 for ref in refs: 

500 self.removeStoredItemInfo(ref) 

501 

502 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None: 

503 """Indicate to the Datastore that a dataset can be removed. 

504 

505 Parameters 

506 ---------- 

507 ref : `DatasetRef` or iterable thereof 

508 Reference to the required Dataset(s). 

509 ignore_errors : `bool`, optional 

510 Indicate that errors should be ignored. 

511 

512 Raises 

513 ------ 

514 FileNotFoundError 

515 Attempt to remove a dataset that does not exist. Only relevant 

516 if a single dataset ref is given. 

517 

518 Notes 

519 ----- 

520 Concurrency should not normally be an issue for the in-memory datastore 

521 since all internal changes are isolated to this process alone and 

522 the registry only changes rows associated with this process. 

523 """ 

524 if not isinstance(ref, DatasetRef): 

525 log.debug("Bulk trashing of datasets in datastore %s", self.name) 

526 self.bridge.moveToTrash(ref) 

527 return 

528 

529 log.debug("Trash %s in datastore %s", ref, self.name) 

530 

531 # Check that this dataset is known to datastore 

532 try: 

533 self._get_dataset_info(ref) 

534 

535 # Move datasets to trash table 

536 self.bridge.moveToTrash([ref]) 

537 except Exception as e: 

538 if ignore_errors:  # coverage: 538 ↛ 539 (line 538 didn't jump to line 539 because the condition on line 538 was never true)

539 log.warning( 

540 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e 

541 ) 

542 else: 

543 raise 

544 

545 def emptyTrash(self, ignore_errors: bool = False) -> None: 

546 """Remove all datasets from the trash. 

547 

548 Parameters 

549 ---------- 

550 ignore_errors : `bool`, optional 

551 Ignore errors. 

552 

553 Notes 

554 ----- 

555 The internal tracking of datasets is affected by this method, and 

556 transaction handling is not supported if there is a problem before 

557 the datasets themselves are deleted. 

558 

559 Concurrency should not normally be an issue for the in-memory datastore 

560 since all internal changes are isolated to this process alone and 

561 the registry only changes rows associated with this process. 

562 """ 

563 log.debug("Emptying trash in datastore %s", self.name) 

564 with self._bridge.emptyTrash() as trash_data: 

565 trashed, _ = trash_data 

566 for ref, _ in trashed: 

567 try: 

568 realID, _ = self._get_dataset_info(ref) 

569 except FileNotFoundError:  # coverage: 569 ↛ 572 (line 569 didn't jump to line 572)

570 # Dataset already removed so ignore it 

571 continue 

572 except Exception as e: 

573 if ignore_errors: 

574 log.warning( 

575 "Emptying trash in datastore %s but encountered an error with dataset %s: %s", 

576 self.name, 

577 ref.id, 

578 e, 

579 ) 

580 continue 

581 else: 

582 raise 

583 

584 # Determine whether all references to this dataset have been 

585 # removed and we can delete the dataset itself 

586 allRefs = self.related[realID] 

587 remainingRefs = allRefs - {ref.id} 

588 if not remainingRefs:  # coverage: 588 ↛ 593 (line 588 didn't jump to line 593 because the condition on line 588 was never false)

589 log.debug("Removing artifact %s from datastore %s", realID, self.name) 

590 del self.datasets[realID] 

591 

592 # Remove this entry 

593 self.removeStoredItemInfo(ref) 

594 

595 def validateConfiguration( 

596 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

597 ) -> None: 

598 """Validate some of the configuration for this datastore. 

599 

600 Parameters 

601 ---------- 

602 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

603 Entities to test against this configuration. Can be differing 

604 types. 

605 logFailures : `bool`, optional 

606 If `True`, output a log message for every validation error 

607 detected. 

608 

609 Raises 

610 ------ 

611 DatastoreValidationError 

612 Raised if there is a validation problem with a configuration. 

613 All the problems are reported in a single exception. 

614 

615 Notes 

616 ----- 

617 This method is a no-op. 

618 """ 

619 return 

620 

621 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]: 

622 # Docstring is inherited from base class 

623 return transfer 

624 

625 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

626 # Docstring is inherited from base class 

627 return 

628 

629 def getLookupKeys(self) -> Set[LookupKey]: 

630 # Docstring is inherited from base class 

631 return self.constraints.getLookupKeys() 

632 

633 def needs_expanded_data_ids( 

634 self, 

635 transfer: Optional[str], 

636 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

637 ) -> bool: 

638 # Docstring inherited. 

639 return False
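
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the measured module above): the
# ``records``/``related`` bookkeeping used by addStoredItemInfo,
# removeStoredItemInfo and emptyTrash, reduced to plain dicts and sets.
# The integer dataset IDs stand in for DatasetId and the parent-ID payload
# stands in for StoredMemoryItemInfo.
if __name__ == "__main__":
    records: dict = {}  # dataset ID -> stored info (here, just the parent ID)
    related: dict = {}  # parent ID -> set of dataset IDs sharing that parent

    def add(ref_id, parent_id):
        # Mirrors addStoredItemInfo: record the info and group refs by parent.
        records[ref_id] = parent_id
        related.setdefault(parent_id, set()).add(ref_id)

    def remove(ref_id):
        # Mirrors removeStoredItemInfo: repeat removals are ignored.
        if ref_id not in records:
            return
        parent_id = records.pop(ref_id)
        related[parent_id].remove(ref_id)

    # Two entries sharing the same (hypothetical) parent dataset 100.
    add(101, parent_id=100)
    add(102, parent_id=100)

    # As in emptyTrash: the stored object for parent 100 may only be deleted
    # once no remaining entries reference it.
    remove(101)
    assert related[100] == {102}  # still referenced; keep the dataset
    remove(102)
    assert related[100] == set()  # nothing left; safe to delete the object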