Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 92%

180 statements  

coverage.py v7.2.7, created at 2023-06-02 02:15 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""In-memory datastore.""" 

25 

26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore") 

27 

28import logging 

29import time 

30from dataclasses import dataclass 

31from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union 

32from urllib.parse import urlencode 

33 

34from lsst.daf.butler import ( 

35 DatasetId, 

36 DatasetRef, 

37 DatasetRefURIs, 

38 DatastoreRecordData, 

39 StorageClass, 

40 StoredDatastoreItemInfo, 

41) 

42from lsst.daf.butler.core.utils import transactional 

43from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge 

44from lsst.resources import ResourcePath 

45 

46from .genericDatastore import GenericBaseDatastore 

47 

48if TYPE_CHECKING: 

49 from lsst.daf.butler import Config, DatasetType, LookupKey 

50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

51 

52log = logging.getLogger(__name__) 

53 

54 

55@dataclass(frozen=True) 

56class StoredMemoryItemInfo(StoredDatastoreItemInfo): 

57 """Internal InMemoryDatastore Metadata associated with a stored 

58 DatasetRef. 

59 """ 

60 

61 __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"} 

62 

63 timestamp: float 

64 """Unix timestamp indicating the time the dataset was stored.""" 

65 

66 storageClass: StorageClass 

67 """StorageClass associated with the dataset.""" 

68 

69 parentID: DatasetId 

70 """ID of the parent `DatasetRef` if this entry is a concrete 

71 composite. Not used if the dataset being stored is not a 

72 virtual component of a composite. 

73 """ 

74 

75 dataset_id: DatasetId 

76 """DatasetId associated with this record.""" 

77 

78 

79class InMemoryDatastore(GenericBaseDatastore): 

80 """Basic Datastore for writing to an in memory cache. 

81 

82 This datastore is ephemeral in that the contents of the datastore 

83 disappear when the Python process completes. This also means that 

84 other processes cannot access this datastore. 

85 

86 Parameters 

87 ---------- 

88 config : `DatastoreConfig` or `str` 

89 Configuration. 

90 bridgeManager : `DatastoreRegistryBridgeManager` 

91 Object that manages the interface between `Registry` and datastores. 

92 butlerRoot : `str`, optional 

93 Unused parameter. 

94 

95 Notes 

96 ----- 

97 InMemoryDatastore does not support any file-based ingest. 

98 """ 

99 

100 defaultConfigFile = "datastores/inMemoryDatastore.yaml" 

101 """Path to configuration defaults. Accessed within the ``configs`` resource 

102 or relative to a search path. Can be `None` if no defaults are specified. 

103 """ 

104 

105 isEphemeral = True 

106 """A new datastore is created every time and datasets disappear when 

107 the process shuts down.""" 

108 

109 datasets: Dict[DatasetId, Any] 

110 """Internal storage of datasets indexed by dataset ID.""" 

111 

112 records: Dict[DatasetId, StoredMemoryItemInfo] 

113 """Internal records about stored datasets.""" 

114 

115 def __init__( 

116 self, 

117 config: Union[Config, str], 

118 bridgeManager: DatastoreRegistryBridgeManager, 

119 butlerRoot: Optional[str] = None, 

120 ): 

121 super().__init__(config, bridgeManager) 

122 

123 # Name ourselves with the timestamp at which the datastore 

124 # was created. 

125 self.name = "{}@{}".format(type(self).__name__, time.time()) 

126 log.debug("Creating datastore %s", self.name) 

127 

128 # Storage of datasets, keyed by dataset_id 

129 self.datasets: Dict[DatasetId, Any] = {} 

130 

131 # Records is distinct in order to track concrete composite components 

132 # where we register multiple components for a single dataset. 

133 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {} 

134 

135 # Related records that share the same parent 

136 self.related: Dict[DatasetId, Set[DatasetId]] = {} 

137 

138 self._bridge = bridgeManager.register(self.name, ephemeral=True) 

139 

140 @classmethod 

141 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

142 """Set any filesystem-dependent config options for this Datastore to 

143 be appropriate for a new empty repository with the given root. 

144 

145 Does nothing in this implementation. 

146 

147 Parameters 

148 ---------- 

149 root : `str` 

150 Filesystem path to the root of the data repository. 

151 config : `Config` 

152 A `Config` to update. Only the subset understood by 

153 this component will be updated. Will not expand 

154 defaults. 

155 full : `Config` 

156 A complete config with all defaults expanded that can be 

157 converted to a `DatastoreConfig`. Read-only and will not be 

158 modified by this method. 

159 Repository-specific options that should not be obtained 

160 from defaults when Butler instances are constructed 

161 should be copied from ``full`` to ``config``. 

162 overwrite : `bool`, optional 

163 If `False`, do not modify a value in ``config`` if the value 

164 already exists. Default is always to overwrite with the provided 

165 ``root``. 

166 

167 Notes 

168 ----- 

169 If a keyword is explicitly defined in the supplied ``config`` it 

170 will not be overridden by this method if ``overwrite`` is `False`. 

171 This allows explicit values set in external configs to be retained. 

172 """ 

173 return 

174 

175 @property 

176 def bridge(self) -> DatastoreRegistryBridge: 

177 # Docstring inherited from GenericBaseDatastore. 

178 return self._bridge 

179 

180 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None: 

181 # Docstring inherited from GenericBaseDatastore. 

182 for ref, info in zip(refs, infos): 

183 self.records[ref.id] = info 

184 self.related.setdefault(info.parentID, set()).add(ref.id) 
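
# A minimal, self-contained sketch (plain stdlib stand-ins; the ids are made
# up) of the bookkeeping above: ``related`` groups every ref that shares a
# parent, so the stored object can be deleted only once the last related ref
# is gone, which is the check ``emptyTrash`` performs further below.

from typing import Dict, Set
import uuid

related: Dict[uuid.UUID, Set[uuid.UUID]] = {}
parent_id, component_id = uuid.uuid4(), uuid.uuid4()
related.setdefault(parent_id, set()).add(parent_id)
related.setdefault(parent_id, set()).add(component_id)

# emptyTrash-style check: another ref still shares the parent, so keep the object.
remaining = related[parent_id] - {component_id}
assert remaining == {parent_id}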

185 

186 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo: 

187 # Docstring inherited from GenericBaseDatastore. 

188 return self.records[ref.id] 

189 

190 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]: 

191 # Docstring inherited from GenericBaseDatastore. 

192 return [self.getStoredItemInfo(ref)] 

193 

194 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

195 # Docstring inherited from GenericBaseDatastore. 

196 # If a component has been removed previously then we can sometimes 

197 # be asked to remove it again. Other datastores ignore this 

198 # so we also ignore it here. 

199 if ref.id not in self.records: 

200 return 

201 record = self.records[ref.id] 

202 del self.records[ref.id] 

203 self.related[record.parentID].remove(ref.id) 

204 

205 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]: 

206 """Check that the dataset is present and return the real ID and 

207 associated information. 

208 

209 Parameters 

210 ---------- 

211 ref : `DatasetRef` 

212 Target `DatasetRef`. 

213 

214 Returns 

215 ------- 

216 realID : `DatasetId` 

217 The dataset ID associated with this ref that should be used. This 

218 could either be the ID of the supplied `DatasetRef` or the parent. 

219 storageInfo : `StoredMemoryItemInfo` 

220 Associated storage information. 

221 

222 Raises 

223 ------ 

224 FileNotFoundError 

225 Raised if the dataset is not present in this datastore. 

226 """ 

227 try: 

228 storedItemInfo = self.getStoredItemInfo(ref) 

229 except KeyError: 

230 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None 

231 realID = ref.id 

232 if storedItemInfo.parentID is not None:  # 232 ↛ 235: line 232 didn't jump to line 235, because the condition on line 232 was never false

233 realID = storedItemInfo.parentID 

234 

235 if realID not in self.datasets:  # 235 ↛ 236: line 235 didn't jump to line 236, because the condition on line 235 was never true

236 raise FileNotFoundError(f"No such file dataset in memory: {ref}") 

237 

238 return realID, storedItemInfo 
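
# Illustrative sketch of how the result above is used (``datastore`` and
# ``some_ref`` are hypothetical): for an ordinary dataset the two ids coincide,
# while a stored composite component carries its parent's id, which is the key
# into ``self.datasets``.
#
#     real_id, info = datastore._get_dataset_info(some_ref)
#     obj = datastore.datasets[real_id]      # the stored Python object
#     assert info.dataset_id == some_ref.id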

239 

240 def knows(self, ref: DatasetRef) -> bool: 

241 """Check if the dataset is known to the datastore. 

242 

243 This datastore does not distinguish dataset existence from knowledge 

244 of a dataset. 

245 

246 Parameters 

247 ---------- 

248 ref : `DatasetRef` 

249 Reference to the required dataset. 

250 

251 Returns 

252 ------- 

253 exists : `bool` 

254 `True` if the dataset is known to the datastore. 

255 """ 

256 return self.exists(ref) 

257 

258 def exists(self, ref: DatasetRef) -> bool: 

259 """Check if the dataset exists in the datastore. 

260 

261 Parameters 

262 ---------- 

263 ref : `DatasetRef` 

264 Reference to the required dataset. 

265 

266 Returns 

267 ------- 

268 exists : `bool` 

269 `True` if the entity exists in the `Datastore`. 

270 """ 

271 try: 

272 self._get_dataset_info(ref) 

273 except FileNotFoundError: 

274 return False 

275 return True 

276 

277 def get( 

278 self, 

279 ref: DatasetRef, 

280 parameters: Optional[Mapping[str, Any]] = None, 

281 storageClass: Optional[Union[StorageClass, str]] = None, 

282 ) -> Any: 

283 """Load an InMemoryDataset from the store. 

284 

285 Parameters 

286 ---------- 

287 ref : `DatasetRef` 

288 Reference to the required Dataset. 

289 parameters : `dict` 

290 `StorageClass`-specific parameters that specify, for example, 

291 a slice of the dataset to be loaded. 

292 storageClass : `StorageClass` or `str`, optional 

293 The storage class to be used to override the Python type 

294 returned by this method. By default the returned type matches 

295 the dataset type definition for this dataset. Specifying a 

296 read `StorageClass` can force a different type to be returned. 

297 This type must be compatible with the original type. 

298 

299 Returns 

300 ------- 

301 inMemoryDataset : `object` 

302 Requested dataset or slice thereof as an InMemoryDataset. 

303 

304 Raises 

305 ------ 

306 FileNotFoundError 

307 Requested dataset cannot be retrieved. 

308 TypeError 

309 Return value from formatter has unexpected type. 

310 ValueError 

311 Formatter failed to process the dataset. 

312 """ 

313 

314 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

315 

316 realID, storedItemInfo = self._get_dataset_info(ref) 

317 

318 # We have a write storage class and a read storage class and they 

319 # can be different for concrete composites or if overridden. 

320 if storageClass is not None: 

321 ref = ref.overrideStorageClass(storageClass) 

322 refStorageClass = ref.datasetType.storageClass 

323 writeStorageClass = storedItemInfo.storageClass 

324 

325 component = ref.datasetType.component() 

326 

327 # Check that the supplied parameters are suitable for the type read 

328 # If this is a derived component we validate against the composite 

329 isDerivedComponent = False 

330 if component in writeStorageClass.derivedComponents: 

331 writeStorageClass.validateParameters(parameters) 

332 isDerivedComponent = True 

333 else: 

334 refStorageClass.validateParameters(parameters) 

335 

336 inMemoryDataset = self.datasets[realID] 

337 

338 # If this is a derived (read-only) component we need to apply parameters 

339 # before we retrieve the component. We assume that the parameters 

340 # will affect the data globally, before the derived component 

341 # is selected. 

342 if isDerivedComponent: 

343 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters) 

344 # Then disable parameters for later 

345 parameters = {} 

346 

347 # Check if we have a component. 

348 if component: 

349 # In-memory datastore must have stored the dataset as a single 

350 # object in the write storage class. We therefore use that 

351 # storage class delegate to obtain the component. 

352 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component) 

353 

354 # Since there is no formatter to process parameters, they all must be 

355 # passed to the assembler. 

356 inMemoryDataset = self._post_process_get( 

357 inMemoryDataset, refStorageClass, parameters, isComponent=component is not None 

358 ) 

359 

360 # Last minute type conversion. 

361 return refStorageClass.coerce_type(inMemoryDataset) 
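
# Illustrative flow for a derived-component read (the component name and
# parameters are hypothetical): parameters are applied to the composite first,
# then the component is extracted with the write storage class delegate,
# mirroring the branches above.
#
#     delegate = writeStorageClass.delegate()
#     composite = delegate.handleParameters(composite, parameters)
#     part = delegate.getComponent(composite, component)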

362 

363 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

364 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

365 

366 Parameters 

367 ---------- 

368 inMemoryDataset : `object` 

369 The dataset to store. 

370 ref : `DatasetRef` 

371 Reference to the associated Dataset. 

372 

373 Raises 

374 ------ 

375 TypeError 

376 Supplied object and storage class are inconsistent. 

377 DatasetTypeNotSupportedError 

378 The associated `DatasetType` is not handled by this datastore. 

379 

380 Notes 

381 ----- 

382 If the datastore is configured to reject certain dataset types it 

383 is possible that the put will fail and raise a 

384 `DatasetTypeNotSupportedError`. The main use case for this is to 

385 allow `ChainedDatastore` to put to multiple datastores without 

386 requiring that every datastore accepts the dataset. 

387 """ 

388 # May need to coerce the in-memory dataset to the correct 

389 # Python type, otherwise parameters may not work. 

390 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

391 

392 self._validate_put_parameters(inMemoryDataset, ref) 

393 

394 self.datasets[ref.id] = inMemoryDataset 

395 log.debug("Store %s in %s", ref, self.name) 

396 

397 # Store time we received this content, to allow us to optionally 

398 # expire it. Instead of storing a filename here, we include the 

399 # ID of this datasetRef so we can find it from components. 

400 itemInfo = StoredMemoryItemInfo( 

401 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.id 

402 ) 

403 

404 # We have to register this content with registry. 

405 # Currently this assumes we have a file so we need to use stub entries 

406 # TODO: Add to ephemeral part of registry 

407 self._register_datasets([(ref, itemInfo)]) 

408 

409 if self._transaction is not None: 

410 self._transaction.registerUndo("put", self.remove, ref) 
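
# A minimal, self-contained sketch (stdlib stand-ins; ids and payload are made
# up) of the bookkeeping ``put`` performs: the object is stored under its
# dataset id and a record captures the arrival time plus the parent id that
# later reads resolve through.

import time
import uuid
from typing import Any, Dict, Tuple

datasets: Dict[uuid.UUID, Any] = {}
records: Dict[uuid.UUID, Tuple[float, uuid.UUID]] = {}

dataset_id = uuid.uuid4()
datasets[dataset_id] = {"payload": [1, 2, 3]}
records[dataset_id] = (time.time(), dataset_id)  # parentID == own id on put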

411 

412 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

413 """Return URIs associated with dataset. 

414 

415 Parameters 

416 ---------- 

417 ref : `DatasetRef` 

418 Reference to the required dataset. 

419 predict : `bool`, optional 

420 If the datastore does not know about the dataset, should it 

421 return a predicted URI or not? 

422 

423 Returns 

424 ------- 

425 uris : `DatasetRefURIs` 

426 The URI to the primary artifact associated with this dataset (if 

427 the dataset was disassembled within the datastore this may be 

428 `None`), and the URIs to any components associated with the dataset 

429 artifact (can be empty if there are no components). 

430 

431 Notes 

432 ----- 

433 The URIs returned for in-memory datastores are not usable but 

434 provide an indication of the associated dataset. 

435 """ 

436 

437 # Include the dataID as a URI query 

438 query = urlencode(ref.dataId) 

439 

440 # if this has never been written then we have to guess 

441 if not self.exists(ref): 

442 if not predict: 

443 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

444 name = f"{ref.datasetType.name}" 

445 fragment = "#predicted" 

446 else: 

447 realID, _ = self._get_dataset_info(ref) 

448 name = f"{id(self.datasets[realID])}?{query}" 

449 fragment = "" 

450 

451 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {}) 
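
# A short, self-contained illustration of the URI shape produced above; the
# data ID, dataset type name and object id below are made up, the real values
# come from ``ref.dataId``, ``ref.datasetType.name`` and ``id()`` of the
# stored object.

from urllib.parse import urlencode

query = urlencode({"instrument": "HSC", "visit": 903334})
existing_uri = f"mem://140234567890?{query}"
predicted_uri = f"mem://calexp?{query}#predicted"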

452 

453 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

454 """URI to the Dataset. 

455 

456 Always uses "mem://" URI prefix. 

457 

458 Parameters 

459 ---------- 

460 ref : `DatasetRef` 

461 Reference to the required Dataset. 

462 predict : `bool` 

463 If `True`, allow URIs to be returned of datasets that have not 

464 been written. 

465 

466 Returns 

467 ------- 

468 uri : `ResourcePath` 

469 URI pointing to the dataset within the datastore. If the 

470 dataset does not exist in the datastore, and if ``predict`` is 

471 `True`, the URI will be a prediction and will include a URI 

472 fragment "#predicted". 

473 If the datastore does not have entities that relate well 

474 to the concept of a URI the returned URI string will be 

475 descriptive. The returned URI is not guaranteed to be obtainable. 

476 

477 Raises 

478 ------ 

479 FileNotFoundError 

480 A URI has been requested for a dataset that does not exist and 

481 guessing is not allowed. 

482 AssertionError 

483 Raised if an internal error occurs. 

484 """ 

485 primary, _ = self.getURIs(ref, predict) 

486 if primary is None: 

487 # This should be impossible since this datastore does 

488 # not disassemble. This check also helps mypy. 

489 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}") 

490 return primary 

491 

492 def retrieveArtifacts( 

493 self, 

494 refs: Iterable[DatasetRef], 

495 destination: ResourcePath, 

496 transfer: str = "auto", 

497 preserve_path: bool = True, 

498 overwrite: Optional[bool] = False, 

499 ) -> List[ResourcePath]: 

500 """Retrieve the file artifacts associated with the supplied refs. 

501 

502 Notes 

503 ----- 

504 Not implemented by this datastore. 

505 """ 

506 # Could conceivably launch a FileDatastore to use formatters to write 

507 # the data but this is fraught with problems. 

508 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.") 

509 

510 def forget(self, refs: Iterable[DatasetRef]) -> None: 

511 # Docstring inherited. 

512 refs = list(refs) 

513 self._bridge.forget(refs) 

514 for ref in refs: 

515 self.removeStoredItemInfo(ref) 

516 

517 @transactional 

518 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None: 

519 """Indicate to the Datastore that a dataset can be removed. 

520 

521 Parameters 

522 ---------- 

523 ref : `DatasetRef` or iterable thereof 

524 Reference to the required Dataset(s). 

525 ignore_errors : `bool`, optional 

526 Indicate that errors should be ignored. 

527 

528 Raises 

529 ------ 

530 FileNotFoundError 

531 Attempt to remove a dataset that does not exist. Only relevant 

532 if a single dataset ref is given. 

533 

534 Notes 

535 ----- 

536 Concurrency should not normally be an issue for the in-memory datastore 

537 since all internal changes are isolated to this process and 

538 the registry only changes rows associated with this process. 

539 """ 

540 if not isinstance(ref, DatasetRef): 

541 log.debug("Bulk trashing of datasets in datastore %s", self.name) 

542 self.bridge.moveToTrash(ref, transaction=self._transaction) 

543 return 

544 

545 log.debug("Trash %s in datastore %s", ref, self.name) 

546 

547 # Check that this dataset is known to datastore 

548 try: 

549 self._get_dataset_info(ref) 

550 

551 # Move datasets to trash table 

552 self.bridge.moveToTrash([ref], transaction=self._transaction) 

553 except Exception as e: 

554 if ignore_errors:  # 554 ↛ 555: line 554 didn't jump to line 555, because the condition on line 554 was never true

555 log.warning( 

556 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e 

557 ) 

558 else: 

559 raise 

560 

561 def emptyTrash(self, ignore_errors: bool = False) -> None: 

562 """Remove all datasets from the trash. 

563 

564 Parameters 

565 ---------- 

566 ignore_errors : `bool`, optional 

567 Ignore errors. 

568 

569 Notes 

570 ----- 

571 The internal tracking of datasets is affected by this method and 

572 transaction handling is not supported if there is a problem before 

573 the datasets themselves are deleted. 

574 

575 Concurrency should not normally be an issue for the in-memory datastore 

576 since all internal changes are isolated to this process and 

577 the registry only changes rows associated with this process. 

578 """ 

579 log.debug("Emptying trash in datastore %s", self.name) 

580 with self._bridge.emptyTrash() as trash_data: 

581 trashed, _ = trash_data 

582 for ref, _ in trashed: 

583 try: 

584 realID, _ = self._get_dataset_info(ref) 

585 except FileNotFoundError:  # 585 ↛ 588: line 585 didn't jump to line 588

586 # Dataset already removed so ignore it 

587 continue 

588 except Exception as e: 

589 if ignore_errors: 

590 log.warning( 

591 "Emptying trash in datastore %s but encountered an error with dataset %s: %s", 

592 self.name, 

593 ref.id, 

594 e, 

595 ) 

596 continue 

597 else: 

598 raise 

599 

600 # Determine whether all references to this dataset have been 

601 # removed and we can delete the dataset itself 

602 allRefs = self.related[realID] 

603 remainingRefs = allRefs - {ref.id} 

604 if not remainingRefs:  # 604 ↛ 609: line 604 didn't jump to line 609, because the condition on line 604 was never false

605 log.debug("Removing artifact %s from datastore %s", realID, self.name) 

606 del self.datasets[realID] 

607 

608 # Remove this entry 

609 self.removeStoredItemInfo(ref) 

610 

611 def validateConfiguration( 

612 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

613 ) -> None: 

614 """Validate some of the configuration for this datastore. 

615 

616 Parameters 

617 ---------- 

618 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

619 Entities to test against this configuration. Can be of differing 

620 types. 

621 logFailures : `bool`, optional 

622 If `True`, output a log message for every validation error 

623 detected. 

624 

625 Raises 

626 ------ 

627 DatastoreValidationError 

628 Raised if there is a validation problem with a configuration. 

629 All the problems are reported in a single exception. 

630 

631 Notes 

632 ----- 

633 This method is a no-op. 

634 """ 

635 return 

636 

637 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]: 

638 # Docstring is inherited from base class 

639 return transfer 

640 

641 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

642 # Docstring is inherited from base class 

643 return 

644 

645 def getLookupKeys(self) -> Set[LookupKey]: 

646 # Docstring is inherited from base class 

647 return self.constraints.getLookupKeys() 

648 

649 def needs_expanded_data_ids( 

650 self, 

651 transfer: Optional[str], 

652 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

653 ) -> bool: 

654 # Docstring inherited. 

655 return False 

656 

657 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

658 # Docstring inherited from the base class. 

659 return 

660 

661 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

662 # Docstring inherited from the base class. 

663 

664 # In-memory Datastore records cannot be exported or imported 

665 return {}