Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 92%

181 statements  

coverage.py v7.2.7, created at 2023-06-14 09:10 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""In-memory datastore.""" 

25 

26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore") 

27 

28import logging 

29import time 

30from collections.abc import Iterable, Mapping 

31from dataclasses import dataclass 

32from typing import TYPE_CHECKING, Any 

33from urllib.parse import urlencode 

34 

35from lsst.daf.butler import ( 

36 DatasetId, 

37 DatasetRef, 

38 DatasetRefURIs, 

39 DatastoreRecordData, 

40 StorageClass, 

41 StoredDatastoreItemInfo, 

42) 

43from lsst.daf.butler.core.utils import transactional 

44from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge 

45from lsst.resources import ResourcePath 

46 

47from .genericDatastore import GenericBaseDatastore 

48 

49if TYPE_CHECKING: 

50 from lsst.daf.butler import Config, DatasetType, LookupKey 

51 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

52 

53log = logging.getLogger(__name__) 

54 

55 

56@dataclass(frozen=True) 

57class StoredMemoryItemInfo(StoredDatastoreItemInfo): 

58 """Internal InMemoryDatastore Metadata associated with a stored 

59 DatasetRef. 

60 """ 

61 

62 __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"} 

63 

64 timestamp: float 

65 """Unix timestamp indicating the time the dataset was stored.""" 

66 

67 storageClass: StorageClass 

68 """StorageClass associated with the dataset.""" 

69 

70 parentID: DatasetId 

71 """ID of the parent `DatasetRef` if this entry is a concrete 

72 composite. Not used if the dataset being stored is not a 

73 virtual component of a composite. 

74 """ 

75 

76 dataset_id: DatasetId 

77 """DatasetId associated with this record.""" 

78 
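# A minimal sketch of how these records are built: put() later in this file
# constructs one per stored object, using the ref's own ID as both parentID
# and dataset_id, e.g.
#     StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
#                          parentID=ref.id, dataset_id=ref.id)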

79 

80class InMemoryDatastore(GenericBaseDatastore): 

81 """Basic Datastore for writing to an in memory cache. 

82 

83 This datastore is ephemeral in that the contents of the datastore 

84 disappear when the Python process completes. This also means that 

85 other processes cannot access this datastore. 

86 

87 Parameters 

88 ---------- 

89 config : `DatastoreConfig` or `str` 

90 Configuration. 

91 bridgeManager : `DatastoreRegistryBridgeManager` 

92 Object that manages the interface between `Registry` and datastores. 

93 butlerRoot : `str`, optional 

94 Unused parameter. 

95 

96 Notes 

97 ----- 

98 InMemoryDatastore does not support any file-based ingest. 

99 """ 

100 

101 defaultConfigFile = "datastores/inMemoryDatastore.yaml" 

102 """Path to configuration defaults. Accessed within the ``configs`` resource 

103 or relative to a search path. Can be None if no defaults specified. 

104 """ 

105 

106 isEphemeral = True 

107 """A new datastore is created every time and datasets disappear when 

108 the process shuts down.""" 

109 

110 datasets: dict[DatasetId, Any] 

111 """Internal storage of datasets indexed by dataset ID.""" 

112 

113 records: dict[DatasetId, StoredMemoryItemInfo] 

114 """Internal records about stored datasets.""" 

115 

116 def __init__( 

117 self, 

118 config: Config | str, 

119 bridgeManager: DatastoreRegistryBridgeManager, 

120 butlerRoot: str | None = None, 

121 ): 

122 super().__init__(config, bridgeManager) 

123 

124 # Name ourselves with the timestamp the datastore 

125 # was created. 

126 self.name = f"{type(self).__name__}@{time.time()}" 

127 log.debug("Creating datastore %s", self.name) 

128 

129 # Storage of datasets, keyed by dataset_id 

130 self.datasets: dict[DatasetId, Any] = {} 

131 

132 # Records is distinct in order to track concrete composite components 

133 # where we register multiple components for a single dataset. 

134 self.records: dict[DatasetId, StoredMemoryItemInfo] = {} 

135 

136 # Related records that share the same parent 

137 self.related: dict[DatasetId, set[DatasetId]] = {} 

138 

139 self._bridge = bridgeManager.register(self.name, ephemeral=True) 

140 
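# A sketch of how the three mappings relate once datasets have been stored
# (the keys shown are placeholders, not real IDs):
#     self.datasets = {dataset_id: <stored Python object>}
#     self.records  = {ref_id: StoredMemoryItemInfo with parentID=dataset_id}
#     self.related  = {dataset_id: {ref_id, ...}}  # refs sharing that parent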

141 @classmethod 

142 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

143 """Set any filesystem-dependent config options for this Datastore to 

144 be appropriate for a new empty repository with the given root. 

145 

146 Does nothing in this implementation. 

147 

148 Parameters 

149 ---------- 

150 root : `str` 

151 Filesystem path to the root of the data repository. 

152 config : `Config` 

153 A `Config` to update. Only the subset understood by 

154 this component will be updated. Will not expand 

155 defaults. 

156 full : `Config` 

157 A complete config with all defaults expanded that can be 

158 converted to a `DatastoreConfig`. Read-only and will not be 

159 modified by this method. 

160 Repository-specific options that should not be obtained 

161 from defaults when Butler instances are constructed 

162 should be copied from ``full`` to ``config``. 

163 overwrite : `bool`, optional 

164 If `False`, do not modify a value in ``config`` if the value 

165 already exists. Default is always to overwrite with the provided 

166 ``root``. 

167 

168 Notes 

169 ----- 

170 If a keyword is explicitly defined in the supplied ``config`` it 

171 will not be overridden by this method if ``overwrite`` is `False`. 

172 This allows explicit values set in external configs to be retained. 

173 """ 

174 return 

175 

176 @property 

177 def bridge(self) -> DatastoreRegistryBridge: 

178 # Docstring inherited from GenericBaseDatastore. 

179 return self._bridge 

180 

181 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None: 

182 # Docstring inherited from GenericBaseDatastore. 

183 for ref, info in zip(refs, infos): 

184 self.records[ref.id] = info 

185 self.related.setdefault(info.parentID, set()).add(ref.id) 

186 

187 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo: 

188 # Docstring inherited from GenericBaseDatastore. 

189 return self.records[ref.id] 

190 

191 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredMemoryItemInfo]: 

192 # Docstring inherited from GenericBaseDatastore. 

193 return [self.getStoredItemInfo(ref)] 

194 

195 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

196 # Docstring inherited from GenericBaseDatastore. 

197 # If a component has been removed previously then we can sometimes 

198 # be asked to remove it again. Other datastores ignore this 

199 # so also ignore here 

200 if ref.id not in self.records: 

201 return 

202 record = self.records[ref.id] 

203 del self.records[ref.id] 

204 self.related[record.parentID].remove(ref.id) 

205 

206 def _get_dataset_info(self, ref: DatasetIdRef) -> tuple[DatasetId, StoredMemoryItemInfo]: 

207 """Check that the dataset is present and return the real ID and 

208 associated information. 

209 

210 Parameters 

211 ---------- 

212 ref : `DatasetRef` 

213 Target `DatasetRef`. 

214 

215 Returns 

216 ------- 

217 realID : `DatasetId` 

218 The dataset ID associated with this ref that should be used. This 

219 could either be the ID of the supplied `DatasetRef` or the parent. 

220 storageInfo : `StoredMemoryItemInfo` 

221 Associated storage information. 

222 

223 Raises 

224 ------ 

225 FileNotFoundError 

226 Raised if the dataset is not present in this datastore. 

227 """ 

228 try: 

229 storedItemInfo = self.getStoredItemInfo(ref) 

230 except KeyError: 

231 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None 

232 realID = ref.id 

233 if storedItemInfo.parentID is not None:    233 ↛ 236 (the condition on line 233 was never false)

234 realID = storedItemInfo.parentID 

235 

236 if realID not in self.datasets:    236 ↛ 237 (the condition on line 236 was never true)

237 raise FileNotFoundError(f"No such file dataset in memory: {ref}") 

238 

239 return realID, storedItemInfo 

240 

241 def knows(self, ref: DatasetRef) -> bool: 

242 """Check if the dataset is known to the datastore. 

243 

244 This datastore does not distinguish dataset existence from knowledge 

245 of a dataset. 

246 

247 Parameters 

248 ---------- 

249 ref : `DatasetRef` 

250 Reference to the required dataset. 

251 

252 Returns 

253 ------- 

254 exists : `bool` 

255 `True` if the dataset is known to the datastore. 

256 """ 

257 return self.exists(ref) 

258 

259 def exists(self, ref: DatasetRef) -> bool: 

260 """Check if the dataset exists in the datastore. 

261 

262 Parameters 

263 ---------- 

264 ref : `DatasetRef` 

265 Reference to the required dataset. 

266 

267 Returns 

268 ------- 

269 exists : `bool` 

270 `True` if the entity exists in the `Datastore`. 

271 """ 

272 try: 

273 self._get_dataset_info(ref) 

274 except FileNotFoundError: 

275 return False 

276 return True 

277 

278 def get( 

279 self, 

280 ref: DatasetRef, 

281 parameters: Mapping[str, Any] | None = None, 

282 storageClass: StorageClass | str | None = None, 

283 ) -> Any: 

284 """Load an InMemoryDataset from the store. 

285 

286 Parameters 

287 ---------- 

288 ref : `DatasetRef` 

289 Reference to the required Dataset. 

290 parameters : `dict` 

291 `StorageClass`-specific parameters that specify, for example, 

292 a slice of the dataset to be loaded. 

293 storageClass : `StorageClass` or `str`, optional 

294 The storage class to be used to override the Python type 

295 returned by this method. By default the returned type matches 

296 the dataset type definition for this dataset. Specifying a 

297 read `StorageClass` can force a different type to be returned. 

298 This type must be compatible with the original type. 

299 

300 Returns 

301 ------- 

302 inMemoryDataset : `object` 

303 Requested dataset or slice thereof as an InMemoryDataset. 

304 

305 Raises 

306 ------ 

307 FileNotFoundError 

308 Requested dataset can not be retrieved. 

309 TypeError 

310 Return value from formatter has unexpected type. 

311 ValueError 

312 Formatter failed to process the dataset. 

313 """ 

314 

315 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

316 

317 realID, storedItemInfo = self._get_dataset_info(ref) 

318 

319 # We have a write storage class and a read storage class and they 

320 # can be different for concrete composites or if overridden. 

321 if storageClass is not None: 

322 ref = ref.overrideStorageClass(storageClass) 

323 refStorageClass = ref.datasetType.storageClass 

324 writeStorageClass = storedItemInfo.storageClass 

325 

326 component = ref.datasetType.component() 

327 

328 # Check that the supplied parameters are suitable for the type read 

329 # If this is a derived component we validate against the composite 

330 isDerivedComponent = False 

331 if component in writeStorageClass.derivedComponents: 

332 writeStorageClass.validateParameters(parameters) 

333 isDerivedComponent = True 

334 else: 

335 refStorageClass.validateParameters(parameters) 

336 

337 inMemoryDataset = self.datasets[realID] 

338 

339 # If this is a derived component we need to apply parameters 

340 # before we retrieve the component. We assume that the parameters 

341 # will affect the data globally, before the derived component 

342 # is selected. 

343 if isDerivedComponent: 

344 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters) 

345 # Then disable parameters for later 

346 parameters = {} 

347 

348 # Check if we have a component. 

349 if component: 

350 # In-memory datastore must have stored the dataset as a single 

351 # object in the write storage class. We therefore use that 

352 # storage class delegate to obtain the component. 

353 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component) 

354 

355 # Since there is no formatter to process parameters, they all must be 

356 # passed to the assembler. 

357 inMemoryDataset = self._post_process_get( 

358 inMemoryDataset, refStorageClass, parameters, isComponent=component is not None 

359 ) 

360 

361 # Last minute type conversion. 

362 return refStorageClass.coerce_type(inMemoryDataset) 

363 

364 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

365 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

366 

367 Parameters 

368 ---------- 

369 inMemoryDataset : `object` 

370 The dataset to store. 

371 ref : `DatasetRef` 

372 Reference to the associated Dataset. 

373 

374 Raises 

375 ------ 

376 TypeError 

377 Supplied object and storage class are inconsistent. 

378 DatasetTypeNotSupportedError 

379 The associated `DatasetType` is not handled by this datastore. 

380 

381 Notes 

382 ----- 

383 If the datastore is configured to reject certain dataset types it 

384 is possible that the put will fail and raise a 

385 `DatasetTypeNotSupportedError`. The main use case for this is to 

386 allow `ChainedDatastore` to put to multiple datastores without 

387 requiring that every datastore accepts the dataset. 

388 """ 

389 # May need to coerce the in memory dataset to the correct 

390 # python type, otherwise parameters may not work. 

391 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

392 

393 self._validate_put_parameters(inMemoryDataset, ref) 

394 

395 self.datasets[ref.id] = inMemoryDataset 

396 log.debug("Store %s in %s", ref, self.name) 

397 

398 # Store time we received this content, to allow us to optionally 

399 # expire it. Instead of storing a filename here, we include the 

400 # ID of this datasetRef so we can find it from components. 

401 itemInfo = StoredMemoryItemInfo( 

402 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.id 

403 ) 

404 

405 # We have to register this content with registry. 

406 # Currently this assumes we have a file so we need to use stub entries 

407 # TODO: Add to ephemeral part of registry 

408 self._register_datasets([(ref, itemInfo)]) 

409 

410 if self._transaction is not None: 

411 self._transaction.registerUndo("put", self.remove, ref) 

412 

413 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

414 """Return URIs associated with dataset. 

415 

416 Parameters 

417 ---------- 

418 ref : `DatasetRef` 

419 Reference to the required dataset. 

420 predict : `bool`, optional 

421 If the datastore does not know about the dataset, should it 

422 return a predicted URI or not? 

423 

424 Returns 

425 ------- 

426 uris : `DatasetRefURIs` 

427 The URI to the primary artifact associated with this dataset (if 

428 the dataset was disassembled within the datastore this may be 

429 `None`), and the URIs to any components associated with the dataset 

430 artifact (can be empty if there are no components). 

431 

432 Notes 

433 ----- 

434 The URIs returned for in-memory datastores are not usable but 

435 provide an indication of the associated dataset. 

436 """ 

437 

438 # Include the dataID as a URI query 

439 query = urlencode(ref.dataId) 

440 

441 # if this has never been written then we have to guess 

442 if not self.exists(ref): 

443 if not predict: 

444 raise FileNotFoundError(f"Dataset {ref} not in this datastore") 

445 name = f"{ref.datasetType.name}" 

446 fragment = "#predicted" 

447 else: 

448 realID, _ = self._get_dataset_info(ref) 

449 name = f"{id(self.datasets[realID])}?{query}" 

450 fragment = "" 

451 

452 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {}) 

453 

454 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

455 """URI to the Dataset. 

456 

457 Always uses "mem://" URI prefix. 

458 

459 Parameters 

460 ---------- 

461 ref : `DatasetRef` 

462 Reference to the required Dataset. 

463 predict : `bool` 

464 If `True`, allow URIs to be returned of datasets that have not 

465 been written. 

466 

467 Returns 

468 ------- 

469 uri : `lsst.resources.ResourcePath` 

470 URI pointing to the dataset within the datastore. If the 

471 dataset does not exist in the datastore, and if ``predict`` is 

472 `True`, the URI will be a prediction and will include a URI 

473 fragment "#predicted". 

474 If the datastore does not have entities that relate well 

475 to the concept of a URI, the returned URI string will be 

476 descriptive. The returned URI is not guaranteed to be obtainable. 

477 

478 Raises 

479 ------ 

480 FileNotFoundError 

481 A URI has been requested for a dataset that does not exist and 

482 guessing is not allowed. 

483 AssertionError 

484 Raised if an internal error occurs. 

485 """ 

486 primary, _ = self.getURIs(ref, predict) 

487 if primary is None: 

488 # This should be impossible since this datastore does 

489 # not disassemble. This check also helps mypy. 

490 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}") 

491 return primary 

492 

493 def retrieveArtifacts( 

494 self, 

495 refs: Iterable[DatasetRef], 

496 destination: ResourcePath, 

497 transfer: str = "auto", 

498 preserve_path: bool = True, 

499 overwrite: bool | None = False, 

500 ) -> list[ResourcePath]: 

501 """Retrieve the file artifacts associated with the supplied refs. 

502 

503 Notes 

504 ----- 

505 Not implemented by this datastore. 

506 """ 

507 # Could conceivably launch a FileDatastore to use formatters to write 

508 # the data but this is fraught with problems. 

509 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.") 

510 

511 def forget(self, refs: Iterable[DatasetRef]) -> None: 

512 # Docstring inherited. 

513 refs = list(refs) 

514 self._bridge.forget(refs) 

515 for ref in refs: 

516 self.removeStoredItemInfo(ref) 

517 

518 @transactional 

519 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = False) -> None: 

520 """Indicate to the Datastore that a dataset can be removed. 

521 

522 Parameters 

523 ---------- 

524 ref : `DatasetRef` or iterable thereof 

525 Reference to the required Dataset(s). 

526 ignore_errors : `bool`, optional 

527 Indicate that errors should be ignored. 

528 

529 Raises 

530 ------ 

531 FileNotFoundError 

532 Attempt to remove a dataset that does not exist. Only relevant 

533 if a single dataset ref is given. 

534 

535 Notes 

536 ----- 

537 Concurrency should not normally be an issue for the in-memory datastore 

538 since all internal changes are isolated to solely this process and 

539 the registry only changes rows associated with this process. 

540 """ 

541 if not isinstance(ref, DatasetRef): 

542 log.debug("Bulk trashing of datasets in datastore %s", self.name) 

543 self.bridge.moveToTrash(ref, transaction=self._transaction) 

544 return 

545 

546 log.debug("Trash %s in datastore %s", ref, self.name) 

547 

548 # Check that this dataset is known to datastore 

549 try: 

550 self._get_dataset_info(ref) 

551 

552 # Move datasets to trash table 

553 self.bridge.moveToTrash([ref], transaction=self._transaction) 

554 except Exception as e: 

555 if ignore_errors:    555 ↛ 556 (the condition on line 555 was never true)

556 log.warning( 

557 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e 

558 ) 

559 else: 

560 raise 

561 

562 def emptyTrash(self, ignore_errors: bool = False) -> None: 

563 """Remove all datasets from the trash. 

564 

565 Parameters 

566 ---------- 

567 ignore_errors : `bool`, optional 

568 Ignore errors. 

569 

570 Notes 

571 ----- 

572 The internal tracking of datasets is affected by this method and 

573 transaction handling is not supported if there is a problem before 

574 the datasets themselves are deleted. 

575 

576 Concurrency should not normally be an issue for the in-memory datastore 

577 since all internal changes are isolated to solely this process and 

578 the registry only changes rows associated with this process. 

579 """ 

580 log.debug("Emptying trash in datastore %s", self.name) 

581 with self._bridge.emptyTrash() as trash_data: 

582 trashed, _ = trash_data 

583 for ref, _ in trashed: 

584 try: 

585 realID, _ = self._get_dataset_info(ref) 

586 except FileNotFoundError:    586 ↛ 589

587 # Dataset already removed so ignore it 

588 continue 

589 except Exception as e: 

590 if ignore_errors: 

591 log.warning( 

592 "Emptying trash in datastore %s but encountered an error with dataset %s: %s", 

593 self.name, 

594 ref.id, 

595 e, 

596 ) 

597 continue 

598 else: 

599 raise 

600 

601 # Determine whether all references to this dataset have been 

602 # removed and we can delete the dataset itself 

603 allRefs = self.related[realID] 

604 remainingRefs = allRefs - {ref.id} 

605 if not remainingRefs:    605 ↛ 610 (the condition on line 605 was never false)

606 log.debug("Removing artifact %s from datastore %s", realID, self.name) 

607 del self.datasets[realID] 

608 

609 # Remove this entry 

610 self.removeStoredItemInfo(ref) 

611 

612 def validateConfiguration( 

613 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

614 ) -> None: 

615 """Validate some of the configuration for this datastore. 

616 

617 Parameters 

618 ---------- 

619 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

620 Entities to test against this configuration. Can be differing 

621 types. 

622 logFailures : `bool`, optional 

623 If `True`, output a log message for every validation error 

624 detected. 

625 

626 Raises 

627 ------ 

628 DatastoreValidationError 

629 Raised if there is a validation problem with a configuration. 

630 All the problems are reported in a single exception. 

631 

632 Notes 

633 ----- 

634 This method is a no-op. 

635 """ 

636 return 

637 

638 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

639 # Docstring is inherited from base class 

640 return transfer 

641 

642 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

643 # Docstring is inherited from base class 

644 return 

645 

646 def getLookupKeys(self) -> set[LookupKey]: 

647 # Docstring is inherited from base class 

648 return self.constraints.getLookupKeys() 

649 

650 def needs_expanded_data_ids( 

651 self, 

652 transfer: str | None, 

653 entity: DatasetRef | DatasetType | StorageClass | None = None, 

654 ) -> bool: 

655 # Docstring inherited. 

656 return False 

657 

658 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

659 # Docstring inherited from the base class. 

660 return 

661 

662 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

663 # Docstring inherited from the base class. 

664 

665 # In-memory Datastore records cannot be exported or imported 

666 return {}