Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 93%

182 statements  


1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""In-memory datastore.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore") 

27 

28import logging 

29import time 

30from collections.abc import Iterable, Mapping 

31from dataclasses import dataclass 

32from typing import TYPE_CHECKING, Any 

33from urllib.parse import urlencode 

34 

35from lsst.daf.butler import ( 

36 DatasetId, 

37 DatasetRef, 

38 DatasetRefURIs, 

39 DatastoreRecordData, 

40 StorageClass, 

41 StoredDatastoreItemInfo, 

42) 

43from lsst.daf.butler.core.utils import transactional 

44from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge 

45from lsst.resources import ResourcePath 

46 

47from ..registry.interfaces import DatabaseInsertMode 

48from .genericDatastore import GenericBaseDatastore 

49 

50if TYPE_CHECKING: 

51 from lsst.daf.butler import Config, DatasetType, LookupKey 

52 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

53 

54log = logging.getLogger(__name__) 

55 

56 

57@dataclass(frozen=True) 

58class StoredMemoryItemInfo(StoredDatastoreItemInfo): 

59 """Internal InMemoryDatastore Metadata associated with a stored 

60 DatasetRef. 

61 """ 

62 

63 __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"} 

64 

65 timestamp: float 

66 """Unix timestamp indicating the time the dataset was stored.""" 

67 

68 storageClass: StorageClass 

69 """StorageClass associated with the dataset.""" 

70 

71 parentID: DatasetId 

72 """ID of the parent `DatasetRef` if this entry is a concrete 

73 composite. Not used if the dataset being stored is not a 

74 virtual component of a composite.

75 """ 

76 

77 dataset_id: DatasetId 

78 """DatasetId associated with this record.""" 

79 
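# Editor's note: an illustrative sketch (not part of the original source) of how
# ``put`` below builds one of these records for a newly stored dataset, assuming
# ``ref`` is a resolved `DatasetRef`:
#
#     info = StoredMemoryItemInfo(
#         time.time(),                    # timestamp
#         ref.datasetType.storageClass,   # storageClass
#         parentID=ref.id,                # equals the dataset ID for a plain put
#         dataset_id=ref.id,
#     )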

80 

81class InMemoryDatastore(GenericBaseDatastore): 

82 """Basic Datastore for writing to an in memory cache. 

83 

84 This datastore is ephemeral in that the contents of the datastore 

85 disappear when the Python process completes. This also means that 

86 other processes cannot access this datastore.

87 

88 Parameters 

89 ---------- 

90 config : `DatastoreConfig` or `str` 

91 Configuration. 

92 bridgeManager : `DatastoreRegistryBridgeManager` 

93 Object that manages the interface between `Registry` and datastores. 

94 butlerRoot : `str`, optional 

95 Unused parameter. 

96 

97 Notes 

98 ----- 

99 InMemoryDatastore does not support any file-based ingest. 

100 """ 

101 

102 defaultConfigFile = "datastores/inMemoryDatastore.yaml" 

103 """Path to configuration defaults. Accessed within the ``configs`` resource 

104 or relative to a search path. Can be `None` if no defaults are specified.

105 """ 

106 

107 isEphemeral = True 

108 """A new datastore is created every time and datasets disappear when 

109 the process shuts down.""" 

110 

111 datasets: dict[DatasetId, Any] 

112 """Internal storage of datasets indexed by dataset ID.""" 

113 

114 records: dict[DatasetId, StoredMemoryItemInfo] 

115 """Internal records about stored datasets.""" 

116 

117 def __init__( 

118 self, 

119 config: Config | str, 

120 bridgeManager: DatastoreRegistryBridgeManager, 

121 butlerRoot: str | None = None, 

122 ): 

123 super().__init__(config, bridgeManager) 

124 

125 # Name ourselves with the timestamp at which the datastore

126 # was created. 

127 self.name = f"{type(self).__name__}@{time.time()}" 

128 log.debug("Creating datastore %s", self.name) 

129 

130 # Storage of datasets, keyed by dataset_id 

131 self.datasets: dict[DatasetId, Any] = {} 

132 

133 # Records is distinct in order to track concrete composite components 

134 # where we register multiple components for a single dataset. 

135 self.records: dict[DatasetId, StoredMemoryItemInfo] = {} 

136 

137 # Related records that share the same parent 

138 self.related: dict[DatasetId, set[DatasetId]] = {} 

139 

140 self._bridge = bridgeManager.register(self.name, ephemeral=True) 

141 

142 @classmethod 

143 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

144 """Set any filesystem-dependent config options for this Datastore to 

145 be appropriate for a new empty repository with the given root. 

146 

147 Does nothing in this implementation. 

148 

149 Parameters 

150 ---------- 

151 root : `str` 

152 Filesystem path to the root of the data repository. 

153 config : `Config` 

154 A `Config` to update. Only the subset understood by 

155 this component will be updated. Will not expand 

156 defaults. 

157 full : `Config` 

158 A complete config with all defaults expanded that can be 

159 converted to a `DatastoreConfig`. Read-only and will not be 

160 modified by this method. 

161 Repository-specific options that should not be obtained 

162 from defaults when Butler instances are constructed 

163 should be copied from ``full`` to ``config``. 

164 overwrite : `bool`, optional 

165 If `False`, do not modify a value in ``config`` if the value 

166 already exists. Default is always to overwrite with the provided 

167 ``root``. 

168 

169 Notes 

170 ----- 

171 If a keyword is explicitly defined in the supplied ``config`` it 

172 will not be overridden by this method if ``overwrite`` is `False`. 

173 This allows explicit values set in external configs to be retained. 

174 """ 

175 return 

176 

177 @property 

178 def bridge(self) -> DatastoreRegistryBridge: 

179 # Docstring inherited from GenericBaseDatastore. 

180 return self._bridge 

181 

182 def addStoredItemInfo( 

183 self, 

184 refs: Iterable[DatasetRef], 

185 infos: Iterable[StoredMemoryItemInfo], 

186 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

187 ) -> None: 

188 # Docstring inherited from GenericBaseDatastore. 

189 for ref, info in zip(refs, infos, strict=True): 

190 self.records[ref.id] = info 

191 self.related.setdefault(info.parentID, set()).add(ref.id) 

192 

193 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo: 

194 # Docstring inherited from GenericBaseDatastore. 

195 return self.records[ref.id] 

196 

197 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredMemoryItemInfo]: 

198 # Docstring inherited from GenericBaseDatastore. 

199 return [self.getStoredItemInfo(ref)] 

200 

201 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

202 # Docstring inherited from GenericBaseDatastore. 

203 # If a component has been removed previously then we can sometimes 

204 # be asked to remove it again. Other datastores ignore this 

205 # so we also ignore it here.

206 if ref.id not in self.records: 

207 return 

208 record = self.records[ref.id] 

209 del self.records[ref.id] 

210 self.related[record.parentID].remove(ref.id) 

211 

212 def _get_dataset_info(self, ref: DatasetIdRef) -> tuple[DatasetId, StoredMemoryItemInfo]: 

213 """Check that the dataset is present and return the real ID and 

214 associated information. 

215 

216 Parameters 

217 ---------- 

218 ref : `DatasetRef` 

219 Target `DatasetRef`.

220 

221 Returns 

222 ------- 

223 realID : `DatasetId`

224 The dataset ID associated with this ref that should be used. This 

225 could either be the ID of the supplied `DatasetRef` or the parent. 

226 storageInfo : `StoredMemoryItemInfo` 

227 Associated storage information. 

228 

229 Raises 

230 ------ 

231 FileNotFoundError 

232 Raised if the dataset is not present in this datastore. 

233 """ 

234 try: 

235 storedItemInfo = self.getStoredItemInfo(ref) 

236 except KeyError: 

237 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None 

238 realID = ref.id 

239 if storedItemInfo.parentID is not None:

240 realID = storedItemInfo.parentID 

241 

242 if realID not in self.datasets:

243 raise FileNotFoundError(f"No such file dataset in memory: {ref}") 

244 

245 return realID, storedItemInfo 

246 
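# Editor's note: an illustrative summary (not from the source) of the bookkeeping
# consulted by ``_get_dataset_info`` above, assuming ``ref`` was previously ``put``:
#
#     info = self.records[ref.id]      # StoredMemoryItemInfo for this ref
#     real_id = info.parentID          # ID under which the object is actually stored
#     obj = self.datasets[real_id]     # the in-memory Python object
#     self.related[real_id]            # all record IDs sharing that stored object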

247 def knows(self, ref: DatasetRef) -> bool: 

248 """Check if the dataset is known to the datastore. 

249 

250 This datastore does not distinguish dataset existence from knowledge 

251 of a dataset. 

252 

253 Parameters 

254 ---------- 

255 ref : `DatasetRef` 

256 Reference to the required dataset. 

257 

258 Returns 

259 ------- 

260 exists : `bool` 

261 `True` if the dataset is known to the datastore. 

262 """ 

263 return self.exists(ref) 

264 

265 def exists(self, ref: DatasetRef) -> bool: 

266 """Check if the dataset exists in the datastore. 

267 

268 Parameters 

269 ---------- 

270 ref : `DatasetRef` 

271 Reference to the required dataset. 

272 

273 Returns 

274 ------- 

275 exists : `bool` 

276 `True` if the entity exists in the `Datastore`. 

277 """ 

278 try: 

279 self._get_dataset_info(ref) 

280 except FileNotFoundError: 

281 return False 

282 return True 

283 
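# Editor's note: because ``knows`` simply delegates to ``exists``, the two calls are
# interchangeable for this datastore (illustrative, assuming ``ref`` is a resolved
# `DatasetRef`):
#
#     datastore.knows(ref) == datastore.exists(ref)    # always holds here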

284 def get( 

285 self, 

286 ref: DatasetRef, 

287 parameters: Mapping[str, Any] | None = None, 

288 storageClass: StorageClass | str | None = None, 

289 ) -> Any: 

290 """Load an InMemoryDataset from the store. 

291 

292 Parameters 

293 ---------- 

294 ref : `DatasetRef` 

295 Reference to the required Dataset. 

296 parameters : `dict` 

297 `StorageClass`-specific parameters that specify, for example, 

298 a slice of the dataset to be loaded. 

299 storageClass : `StorageClass` or `str`, optional 

300 The storage class to be used to override the Python type 

301 returned by this method. By default the returned type matches 

302 the dataset type definition for this dataset. Specifying a 

303 read `StorageClass` can force a different type to be returned. 

304 This type must be compatible with the original type. 

305 

306 Returns 

307 ------- 

308 inMemoryDataset : `object` 

309 Requested dataset or slice thereof as an InMemoryDataset. 

310 

311 Raises 

312 ------ 

313 FileNotFoundError 

314 Requested dataset can not be retrieved. 

315 TypeError 

316 Return value from formatter has unexpected type. 

317 ValueError 

318 Formatter failed to process the dataset. 

319 """ 

320 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

321 

322 realID, storedItemInfo = self._get_dataset_info(ref) 

323 

324 # We have a write storage class and a read storage class and they 

325 # can be different for concrete composites or if overridden. 

326 if storageClass is not None: 

327 ref = ref.overrideStorageClass(storageClass) 

328 refStorageClass = ref.datasetType.storageClass 

329 writeStorageClass = storedItemInfo.storageClass 

330 

331 component = ref.datasetType.component() 

332 

333 # Check that the supplied parameters are suitable for the type read 

334 # If this is a derived component we validate against the composite 

335 isDerivedComponent = False 

336 if component in writeStorageClass.derivedComponents: 

337 writeStorageClass.validateParameters(parameters) 

338 isDerivedComponent = True 

339 else: 

340 refStorageClass.validateParameters(parameters) 

341 

342 inMemoryDataset = self.datasets[realID] 

343 

344 # If this is a derived (read-only) component we need to apply parameters

345 # before we retrieve the component. We assume that the parameters 

346 # will affect the data globally, before the derived component 

347 # is selected. 

348 if isDerivedComponent: 

349 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters) 

350 # Then disable parameters for later 

351 parameters = {} 

352 

353 # Check if we have a component. 

354 if component: 

355 # In-memory datastore must have stored the dataset as a single 

356 # object in the write storage class. We therefore use that 

357 # storage class delegate to obtain the component. 

358 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component) 

359 

360 # Since there is no formatter to process parameters, they all must be 

361 # passed to the assembler. 

362 inMemoryDataset = self._post_process_get( 

363 inMemoryDataset, refStorageClass, parameters, isComponent=component is not None 

364 ) 

365 

366 # Last minute type conversion. 

367 return refStorageClass.coerce_type(inMemoryDataset) 

368 
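# Editor's note: illustrative calls to ``get`` (not from the source).  They assume
# ``ref`` resolves to a stored dataset and that "StructuredDataDict" names a storage
# class compatible with the stored object:
#
#     obj = datastore.get(ref)                                      # full dataset
#     alt = datastore.get(ref, storageClass="StructuredDataDict")   # coerced return type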

369 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

370 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

371 

372 Parameters 

373 ---------- 

374 inMemoryDataset : `object` 

375 The dataset to store. 

376 ref : `DatasetRef` 

377 Reference to the associated Dataset. 

378 

379 Raises 

380 ------ 

381 TypeError 

382 Supplied object and storage class are inconsistent. 

383 DatasetTypeNotSupportedError 

384 The associated `DatasetType` is not handled by this datastore. 

385 

386 Notes 

387 ----- 

388 If the datastore is configured to reject certain dataset types it 

389 is possible that the put will fail and raise a 

390 `DatasetTypeNotSupportedError`. The main use case for this is to 

391 allow `ChainedDatastore` to put to multiple datastores without 

392 requiring that every datastore accepts the dataset. 

393 """ 

394 # May need to coerce the in-memory dataset to the correct

395 # Python type, otherwise parameters may not work.

396 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

397 

398 self._validate_put_parameters(inMemoryDataset, ref) 

399 

400 self.datasets[ref.id] = inMemoryDataset 

401 log.debug("Store %s in %s", ref, self.name) 

402 

403 # Store the time we received this content, to allow us to optionally

404 # expire it. Instead of storing a filename here, we include the 

405 # ID of this datasetRef so we can find it from components. 

406 itemInfo = StoredMemoryItemInfo( 

407 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.id 

408 ) 

409 

410 # We have to register this content with the registry.

411 # Currently this assumes we have a file, so we need to use stub entries.

412 # TODO: Add to ephemeral part of registry 

413 self._register_datasets([(ref, itemInfo)]) 

414 

415 if self._transaction is not None: 

416 self._transaction.registerUndo("put", self.remove, ref) 

417 
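# Editor's note: a minimal put/get round trip (illustrative only).  It assumes
# ``datastore`` is an `InMemoryDatastore` and ``ref`` is a resolved `DatasetRef`
# whose storage class matches the stored object:
#
#     datastore.put(my_object, ref)
#     assert datastore.exists(ref)
#     round_tripped = datastore.get(ref)   # same object, possibly coerced by storage class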

418 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

419 """Return URIs associated with dataset. 

420 

421 Parameters 

422 ---------- 

423 ref : `DatasetRef` 

424 Reference to the required dataset. 

425 predict : `bool`, optional 

426 If `True`, return a predicted URI for a dataset the datastore

427 does not know about instead of raising `FileNotFoundError`.

428 

429 Returns 

430 ------- 

431 uris : `DatasetRefURIs` 

432 The URI to the primary artifact associated with this dataset (if 

433 the dataset was disassembled within the datastore this may be 

434 `None`), and the URIs to any components associated with the dataset 

435 artifact (can be empty if there are no components).

436 

437 Notes 

438 ----- 

439 The URIs returned for in-memory datastores are not usable but 

440 provide an indication of the associated dataset. 

441 """ 

442 # Include the dataId as a URI query string.

443 query = urlencode(ref.dataId) 

444 

445 # If this has never been written then we have to guess.

446 if not self.exists(ref): 

447 if not predict: 

448 raise FileNotFoundError(f"Dataset {ref} not in this datastore") 

449 name = f"{ref.datasetType.name}" 

450 fragment = "#predicted" 

451 else: 

452 realID, _ = self._get_dataset_info(ref) 

453 name = f"{id(self.datasets[realID])}?{query}" 

454 fragment = "" 

455 

456 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {}) 

457 

458 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

459 """URI to the Dataset. 

460 

461 Always uses "mem://" URI prefix. 

462 

463 Parameters 

464 ---------- 

465 ref : `DatasetRef` 

466 Reference to the required Dataset. 

467 predict : `bool` 

468 If `True`, allow URIs to be returned for datasets that have not

469 been written. 

470 

471 Returns 

472 ------- 

473 uri : `lsst.resources.ResourcePath`

474 URI pointing to the dataset within the datastore. If the 

475 dataset does not exist in the datastore, and if ``predict`` is 

476 `True`, the URI will be a prediction and will include a URI 

477 fragment "#predicted". 

478 If the datastore does not have entities that relate well 

479 to the concept of a URI the returned URI string will be 

480 descriptive. The returned URI is not guaranteed to be obtainable. 

481 

482 Raises 

483 ------ 

484 FileNotFoundError 

485 A URI has been requested for a dataset that does not exist and 

486 guessing is not allowed. 

487 AssertionError 

488 Raised if an internal error occurs. 

489 """ 

490 primary, _ = self.getURIs(ref, predict) 

491 if primary is None: 

492 # This should be impossible since this datastore does 

493 # not disassemble. This check also helps mypy. 

494 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}") 

495 return primary 

496 
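# Editor's note: illustrative URI queries (not from the source).  The ``mem://`` URIs
# are descriptive only and cannot be dereferenced; ``unstored_ref`` is assumed not to
# have been put into this datastore:
#
#     uri = datastore.getURI(ref)                           # mem://<object id>?<data id>
#     guess = datastore.getURI(unstored_ref, predict=True)  # carries a "#predicted" fragment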

497 def retrieveArtifacts( 

498 self, 

499 refs: Iterable[DatasetRef], 

500 destination: ResourcePath, 

501 transfer: str = "auto", 

502 preserve_path: bool = True, 

503 overwrite: bool | None = False, 

504 ) -> list[ResourcePath]: 

505 """Retrieve the file artifacts associated with the supplied refs. 

506 

507 Notes 

508 ----- 

509 Not implemented by this datastore. 

510 """ 

511 # Could conceivably launch a FileDatastore to use formatters to write 

512 # the data but this is fraught with problems. 

513 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.") 

514 
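# Editor's note: there are no file artifacts behind this datastore, so any call to
# ``retrieveArtifacts`` raises (illustrative):
#
#     datastore.retrieveArtifacts([ref], ResourcePath("/tmp/artifacts/"))
#     # -> NotImplementedError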

515 def forget(self, refs: Iterable[DatasetRef]) -> None: 

516 # Docstring inherited. 

517 refs = list(refs) 

518 self._bridge.forget(refs) 

519 for ref in refs: 

520 self.removeStoredItemInfo(ref) 

521 

522 @transactional 

523 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = False) -> None: 

524 """Indicate to the Datastore that a dataset can be removed. 

525 

526 Parameters 

527 ---------- 

528 ref : `DatasetRef` or iterable thereof 

529 Reference to the required Dataset(s). 

530 ignore_errors : `bool`, optional

531 Indicate that errors should be ignored. 

532 

533 Raises 

534 ------ 

535 FileNotFoundError 

536 Attempt to remove a dataset that does not exist. Only relevant 

537 if a single dataset ref is given. 

538 

539 Notes 

540 ----- 

541 Concurrency should not normally be an issue for the in-memory datastore

542 since all internal changes are isolated to this process and

543 the registry only changes rows associated with this process.

544 """ 

545 if not isinstance(ref, DatasetRef): 

546 log.debug("Bulk trashing of datasets in datastore %s", self.name) 

547 self.bridge.moveToTrash(ref, transaction=self._transaction) 

548 return 

549 

550 log.debug("Trash %s in datastore %s", ref, self.name) 

551 

552 # Check that this dataset is known to the datastore.

553 try: 

554 self._get_dataset_info(ref) 

555 

556 # Move datasets to trash table 

557 self.bridge.moveToTrash([ref], transaction=self._transaction) 

558 except Exception as e: 

559 if ignore_errors:

560 log.warning( 

561 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e 

562 ) 

563 else: 

564 raise 

565 

566 def emptyTrash(self, ignore_errors: bool = False) -> None: 

567 """Remove all datasets from the trash. 

568 

569 Parameters 

570 ---------- 

571 ignore_errors : `bool`, optional 

572 Ignore errors. 

573 

574 Notes 

575 ----- 

576 The internal tracking of datasets is affected by this method and 

577 transaction handling is not supported if there is a problem before 

578 the datasets themselves are deleted. 

579 

580 Concurrency should not normally be an issue for the in-memory datastore

581 since all internal changes are isolated to this process and

582 the registry only changes rows associated with this process.

583 """ 

584 log.debug("Emptying trash in datastore %s", self.name) 

585 with self._bridge.emptyTrash() as trash_data: 

586 trashed, _ = trash_data 

587 for ref, _ in trashed: 

588 try: 

589 realID, _ = self._get_dataset_info(ref) 

590 except FileNotFoundError:

591 # Dataset already removed so ignore it 

592 continue 

593 except Exception as e: 

594 if ignore_errors: 

595 log.warning( 

596 "Emptying trash in datastore %s but encountered an error with dataset %s: %s", 

597 self.name, 

598 ref.id, 

599 e, 

600 ) 

601 continue 

602 else: 

603 raise 

604 

605 # Determine whether all references to this dataset have been 

606 # removed and we can delete the dataset itself 

607 allRefs = self.related[realID] 

608 remainingRefs = allRefs - {ref.id} 

609 if not remainingRefs:

610 log.debug("Removing artifact %s from datastore %s", realID, self.name) 

611 del self.datasets[realID] 

612 

613 # Remove this entry 

614 self.removeStoredItemInfo(ref) 

615 
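# Editor's note: illustrative removal flow (not from the source), assuming ``refs`` is
# an iterable of resolved `DatasetRef` objects already stored here:
#
#     datastore.trash(refs)      # bulk mode: move all of them to the trash table
#     datastore.emptyTrash()     # delete the corresponding in-memory objects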

616 def validateConfiguration( 

617 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

618 ) -> None: 

619 """Validate some of the configuration for this datastore. 

620 

621 Parameters 

622 ---------- 

623 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

624 Entities to test against this configuration. Can be differing 

625 types. 

626 logFailures : `bool`, optional 

627 If `True`, output a log message for every validation error 

628 detected. 

629 

630 Raises 

631 ------ 

632 DatastoreValidationError 

633 Raised if there is a validation problem with a configuration. 

634 All the problems are reported in a single exception. 

635 

636 Notes 

637 ----- 

638 This method is a no-op. 

639 """ 

640 return 

641 

642 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

643 # Docstring is inherited from base class 

644 return transfer 

645 

646 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

647 # Docstring is inherited from base class 

648 return 

649 

650 def getLookupKeys(self) -> set[LookupKey]: 

651 # Docstring is inherited from base class 

652 return self.constraints.getLookupKeys() 

653 

654 def needs_expanded_data_ids( 

655 self, 

656 transfer: str | None, 

657 entity: DatasetRef | DatasetType | StorageClass | None = None, 

658 ) -> bool: 

659 # Docstring inherited. 

660 return False 

661 

662 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

663 # Docstring inherited from the base class. 

664 return 

665 

666 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

667 # Docstring inherited from the base class. 

668 

669 # In-memory Datastore records cannot be exported or imported 

670 return {}