Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 93%

182 statements  

coverage.py v7.3.1, created at 2023-10-02 07:59 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""In-memory datastore.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore") 

33 

34import logging 

35import time 

36from collections.abc import Iterable, Mapping 

37from dataclasses import dataclass 

38from typing import TYPE_CHECKING, Any 

39from urllib.parse import urlencode 

40 

41from lsst.daf.butler import ( 

42 DatasetId, 

43 DatasetRef, 

44 DatasetRefURIs, 

45 DatastoreRecordData, 

46 StorageClass, 

47 StoredDatastoreItemInfo, 

48) 

49from lsst.daf.butler.core.utils import transactional 

50from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge 

51from lsst.resources import ResourcePath 

52 

53from ..registry.interfaces import DatabaseInsertMode 

54from .genericDatastore import GenericBaseDatastore 

55 

56if TYPE_CHECKING: 

57 from lsst.daf.butler import Config, DatasetType, LookupKey 

58 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

59 

60log = logging.getLogger(__name__) 

61 

62 

63@dataclass(frozen=True) 

64class StoredMemoryItemInfo(StoredDatastoreItemInfo): 

65 """Internal InMemoryDatastore Metadata associated with a stored 

66 DatasetRef. 

67 """ 

68 

69 __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"} 

70 

71 timestamp: float 

72 """Unix timestamp indicating the time the dataset was stored.""" 

73 

74 storageClass: StorageClass 

75 """StorageClass associated with the dataset.""" 

76 

77 parentID: DatasetId 

78 """ID of the parent `DatasetRef` if this entry is a concrete 

79 composite. Not used if the dataset being stored is not a 

80 virtual component of a composite. 

81 """ 

82 

83 dataset_id: DatasetId 

84 """DatasetId associated with this record.""" 

85 

86 

87class InMemoryDatastore(GenericBaseDatastore): 

88 """Basic Datastore for writing to an in memory cache. 

89 

90 This datastore is ephemeral in that the contents of the datastore 

91 disappear when the Python process completes. This also means that 

92 other processes cannot access this datastore. 

93 

94 Parameters 

95 ---------- 

96 config : `DatastoreConfig` or `str` 

97 Configuration. 

98 bridgeManager : `DatastoreRegistryBridgeManager` 

99 Object that manages the interface between `Registry` and datastores. 

100 butlerRoot : `str`, optional 

101 Unused parameter. 

102 

103 Notes 

104 ----- 

105 InMemoryDatastore does not support any file-based ingest. 

106 """ 

107 

108 defaultConfigFile = "datastores/inMemoryDatastore.yaml" 

109 """Path to configuration defaults. Accessed within the ``configs`` resource 

110 or relative to a search path. Can be None if no defaults specified. 

111 """ 

112 

113 isEphemeral = True 

114 """A new datastore is created every time and datasets disappear when 

115 the process shuts down.""" 

116 

117 datasets: dict[DatasetId, Any] 

118 """Internal storage of datasets indexed by dataset ID.""" 

119 

120 records: dict[DatasetId, StoredMemoryItemInfo] 

121 """Internal records about stored datasets.""" 

122 

123 def __init__( 

124 self, 

125 config: Config | str, 

126 bridgeManager: DatastoreRegistryBridgeManager, 

127 butlerRoot: str | None = None, 

128 ): 

129 super().__init__(config, bridgeManager) 

130 

131 # Name ourselves with the timestamp the datastore 

132 # was created. 

133 self.name = f"{type(self).__name__}@{time.time()}" 

134 log.debug("Creating datastore %s", self.name) 

135 

136 # Storage of datasets, keyed by dataset_id 

137 self.datasets: dict[DatasetId, Any] = {} 

138 

139 # Records is distinct in order to track concrete composite components 

140 # where we register multiple components for a single dataset. 

141 self.records: dict[DatasetId, StoredMemoryItemInfo] = {} 

142 

143 # Related records that share the same parent 

144 self.related: dict[DatasetId, set[DatasetId]] = {} 

145 

146 self._bridge = bridgeManager.register(self.name, ephemeral=True) 

147 

148 @classmethod 

149 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

150 """Set any filesystem-dependent config options for this Datastore to 

151 be appropriate for a new empty repository with the given root. 

152 

153 Does nothing in this implementation. 

154 

155 Parameters 

156 ---------- 

157 root : `str` 

158 Filesystem path to the root of the data repository. 

159 config : `Config` 

160 A `Config` to update. Only the subset understood by 

161 this component will be updated. Will not expand 

162 defaults. 

163 full : `Config` 

164 A complete config with all defaults expanded that can be 

165 converted to a `DatastoreConfig`. Read-only and will not be 

166 modified by this method. 

167 Repository-specific options that should not be obtained 

168 from defaults when Butler instances are constructed 

169 should be copied from ``full`` to ``config``. 

170 overwrite : `bool`, optional 

171 If `False`, do not modify a value in ``config`` if the value 

172 already exists. Default is always to overwrite with the provided 

173 ``root``. 

174 

175 Notes 

176 ----- 

177 If a keyword is explicitly defined in the supplied ``config`` it 

178 will not be overridden by this method if ``overwrite`` is `False`. 

179 This allows explicit values set in external configs to be retained. 

180 """ 

181 return 

182 

183 @property 

184 def bridge(self) -> DatastoreRegistryBridge: 

185 # Docstring inherited from GenericBaseDatastore. 

186 return self._bridge 

187 

188 def addStoredItemInfo( 

189 self, 

190 refs: Iterable[DatasetRef], 

191 infos: Iterable[StoredMemoryItemInfo], 

192 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

193 ) -> None: 

194 # Docstring inherited from GenericBaseDatastore. 

195 for ref, info in zip(refs, infos, strict=True): 

196 self.records[ref.id] = info 

197 self.related.setdefault(info.parentID, set()).add(ref.id) 

198 

199 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo: 

200 # Docstring inherited from GenericBaseDatastore. 

201 return self.records[ref.id] 

202 

203 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredMemoryItemInfo]: 

204 # Docstring inherited from GenericBaseDatastore. 

205 return [self.getStoredItemInfo(ref)] 

206 

207 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

208 # Docstring inherited from GenericBaseDatastore. 

209 # If a component has been removed previously then we can sometimes 

210 # be asked to remove it again. Other datastores ignore this 

211 # so also ignore here 

212 if ref.id not in self.records: 

213 return 

214 record = self.records[ref.id] 

215 del self.records[ref.id] 

216 self.related[record.parentID].remove(ref.id) 

217 
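[Editor's illustration, not part of the file under coverage.] The two methods above maintain a simple in-memory index: one record per ref in ``records``, grouped by parent ID in ``related``. The standalone sketch below (the record type and IDs are invented stand-ins, not the real StoredMemoryItemInfo or DatasetId) shows that bookkeeping and why removing the same ref twice is harmless.

import uuid
from dataclasses import dataclass

@dataclass(frozen=True)
class FakeItemInfo:
    parentID: uuid.UUID

records: dict[uuid.UUID, FakeItemInfo] = {}
related: dict[uuid.UUID, set[uuid.UUID]] = {}

parent_id = uuid.uuid4()     # ref that actually owns the stored object
component_id = uuid.uuid4()  # component ref that shares the parent's storage

# addStoredItemInfo: one record per ref, grouped under the parent ID.
for ref_id in (parent_id, component_id):
    records[ref_id] = FakeItemInfo(parentID=parent_id)
    related.setdefault(parent_id, set()).add(ref_id)

# removeStoredItemInfo: drop one ref; a second removal is a silent no-op.
for ref_id in (component_id, component_id):
    if ref_id in records:
        record = records.pop(ref_id)
        related[record.parentID].remove(ref_id)

assert set(records) == {parent_id}
assert related[parent_id] == {parent_id}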

218 def _get_dataset_info(self, ref: DatasetIdRef) -> tuple[DatasetId, StoredMemoryItemInfo]: 

219 """Check that the dataset is present and return the real ID and 

220 associated information. 

221 

222 Parameters 

223 ---------- 

224 ref : `DatasetRef` 

225 Target `DatasetRef`. 

226 

227 Returns 

228 ------- 

229 realID : `DatasetId` 

230 The dataset ID associated with this ref that should be used. This 

231 could either be the ID of the supplied `DatasetRef` or the parent. 

232 storageInfo : `StoredMemoryItemInfo` 

233 Associated storage information. 

234 

235 Raises 

236 ------ 

237 FileNotFoundError 

238 Raised if the dataset is not present in this datastore. 

239 """ 

240 try: 

241 storedItemInfo = self.getStoredItemInfo(ref) 

242 except KeyError: 

243 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None 

244 realID = ref.id 

245 if storedItemInfo.parentID is not None:   [coverage: line 245 didn't jump to line 248 because the condition on line 245 was never false]

246 realID = storedItemInfo.parentID 

247 

248 if realID not in self.datasets:   [coverage: line 248 didn't jump to line 249 because the condition on line 248 was never true]

249 raise FileNotFoundError(f"No such file dataset in memory: {ref}") 

250 

251 return realID, storedItemInfo 

252 

253 def knows(self, ref: DatasetRef) -> bool: 

254 """Check if the dataset is known to the datastore. 

255 

256 This datastore does not distinguish dataset existence from knowledge 

257 of a dataset. 

258 

259 Parameters 

260 ---------- 

261 ref : `DatasetRef` 

262 Reference to the required dataset. 

263 

264 Returns 

265 ------- 

266 exists : `bool` 

267 `True` if the dataset is known to the datastore. 

268 """ 

269 return self.exists(ref) 

270 

271 def exists(self, ref: DatasetRef) -> bool: 

272 """Check if the dataset exists in the datastore. 

273 

274 Parameters 

275 ---------- 

276 ref : `DatasetRef` 

277 Reference to the required dataset. 

278 

279 Returns 

280 ------- 

281 exists : `bool` 

282 `True` if the entity exists in the `Datastore`. 

283 """ 

284 try: 

285 self._get_dataset_info(ref) 

286 except FileNotFoundError: 

287 return False 

288 return True 

289 

290 def get( 

291 self, 

292 ref: DatasetRef, 

293 parameters: Mapping[str, Any] | None = None, 

294 storageClass: StorageClass | str | None = None, 

295 ) -> Any: 

296 """Load an InMemoryDataset from the store. 

297 

298 Parameters 

299 ---------- 

300 ref : `DatasetRef` 

301 Reference to the required Dataset. 

302 parameters : `dict` 

303 `StorageClass`-specific parameters that specify, for example, 

304 a slice of the dataset to be loaded. 

305 storageClass : `StorageClass` or `str`, optional 

306 The storage class to be used to override the Python type 

307 returned by this method. By default the returned type matches 

308 the dataset type definition for this dataset. Specifying a 

309 read `StorageClass` can force a different type to be returned. 

310 This type must be compatible with the original type. 

311 

312 Returns 

313 ------- 

314 inMemoryDataset : `object` 

315 Requested dataset or slice thereof as an InMemoryDataset. 

316 

317 Raises 

318 ------ 

319 FileNotFoundError 

320 Requested dataset cannot be retrieved. 

321 TypeError 

322 Return value from formatter has unexpected type. 

323 ValueError 

324 Formatter failed to process the dataset. 

325 """ 

326 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

327 

328 realID, storedItemInfo = self._get_dataset_info(ref) 

329 

330 # We have a write storage class and a read storage class and they 

331 # can be different for concrete composites or if overridden. 

332 if storageClass is not None: 

333 ref = ref.overrideStorageClass(storageClass) 

334 refStorageClass = ref.datasetType.storageClass 

335 writeStorageClass = storedItemInfo.storageClass 

336 

337 component = ref.datasetType.component() 

338 

339 # Check that the supplied parameters are suitable for the type read 

340 # If this is a derived component we validate against the composite 

341 isDerivedComponent = False 

342 if component in writeStorageClass.derivedComponents: 

343 writeStorageClass.validateParameters(parameters) 

344 isDerivedComponent = True 

345 else: 

346 refStorageClass.validateParameters(parameters) 

347 

348 inMemoryDataset = self.datasets[realID] 

349 

350 # if this is a read only component we need to apply parameters 

351 # before we retrieve the component. We assume that the parameters 

352 # will affect the data globally, before the derived component 

353 # is selected. 

354 if isDerivedComponent: 

355 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters) 

356 # Then disable parameters for later 

357 parameters = {} 

358 

359 # Check if we have a component. 

360 if component: 

361 # In-memory datastore must have stored the dataset as a single 

362 # object in the write storage class. We therefore use that 

363 # storage class delegate to obtain the component. 

364 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component) 

365 

366 # Since there is no formatter to process parameters, they all must be 

367 # passed to the assembler. 

368 inMemoryDataset = self._post_process_get( 

369 inMemoryDataset, refStorageClass, parameters, isComponent=component is not None 

370 ) 

371 

372 # Last minute type conversion. 

373 return refStorageClass.coerce_type(inMemoryDataset) 

374 
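[Editor's illustration, not part of the file under coverage.] For derived components, get() applies the parameters against the write storage class before the component is extracted, as the comments above describe. The small standalone sketch below shows that ordering; FakeDelegate and the dict-based dataset are invented stand-ins for the real StorageClassDelegate machinery.

class FakeDelegate:
    """Stand-in for a storage class delegate."""

    def handleParameters(self, dataset, parameters):
        # Parameters act on the whole composite before any component is read.
        n = parameters.get("rows", len(dataset["data"]))
        data = dataset["data"][:n]
        return {"data": data, "summary": sum(data)}

    def getComponent(self, dataset, component):
        return dataset[component]

stored = {"data": [1, 2, 3, 4], "summary": 10}  # object as written to the store
delegate = FakeDelegate()

# Derived component: handleParameters() first, then getComponent().
trimmed = delegate.getComponent(delegate.handleParameters(stored, {"rows": 2}), "summary")
assert trimmed == 3  # 1 + 2, i.e. the parameters were applied before the component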

375 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

376 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

377 

378 Parameters 

379 ---------- 

380 inMemoryDataset : `object` 

381 The dataset to store. 

382 ref : `DatasetRef` 

383 Reference to the associated Dataset. 

384 

385 Raises 

386 ------ 

387 TypeError 

388 Supplied object and storage class are inconsistent. 

389 DatasetTypeNotSupportedError 

390 The associated `DatasetType` is not handled by this datastore. 

391 

392 Notes 

393 ----- 

394 If the datastore is configured to reject certain dataset types it 

395 is possible that the put will fail and raise a 

396 `DatasetTypeNotSupportedError`. The main use case for this is to 

397 allow `ChainedDatastore` to put to multiple datastores without 

398 requiring that every datastore accepts the dataset. 

399 """ 

400 # May need to coerce the in memory dataset to the correct 

401 # python type, otherwise parameters may not work. 

402 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

403 

404 self._validate_put_parameters(inMemoryDataset, ref) 

405 

406 self.datasets[ref.id] = inMemoryDataset 

407 log.debug("Store %s in %s", ref, self.name) 

408 

409 # Store time we received this content, to allow us to optionally 

410 # expire it. Instead of storing a filename here, we include the 

411 # ID of this datasetRef so we can find it from components. 

412 itemInfo = StoredMemoryItemInfo( 

413 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.id 

414 ) 

415 

416 # We have to register this content with registry. 

417 # Currently this assumes we have a file so we need to use stub entries 

418 # TODO: Add to ephemeral part of registry 

419 self._register_datasets([(ref, itemInfo)]) 

420 

421 if self._transaction is not None: 

422 self._transaction.registerUndo("put", self.remove, ref) 

423 

424 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

425 """Return URIs associated with dataset. 

426 

427 Parameters 

428 ---------- 

429 ref : `DatasetRef` 

430 Reference to the required dataset. 

431 predict : `bool`, optional 

432 If the datastore does not know about the dataset, should it 

433 return a predicted URI or not? 

434 

435 Returns 

436 ------- 

437 uris : `DatasetRefURIs` 

438 The URI to the primary artifact associated with this dataset (if 

439 the dataset was disassembled within the datastore this may be 

440 `None`), and the URIs to any components associated with the dataset 

441 artifact (can be empty if there are no components). 

442 

443 Notes 

444 ----- 

445 The URIs returned for in-memory datastores are not usable but 

446 provide an indication of the associated dataset. 

447 """ 

448 # Include the dataID as a URI query 

449 query = urlencode(ref.dataId) 

450 

451 # if this has never been written then we have to guess 

452 if not self.exists(ref): 

453 if not predict: 

454 raise FileNotFoundError(f"Dataset {ref} not in this datastore") 

455 name = f"{ref.datasetType.name}" 

456 fragment = "#predicted" 

457 else: 

458 realID, _ = self._get_dataset_info(ref) 

459 name = f"{id(self.datasets[realID])}?{query}" 

460 fragment = "" 

461 

462 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {}) 

463 
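[Editor's illustration, not part of the file under coverage.] The mem:// URIs produced above are purely descriptive. The sketch below (the data ID and dataset type name are invented) shows the two forms: a predicted URI carries the dataset type name and a "#predicted" fragment, while a URI for a stored dataset uses the Python id() of the cached object. As written in the listing, ``name`` already ends in "?{query}" for stored datasets, so the query string ends up repeated by the final format string.

from urllib.parse import urlencode

query = urlencode({"instrument": "HSC", "visit": 903334})

# Dataset not stored, predict=True.
predicted = f"mem://calexp?{query}#predicted"

# Dataset already stored: ``name`` is built from id() of the cached object
# and already includes the query, which the return statement appends again.
obj = object()
existing = f"mem://{id(obj)}?{query}?{query}"

print(predicted)  # mem://calexp?instrument=HSC&visit=903334#predicted
print(existing)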

464 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

465 """URI to the Dataset. 

466 

467 Always uses "mem://" URI prefix. 

468 

469 Parameters 

470 ---------- 

471 ref : `DatasetRef` 

472 Reference to the required Dataset. 

473 predict : `bool` 

474 If `True`, allow URIs to be returned of datasets that have not 

475 been written. 

476 

477 Returns 

478 ------- 

479 uri : `str` 

480 URI pointing to the dataset within the datastore. If the 

481 dataset does not exist in the datastore, and if ``predict`` is 

482 `True`, the URI will be a prediction and will include a URI 

483 fragment "#predicted". 

484 If the datastore does not have entities that relate well 

485 to the concept of a URI the returned URI string will be 

486 descriptive. The returned URI is not guaranteed to be obtainable. 

487 

488 Raises 

489 ------ 

490 FileNotFoundError 

491 A URI has been requested for a dataset that does not exist and 

492 guessing is not allowed. 

493 AssertionError 

494 Raised if an internal error occurs. 

495 """ 

496 primary, _ = self.getURIs(ref, predict) 

497 if primary is None: 

498 # This should be impossible since this datastore does 

499 # not disassemble. This check also helps mypy. 

500 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}") 

501 return primary 

502 

503 def retrieveArtifacts( 

504 self, 

505 refs: Iterable[DatasetRef], 

506 destination: ResourcePath, 

507 transfer: str = "auto", 

508 preserve_path: bool = True, 

509 overwrite: bool | None = False, 

510 ) -> list[ResourcePath]: 

511 """Retrieve the file artifacts associated with the supplied refs. 

512 

513 Notes 

514 ----- 

515 Not implemented by this datastore. 

516 """ 

517 # Could conceivably launch a FileDatastore to use formatters to write 

518 # the data but this is fraught with problems. 

519 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.") 

520 

521 def forget(self, refs: Iterable[DatasetRef]) -> None: 

522 # Docstring inherited. 

523 refs = list(refs) 

524 self._bridge.forget(refs) 

525 for ref in refs: 

526 self.removeStoredItemInfo(ref) 

527 

528 @transactional 

529 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = False) -> None: 

530 """Indicate to the Datastore that a dataset can be removed. 

531 

532 Parameters 

533 ---------- 

534 ref : `DatasetRef` or iterable thereof 

535 Reference to the required Dataset(s). 

536 ignore_errors : `bool`, optional 

537 Indicate that errors should be ignored. 

538 

539 Raises 

540 ------ 

541 FileNotFoundError 

542 Attempt to remove a dataset that does not exist. Only relevant 

543 if a single dataset ref is given. 

544 

545 Notes 

546 ----- 

547 Concurrency should not normally be an issue for the in-memory datastore 

548 since all internal changes are isolated to solely this process and 

549 the registry only changes rows associated with this process. 

550 """ 

551 if not isinstance(ref, DatasetRef): 

552 log.debug("Bulk trashing of datasets in datastore %s", self.name) 

553 self.bridge.moveToTrash(ref, transaction=self._transaction) 

554 return 

555 

556 log.debug("Trash %s in datastore %s", ref, self.name) 

557 

558 # Check that this dataset is known to datastore 

559 try: 

560 self._get_dataset_info(ref) 

561 

562 # Move datasets to trash table 

563 self.bridge.moveToTrash([ref], transaction=self._transaction) 

564 except Exception as e: 

565 if ignore_errors:   [coverage: line 565 didn't jump to line 566 because the condition on line 565 was never true]

566 log.warning( 

567 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e 

568 ) 

569 else: 

570 raise 

571 

572 def emptyTrash(self, ignore_errors: bool = False) -> None: 

573 """Remove all datasets from the trash. 

574 

575 Parameters 

576 ---------- 

577 ignore_errors : `bool`, optional 

578 Ignore errors. 

579 

580 Notes 

581 ----- 

582 The internal tracking of datasets is affected by this method and 

583 transaction handling is not supported if there is a problem before 

584 the datasets themselves are deleted. 

585 

586 Concurrency should not normally be an issue for the in-memory datastore 

587 since all internal changes are isolated to solely this process and 

588 the registry only changes rows associated with this process. 

589 """ 

590 log.debug("Emptying trash in datastore %s", self.name) 

591 with self._bridge.emptyTrash() as trash_data: 

592 trashed, _ = trash_data 

593 for ref, _ in trashed: 

594 try: 

595 realID, _ = self._get_dataset_info(ref) 

596 except FileNotFoundError:   [coverage: line 596 didn't jump to line 599]

597 # Dataset already removed so ignore it 

598 continue 

599 except Exception as e: 

600 if ignore_errors: 

601 log.warning( 

602 "Emptying trash in datastore %s but encountered an error with dataset %s: %s", 

603 self.name, 

604 ref.id, 

605 e, 

606 ) 

607 continue 

608 else: 

609 raise 

610 

611 # Determine whether all references to this dataset have been 

612 # removed and we can delete the dataset itself 

613 allRefs = self.related[realID] 

614 remainingRefs = allRefs - {ref.id} 

615 if not remainingRefs:   [coverage: line 615 didn't jump to line 620 because the condition on line 615 was never false]

616 log.debug("Removing artifact %s from datastore %s", realID, self.name) 

617 del self.datasets[realID] 

618 

619 # Remove this entry 

620 self.removeStoredItemInfo(ref) 

621 

622 def validateConfiguration( 

623 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

624 ) -> None: 

625 """Validate some of the configuration for this datastore. 

626 

627 Parameters 

628 ---------- 

629 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

630 Entities to test against this configuration. Can be differing 

631 types. 

632 logFailures : `bool`, optional 

633 If `True`, output a log message for every validation error 

634 detected. 

635 

636 Raises 

637 ------ 

638 DatastoreValidationError 

639 Raised if there is a validation problem with a configuration. 

640 All the problems are reported in a single exception. 

641 

642 Notes 

643 ----- 

644 This method is a no-op. 

645 """ 

646 return 

647 

648 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None: 

649 # Docstring is inherited from base class 

650 return transfer 

651 

652 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

653 # Docstring is inherited from base class 

654 return 

655 

656 def getLookupKeys(self) -> set[LookupKey]: 

657 # Docstring is inherited from base class 

658 return self.constraints.getLookupKeys() 

659 

660 def needs_expanded_data_ids( 

661 self, 

662 transfer: str | None, 

663 entity: DatasetRef | DatasetType | StorageClass | None = None, 

664 ) -> bool: 

665 # Docstring inherited. 

666 return False 

667 

668 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

669 # Docstring inherited from the base class. 

670 return 

671 

672 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

673 # Docstring inherited from the base class. 

674 

675 # In-memory Datastore records cannot be exported or imported 

676 return {}