# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""In-memory datastore."""

__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")

import logging
import time
from dataclasses import dataclass
from urllib.parse import urlencode
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
)

from lsst.daf.butler import StoredDatastoreItemInfo, StorageClass, ButlerURI
from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
from .genericDatastore import GenericBaseDatastore

if TYPE_CHECKING:
    from lsst.daf.butler import (Config, DatasetRef, DatasetType,
                                 LookupKey)
    from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal InMemoryDatastore metadata associated with a stored
    DatasetRef.
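
    Examples
    --------
    A minimal sketch of how `InMemoryDatastore.put` constructs these
    records; ``storageClass`` and ``ref`` are assumed to be supplied by
    the caller:

    >>> info = StoredMemoryItemInfo(time.time(), storageClass,
    ...                             parentID=ref.id)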

61 """ 

62 __slots__ = {"timestamp", "storageClass", "parentID"} 

63 

64 timestamp: float 

65 """Unix timestamp indicating the time the dataset was stored.""" 

66 

67 storageClass: StorageClass 

68 """StorageClass associated with the dataset.""" 

69 

70 parentID: int 

71 """ID of the parent `DatasetRef` if this entry is a concrete 

72 composite. Not used if the dataset being stored is not a 

73 virtual component of a composite 

74 """ 


class InMemoryDatastore(GenericBaseDatastore):
    """Basic Datastore for writing to an in-memory cache.

    This datastore is ephemeral in that the contents of the datastore
    disappear when the Python process completes. This also means that
    other processes can not access this datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        Unused parameter.

    Notes
    -----
    InMemoryDatastore does not support any file-based ingest.
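
    Examples
    --------
    A hedged sketch of construction; ``config`` and ``bridgeManager``
    are assumed to come from the enclosing `Butler`/`Registry` machinery
    rather than being built by hand:

    >>> store = InMemoryDatastore(config, bridgeManager)
    >>> store.isEphemeral
    True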

96 """ 

97 

98 defaultConfigFile = "datastores/inMemoryDatastore.yaml" 

99 """Path to configuration defaults. Accessed within the ``configs`` resource 

100 or relative to a search path. Can be None if no defaults specified. 

101 """ 

102 

103 isEphemeral = True 

104 """A new datastore is created every time and datasets disappear when 

105 the process shuts down.""" 

106 

107 datasets: Dict[int, Any] 

108 """Internal storage of datasets indexed by dataset ID.""" 

109 

110 records: Dict[int, StoredMemoryItemInfo] 

111 """Internal records about stored datasets.""" 

112 

    def __init__(self, config: Union[Config, str],
                 bridgeManager: DatastoreRegistryBridgeManager,
                 butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager)

        # Name ourselves with the timestamp the datastore
        # was created.
        self.name = "{}@{}".format(type(self).__name__, time.time())
        log.debug("Creating datastore %s", self.name)

        # Storage of datasets, keyed by dataset_id
        self.datasets: Dict[int, Any] = {}

        # Records is distinct in order to track concrete composite components
        # where we register multiple components for a single dataset.
        self.records: Dict[int, StoredMemoryItemInfo] = {}

        # Related records that share the same parent
        self.related: Dict[int, Set[int]] = {}
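        # Illustrative shape (assumed IDs): a composite stored with ref ID 5
        # and a component registered against it with ref ID 7 would give
        # ``self.related == {5: {5, 7}}``.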

        self._bridge = bridgeManager.register(self.name, ephemeral=True)

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return

    @property
    def bridge(self) -> DatastoreRegistryBridge:
        # Docstring inherited from GenericBaseDatastore.
        return self._bridge

    def addStoredItemInfo(self, refs: Iterable[DatasetRef],
                          infos: Iterable[StoredMemoryItemInfo]) -> None:
        # Docstring inherited from GenericBaseDatastore.
        for ref, info in zip(refs, infos):
            if ref.id is None:
                raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
            self.records[ref.id] = info
            self.related.setdefault(info.parentID, set()).add(ref.id)

    def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
        # Docstring inherited from GenericBaseDatastore.
        if ref.id is None:
            raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
        return self.records[ref.id]

    def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
        # Docstring inherited from GenericBaseDatastore.
        return [self.getStoredItemInfo(ref)]

    def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this,
        # so also ignore it here.
        if ref.id is None:
            raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)

    def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[int, StoredMemoryItemInfo]:
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`.

        Returns
        -------
        realID : `int`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
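
        Examples
        --------
        Illustrative sketch; for a component ref the returned ID is the
        parent's, so the stored composite is what ``self.datasets`` holds:

        >>> realID, info = store._get_dataset_info(ref)
        >>> realID in store.datasets
        True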

228 """ 

229 try: 

230 storedItemInfo = self.getStoredItemInfo(ref) 

231 except KeyError: 

232 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None 

233 realID = ref.id 

234 if storedItemInfo.parentID is not None: 234 ↛ 237line 234 didn't jump to line 237, because the condition on line 234 was never false

235 realID = storedItemInfo.parentID 

236 

237 if realID not in self.datasets: 237 ↛ 238line 237 didn't jump to line 238, because the condition on line 237 was never true

238 raise FileNotFoundError(f"No such file dataset in memory: {ref}") 

239 

240 return realID, storedItemInfo 

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
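
        Examples
        --------
        A sketch, assuming ``store`` and a resolved ``ref`` for a dataset
        that has already been put (names are illustrative):

        >>> store.exists(ref)
        True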

254 """ 

255 try: 

256 self._get_dataset_info(ref) 

257 except FileNotFoundError: 

258 return False 

259 return True 

    def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
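
        Examples
        --------
        Illustrative sketch; ``ref`` is assumed to be a resolved
        `DatasetRef` previously stored with `put`, and the parameter name
        shown is hypothetical (valid names depend on the dataset's
        `StorageClass`):

        >>> obj = store.get(ref)
        >>> subset = store.get(ref, parameters={"slice": slice(0, 10)})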

285 """ 

286 

287 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

288 

289 realID, storedItemInfo = self._get_dataset_info(ref) 

290 

291 # We have a write storage class and a read storage class and they 

292 # can be different for concrete composites. 

293 readStorageClass = ref.datasetType.storageClass 

294 writeStorageClass = storedItemInfo.storageClass 

295 

296 component = ref.datasetType.component() 

297 

298 # Check that the supplied parameters are suitable for the type read 

299 # If this is a derived component we validate against the composite 

300 isDerivedComponent = False 

301 if component in writeStorageClass.derivedComponents: 

302 writeStorageClass.validateParameters(parameters) 

303 isDerivedComponent = True 

304 else: 

305 readStorageClass.validateParameters(parameters) 

306 

307 inMemoryDataset = self.datasets[realID] 

308 

309 # if this is a read only component we need to apply parameters 

310 # before we retrieve the component. We assume that the parameters 

311 # will affect the data globally, before the derived component 

312 # is selected. 

313 if isDerivedComponent: 

314 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters) 

315 # Then disable parameters for later 

316 parameters = {} 

317 

318 # Different storage classes implies a component request 

319 if readStorageClass != writeStorageClass: 

320 

321 if component is None: 321 ↛ 322line 321 didn't jump to line 322, because the condition on line 321 was never true

322 raise ValueError("Storage class inconsistency ({} vs {}) but no" 

323 " component requested".format(readStorageClass.name, 

324 writeStorageClass.name)) 

325 

326 # Concrete composite written as a single object (we hope) 

327 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component) 

328 

329 # Since there is no formatter to process parameters, they all must be 

330 # passed to the assembler. 

331 return self._post_process_get(inMemoryDataset, readStorageClass, parameters, 

332 isComponent=component is not None) 

    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
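
        Examples
        --------
        A minimal sketch; ``ref`` is assumed to be a resolved `DatasetRef`
        whose `DatasetType` storage class matches the supplied object:

        >>> store.put(inMemoryDataset, ref)
        >>> store.exists(ref)
        True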

358 """ 

359 

360 if ref.id is None: 360 ↛ 361line 360 didn't jump to line 361, because the condition on line 360 was never true

361 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}") 

362 

363 self._validate_put_parameters(inMemoryDataset, ref) 

364 

365 self.datasets[ref.id] = inMemoryDataset 

366 log.debug("Store %s in %s", ref, self.name) 

367 

368 # Store time we received this content, to allow us to optionally 

369 # expire it. Instead of storing a filename here, we include the 

370 # ID of this datasetRef so we can find it from components. 

371 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass, 

372 parentID=ref.id) 

373 

374 # We have to register this content with registry. 

375 # Currently this assumes we have a file so we need to use stub entries 

376 # TODO: Add to ephemeral part of registry 

377 self._register_datasets([(ref, itemInfo)]) 

378 

379 if self._transaction is not None: 

380 self._transaction.registerUndo("put", self.remove, ref) 

    def getURIs(self, ref: DatasetRef,
                predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
        """Return URIs associated with dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        primary : `ButlerURI`
            The URI to the primary artifact associated with this dataset.
            If the dataset was disassembled within the datastore this
            may be `None`.
        components : `dict`
            URIs to any components associated with the dataset artifact.
            Can be empty if there are no components.

        Notes
        -----
        The URIs returned for in-memory datastores are not usable but
        provide an indication of the associated dataset.
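
        Examples
        --------
        Illustrative only; the primary URI embeds the ``id()`` of the
        stored object and the data ID as a query string:

        >>> primary, components = store.getURIs(ref)
        >>> primary.scheme
        'mem'
        >>> components
        {}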

408 """ 

409 

410 # Include the dataID as a URI query 

411 query = urlencode(ref.dataId) 

412 

413 # if this has never been written then we have to guess 

414 if not self.exists(ref): 

415 if not predict: 

416 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

417 name = f"{ref.datasetType.name}" 

418 fragment = "#predicted" 

419 else: 

420 realID, _ = self._get_dataset_info(ref) 

421 name = f"{id(self.datasets[realID])}?{query}" 

422 fragment = "" 

423 

424 return ButlerURI(f"mem://{name}?{query}{fragment}"), {} 

    def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
        """URI to the Dataset.

        Always uses "mem://" URI prefix.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `ButlerURI`
            URI pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        AssertionError
            Raised if an internal error occurs.
        """
        primary, _ = self.getURIs(ref, predict)
        if primary is None:
            # This should be impossible since this datastore does
            # not disassemble. This check also helps mypy.
            raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
        return primary

    def trash(self, ref: DatasetRef, ignore_errors: bool = False) -> None:
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
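
        Examples
        --------
        A sketch; trashing only marks the dataset for removal and the
        stored object is dropped when the trash is emptied:

        >>> store.trash(ref)
        >>> store.emptyTrash()
        >>> store.exists(ref)
        False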

485 """ 

486 

487 log.debug("Trash %s in datastore %s", ref, self.name) 

488 

489 # Check that this dataset is known to datastore 

490 try: 

491 self._get_dataset_info(ref) 

492 

493 # Move datasets to trash table 

494 self._move_to_trash_in_registry(ref) 

495 except Exception as e: 

496 if ignore_errors: 

497 log.warning("Error encountered moving dataset %s to trash in datastore %s: %s", 

498 ref, self.name, e) 

499 else: 

500 raise 

    def emptyTrash(self, ignore_errors: bool = False) -> None:
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Ignore errors.

        Notes
        -----
        The internal tracking of datasets is affected by this method and
        transaction handling is not supported if there is a problem before
        the datasets themselves are deleted.

        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        with self._bridge.emptyTrash() as trashed:
            for ref in trashed:
                try:
                    realID, _ = self._get_dataset_info(ref)
                except Exception as e:
                    if ignore_errors:
                        log.warning("Emptying trash in datastore %s but encountered an "
                                    "error with dataset %s: %s",
                                    self.name, ref.id, e)
                        continue
                    else:
                        raise

                # Determine whether all references to this dataset have been
                # removed and we can delete the dataset itself
                allRefs = self.related[realID]
                remainingRefs = allRefs - {ref.id}
                if not remainingRefs:
                    log.debug("Removing artifact %s from datastore %s", realID, self.name)
                    del self.datasets[realID]

                # Remove this entry
                self.removeStoredItemInfo(ref)

    def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
                              logFailures: bool = False) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method is a no-op.
        """
        return

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring is inherited from base class.
        return transfer

    def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
        # Docstring is inherited from base class.
        return

    def getLookupKeys(self) -> Set[LookupKey]:
        # Docstring is inherited from base class.
        return self.constraints.getLookupKeys()