# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""In-memory datastore."""

__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")

import time
import logging
from dataclasses import dataclass
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
)

from lsst.daf.butler import StoredDatastoreItemInfo, StorageClass
from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
from .genericDatastore import GenericBaseDatastore

if TYPE_CHECKING:
    from lsst.daf.butler import (Config, DatasetRef, DatasetType,
                                 LookupKey)
    from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
58 """Internal InMemoryDatastore Metadata associated with a stored 

59 DatasetRef. 
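
    Examples
    --------
    A minimal construction sketch, mirroring what `InMemoryDatastore.put`
    records internally (``storageClass`` and a resolved ``ref`` are assumed
    to exist):

    >>> info = StoredMemoryItemInfo(time.time(), storageClass,
    ...                             parentID=ref.id)  # doctest: +SKIP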

60 """ 

61 __slots__ = {"timestamp", "storageClass", "parentID"} 

62 

63 timestamp: float 

64 """Unix timestamp indicating the time the dataset was stored.""" 

65 

66 storageClass: StorageClass 

67 """StorageClass associated with the dataset.""" 

68 

69 parentID: int 

70 """ID of the parent `DatasetRef` if this entry is a concrete 

71 composite. Not used if the dataset being stored is not a 

72 virtual component of a composite 

73 """ 

74 

75 

76class InMemoryDatastore(GenericBaseDatastore): 

77 """Basic Datastore for writing to an in memory cache. 

78 

79 This datastore is ephemeral in that the contents of the datastore 

80 disappear when the Python process completes. This also means that 

81 other processes can not access this datastore. 

82 

83 Parameters 

84 ---------- 

85 config : `DatastoreConfig` or `str` 

86 Configuration. 

87 bridgeManager : `DatastoreRegistryBridgeManager` 

88 Object that manages the interface between `Registry` and datastores. 

89 butlerRoot : `str`, optional 

90 Unused parameter. 

91 

92 Notes 

93 ----- 

94 InMemoryDatastore does not support any file-based ingest. 
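
    Examples
    --------
    Datastores are normally constructed for you by a `Butler`; a direct
    construction sketch (``config`` and ``bridgeManager`` are assumed to
    have been obtained elsewhere) would be:

    >>> datastore = InMemoryDatastore(config, bridgeManager)  # doctest: +SKIP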

95 """ 

96 

97 defaultConfigFile = "datastores/inMemoryDatastore.yaml" 

98 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

99 absolute path. Can be None if no defaults specified. 

100 """ 

101 

102 isEphemeral = True 

103 """A new datastore is created every time and datasets disappear when 

104 the process shuts down.""" 

105 

106 datasets: Dict[int, Any] 

107 """Internal storage of datasets indexed by dataset ID.""" 

108 

109 records: Dict[int, StoredMemoryItemInfo] 

110 """Internal records about stored datasets.""" 

111 

112 def __init__(self, config: Union[Config, str], 

113 bridgeManager: DatastoreRegistryBridgeManager, 

114 butlerRoot: Optional[str] = None): 

115 super().__init__(config, bridgeManager) 

116 

117 # Name ourselves with the timestamp the datastore 

118 # was created. 

119 self.name = "{}@{}".format(type(self).__name__, time.time()) 

120 log.debug("Creating datastore %s", self.name) 

121 

122 # Storage of datasets, keyed by dataset_id 

123 self.datasets: Dict[int, Any] = {} 

124 

125 # Records is distinct in order to track concrete composite components 

126 # where we register multiple components for a single dataset. 

127 self.records: Dict[int, StoredMemoryItemInfo] = {} 

128 

129 # Related records that share the same parent 

130 self.related: Dict[int, Set[int]] = {} 
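        # Illustrative shape (IDs are made up): each record stored with
        # parentID=101 adds its own dataset ID to related[101], so this can
        # end up looking like {101: {101, 102}}.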

        self._bridge = bridgeManager.register(self.name, ephemeral=True)

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return

    @property
    def bridge(self) -> DatastoreRegistryBridge:
        # Docstring inherited from GenericBaseDatastore.
        return self._bridge

    def addStoredItemInfo(self, refs: Iterable[DatasetRef],
                          infos: Iterable[StoredMemoryItemInfo]) -> None:
        # Docstring inherited from GenericBaseDatastore.
        for ref, info in zip(refs, infos):
            if ref.id is None:
                raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
            self.records[ref.id] = info
            self.related.setdefault(info.parentID, set()).add(ref.id)

    def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
        # Docstring inherited from GenericBaseDatastore.
        if ref.id is None:
            raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
        return self.records[ref.id]

    def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
        # Docstring inherited from GenericBaseDatastore.
        return [self.getStoredItemInfo(ref)]

    def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this
        # so also ignore here
        if ref.id is None:
            raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)

    def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[int, StoredMemoryItemInfo]:
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`

        Returns
        -------
        realID : `int`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self.getStoredItemInfo(ref)
        except KeyError:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
        realID = ref.id
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}")

        return realID, storedItemInfo

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
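
        Examples
        --------
        A usage sketch (``datastore`` and a resolved ``ref`` are assumed to
        exist):

        >>> datastore.exists(ref)  # doctest: +SKIP
        True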

253 """ 

254 try: 

255 self._get_dataset_info(ref) 

256 except FileNotFoundError: 

257 return False 

258 return True 

259 

260 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

261 """Load an InMemoryDataset from the store. 

262 

263 Parameters 

264 ---------- 

265 ref : `DatasetRef` 

266 Reference to the required Dataset. 

267 parameters : `dict` 

268 `StorageClass`-specific parameters that specify, for example, 

269 a slice of the dataset to be loaded. 

270 

271 Returns 

272 ------- 

273 inMemoryDataset : `object` 

274 Requested dataset or slice thereof as an InMemoryDataset. 

275 

276 Raises 

277 ------ 

278 FileNotFoundError 

279 Requested dataset can not be retrieved. 

280 TypeError 

281 Return value from formatter has unexpected type. 

282 ValueError 

283 Formatter failed to process the dataset. 
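
        Examples
        --------
        A retrieval sketch; the ``bbox`` parameter name is illustrative
        only, since valid parameters depend on the `StorageClass` in use:

        >>> dataset = datastore.get(ref)  # doctest: +SKIP
        >>> subset = datastore.get(ref, parameters={"bbox": bbox})  # doctest: +SKIP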

284 """ 

285 

286 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

287 

288 realID, storedItemInfo = self._get_dataset_info(ref) 

289 

290 # We have a write storage class and a read storage class and they 

291 # can be different for concrete composites. 

292 readStorageClass = ref.datasetType.storageClass 

293 writeStorageClass = storedItemInfo.storageClass 

294 

295 # Check that the supplied parameters are suitable for the type read 

296 readStorageClass.validateParameters(parameters) 

297 

298 inMemoryDataset = self.datasets[realID] 

299 

300 component = ref.datasetType.component() 

301 

302 # Different storage classes implies a component request 

303 if readStorageClass != writeStorageClass: 

304 

305 if component is None: 305 ↛ 306line 305 didn't jump to line 306, because the condition on line 305 was never true

306 raise ValueError("Storage class inconsistency ({} vs {}) but no" 

307 " component requested".format(readStorageClass.name, 

308 writeStorageClass.name)) 

309 

310 # Concrete composite written as a single object (we hope) 

311 inMemoryDataset = writeStorageClass.assembler().getComponent(inMemoryDataset, component) 

312 

313 # Since there is no formatter to process parameters, they all must be 

314 # passed to the assembler. 

315 return self._post_process_get(inMemoryDataset, readStorageClass, parameters, 

316 isComponent=component is not None) 

317 

318 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

319 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

320 

321 Parameters 

322 ---------- 

323 inMemoryDataset : `object` 

324 The dataset to store. 

325 ref : `DatasetRef` 

326 Reference to the associated Dataset. 

327 

328 Raises 

329 ------ 

330 TypeError 

331 Supplied object and storage class are inconsistent. 

332 DatasetTypeNotSupportedError 

333 The associated `DatasetType` is not handled by this datastore. 

334 

335 Notes 

336 ----- 

337 If the datastore is configured to reject certain dataset types it 

338 is possible that the put will fail and raise a 

339 `DatasetTypeNotSupportedError`. The main use case for this is to 

340 allow `ChainedDatastore` to put to multiple datastores without 

341 requiring that every datastore accepts the dataset. 
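
        Examples
        --------
        A storage sketch (``inMemoryDataset`` and a resolved ``ref`` are
        assumed to exist):

        >>> datastore.put(inMemoryDataset, ref)  # doctest: +SKIP
        >>> datastore.exists(ref)  # doctest: +SKIP
        True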

342 """ 

343 

344 if ref.id is None: 344 ↛ 345line 344 didn't jump to line 345, because the condition on line 344 was never true

345 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}") 

346 

347 self._validate_put_parameters(inMemoryDataset, ref) 

348 

349 self.datasets[ref.id] = inMemoryDataset 

350 log.debug("Store %s in %s", ref, self.name) 

351 

352 # Store time we received this content, to allow us to optionally 

353 # expire it. Instead of storing a filename here, we include the 

354 # ID of this datasetRef so we can find it from components. 

355 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass, 

356 parentID=ref.id) 

357 

358 # We have to register this content with registry. 

359 # Currently this assumes we have a file so we need to use stub entries 

360 # TODO: Add to ephemeral part of registry 

361 self._register_datasets([(ref, itemInfo)]) 

362 

363 if self._transaction is not None: 

364 self._transaction.registerUndo("put", self.remove, ref) 

365 

366 def getUri(self, ref: DatasetRef, predict: bool = False) -> str: 

367 """URI to the Dataset. 

368 

369 Always uses "mem://" URI prefix. 

370 

371 Parameters 

372 ---------- 

373 ref : `DatasetRef` 

374 Reference to the required Dataset. 

375 predict : `bool` 

376 If `True`, allow URIs to be returned of datasets that have not 

377 been written. 

378 

379 Returns 

380 ------- 

381 uri : `str` 

382 URI string pointing to the dataset within the datastore. If the 

383 dataset does not exist in the datastore, and if ``predict`` is 

384 `True`, the URI will be a prediction and will include a URI 

385 fragment "#predicted". 

386 If the datastore does not have entities that relate well 

387 to the concept of a URI the returned URI string will be 

388 descriptive. The returned URI is not guaranteed to be obtainable. 

389 

390 Raises 

391 ------ 

392 FileNotFoundError 

393 A URI has been requested for a dataset that does not exist and 

394 guessing is not allowed. 
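
        Examples
        --------
        A sketch of the two forms of return value; the names and the values
        shown are illustrative only:

        >>> datastore.getUri(ref)  # doctest: +SKIP
        'mem://140229982682832'
        >>> datastore.getUri(unstored_ref, predict=True)  # doctest: +SKIP
        'mem://calexp#predicted'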

        """

        # if this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
            name = "{}#predicted".format(ref.datasetType.name)
        else:
            realID, _ = self._get_dataset_info(ref)
            name = '{}'.format(id(self.datasets[realID]))

        return "mem://{}".format(name)

    def trash(self, ref: DatasetRef, ignore_errors: bool = False) -> None:
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
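
        Examples
        --------
        Removal is a two-step process; a sketch (a resolved ``ref`` that is
        present in this datastore is assumed):

        >>> datastore.trash(ref)  # doctest: +SKIP
        >>> datastore.emptyTrash()  # doctest: +SKIP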

429 """ 

430 

431 log.debug("Trash %s in datastore %s", ref, self.name) 

432 

433 # Check that this dataset is known to datastore 

434 try: 

435 self._get_dataset_info(ref) 

436 

437 # Move datasets to trash table 

438 self._move_to_trash_in_registry(ref) 

439 except Exception as e: 

440 if ignore_errors: 

441 log.warning("Error encountered moving dataset %s to trash in datastore %s: %s", 

442 ref, self.name, e) 

443 else: 

444 raise 

445 

446 def emptyTrash(self, ignore_errors: bool = False) -> None: 

447 """Remove all datasets from the trash. 

448 

449 Parameters 

450 ---------- 

451 ignore_errors : `bool`, optional 

452 Ignore errors. 

453 

454 Notes 

455 ----- 

456 The internal tracking of datasets is affected by this method and 

457 transaction handling is not supported if there is a problem before 

458 the datasets themselves are deleted. 

459 

460 Concurrency should not normally be an issue for the in memory datastore 

461 since all internal changes are isolated to solely this process and 

462 the registry only changes rows associated with this process. 

463 """ 

464 log.debug("Emptying trash in datastore %s", self.name) 

465 with self._bridge.emptyTrash() as trashed: 

466 for ref in trashed: 

467 try: 

468 realID, _ = self._get_dataset_info(ref) 

469 except Exception as e: 

470 if ignore_errors: 

471 log.warning("Emptying trash in datastore %s but encountered an " 

472 "error with dataset %s: %s", 

473 self.name, ref.id, e) 

474 continue 

475 else: 

476 raise 

477 

478 # Determine whether all references to this dataset have been 

479 # removed and we can delete the dataset itself 

480 allRefs = self.related[realID] 

481 theseRefs = {r.id for r in ref.allRefs()} 

482 remainingRefs = allRefs - theseRefs 

483 if not remainingRefs: 

484 log.debug("Removing artifact %s from datastore %s", realID, self.name) 

485 del self.datasets[realID] 

486 

487 # Remove this entry 

488 self.removeStoredItemInfo(ref) 

489 

490 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

491 logFailures: bool = False) -> None: 

492 """Validate some of the configuration for this datastore. 

493 

494 Parameters 

495 ---------- 

496 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

497 Entities to test against this configuration. Can be differing 

498 types. 

499 logFailures : `bool`, optional 

500 If `True`, output a log message for every validation error 

501 detected. 

502 

503 Raises 

504 ------ 

505 DatastoreValidationError 

506 Raised if there is a validation problem with a configuration. 

507 All the problems are reported in a single exception. 

508 

509 Notes 

510 ----- 

511 This method is a no-op. 

512 """ 

513 return 

514 

515 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]: 

516 # Docstring is inherited from base class 

517 return transfer 

518 

519 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

520 # Docstring is inherited from base class 

521 return 

522 

523 def getLookupKeys(self) -> Set[LookupKey]: 

524 # Docstring is inherited from base class 

525 return self.constraints.getLookupKeys()