# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""In-memory datastore."""

__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")

import time
import logging
from dataclasses import dataclass
from typing import Dict, Optional, Any

from lsst.daf.butler import StoredDatastoreItemInfo, StorageClass
from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
from .genericDatastore import GenericBaseDatastore

log = logging.getLogger(__name__)


@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal InMemoryDatastore metadata associated with a stored
    DatasetRef.
    """
    __slots__ = {"timestamp", "storageClass", "parentID"}

    timestamp: float
    """Unix timestamp indicating the time the dataset was stored."""

    storageClass: StorageClass
    """StorageClass associated with the dataset."""

    parentID: Optional[int]
    """ID of the parent `DatasetRef` if this entry is a concrete
    composite. Not used if the dataset being stored is not a
    virtual component of a composite.
    """
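
# Note (illustrative only, not part of the runtime logic): ``InMemoryDatastore.put``
# below creates these records roughly as
#
#     info = StoredMemoryItemInfo(timestamp=time.time(),
#                                 storageClass=ref.datasetType.storageClass,
#                                 parentID=ref.id)
#
# so that component refs belonging to a concrete composite can later be
# resolved through the shared ``parentID``.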

class InMemoryDatastore(GenericBaseDatastore):
    """Basic Datastore for writing to an in-memory cache.

    This datastore is ephemeral in that the contents of the datastore
    disappear when the Python process completes.  This also means that
    other processes cannot access this datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        Unused parameter.

    Notes
    -----
    InMemoryDatastore does not support any file-based ingest.
    """
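
    # Configuration sketch (an assumption about the standard daf_butler config
    # layout; the shipped defaults live in ``datastores/inMemoryDatastore.yaml``).
    # A butler config selecting this datastore would look roughly like:
    #
    #     datastore:
    #       cls: lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore
    #
    # The exact keys are illustrative; consult the shipped defaults.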

    defaultConfigFile = "datastores/inMemoryDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    isEphemeral = True
    """A new datastore is created every time and datasets disappear when
    the process shuts down."""

    datasets: Dict[int, Any]
    """Internal storage of datasets indexed by dataset ID."""

    records: Dict[int, StoredMemoryItemInfo]
    """Internal records about stored datasets."""

    def __init__(self, config, bridgeManager, butlerRoot=None):
        super().__init__(config, bridgeManager)

        # Name ourselves with the timestamp the datastore
        # was created.
        self.name = "{}@{}".format(type(self).__name__, time.time())
        log.debug("Creating datastore %s", self.name)

        # Storage of datasets, keyed by dataset_id
        self.datasets = {}

        # Records is distinct in order to track concrete composite components
        # where we register multiple components for a single dataset.
        self.records = {}

        # Related records that share the same parent
        self.related = {}

        self._bridge = bridgeManager.register(self.name, ephemeral=True)
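
        # Illustrative sketch of the bookkeeping (not executed): after a
        # ``put(obj, ref)`` for a ref with dataset ID 5 the state is roughly
        #     self.datasets == {5: obj}
        #     self.records  == {5: StoredMemoryItemInfo(..., parentID=5)}
        #     self.related  == {5: {5}}
        # with any component refs of a concrete composite adding their own
        # IDs to the set keyed by the shared parent ID.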

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return

    @property
    def bridge(self) -> DatastoreRegistryBridge:
        # Docstring inherited from GenericBaseDatastore.
        return self._bridge

    def addStoredItemInfo(self, refs, infos):
        # Docstring inherited from GenericBaseDatastore.
        for ref, info in zip(refs, infos):
            self.records[ref.id] = info
            self.related.setdefault(info.parentID, set()).add(ref.id)

    def getStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        return self.records[ref.id]

    def getStoredItemsInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        return [self.getStoredItemInfo(ref)]

    def removeStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this,
        # so we also ignore it here.
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)

    def _get_dataset_info(self, ref):
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`.

        Returns
        -------
        realID : `int`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self.getStoredItemInfo(ref)
        except KeyError:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
        realID = ref.id
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}")

        return realID, storedItemInfo

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        try:
            self._get_dataset_info(ref)
        except FileNotFoundError:
            return False
        return True

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """

        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        realID, storedItemInfo = self._get_dataset_info(ref)

        # We have a write storage class and a read storage class and they
        # can be different for concrete composites.
        readStorageClass = ref.datasetType.storageClass
        writeStorageClass = storedItemInfo.storageClass

        # Check that the supplied parameters are suitable for the type read
        readStorageClass.validateParameters(parameters)

        inMemoryDataset = self.datasets[realID]

        component = ref.datasetType.component()
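        # Illustrative example (the component naming is an assumption): a
        # request for the "wcs" component of a composite stored as a full
        # exposure arrives with the component's read storage class while the
        # record keeps the composite's write storage class, which triggers
        # the component extraction below.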

        # Different storage classes implies a component request
        if readStorageClass != writeStorageClass:

            if component is None:
                raise ValueError("Storage class inconsistency ({} vs {}) but no"
                                 " component requested".format(readStorageClass.name,
                                                               writeStorageClass.name))

            # Concrete composite written as a single object (we hope)
            inMemoryDataset = writeStorageClass.assembler().getComponent(inMemoryDataset, component)

        # Since there is no formatter to process parameters, they all must be
        # passed to the assembler.
        return self._post_process_get(inMemoryDataset, readStorageClass, parameters,
                                      isComponent=component is not None)

    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`.  The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """

        self._validate_put_parameters(inMemoryDataset, ref)

        self.datasets[ref.id] = inMemoryDataset
        log.debug("Store %s in %s", ref, self.name)

        # Store time we received this content, to allow us to optionally
        # expire it. Instead of storing a filename here, we include the
        # ID of this datasetRef so we can find it from components.
        itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
                                        parentID=ref.id)

        # We have to register this content with registry.
        # Currently this assumes we have a file so we need to use stub entries
        # TODO: Add to ephemeral part of registry
        self._register_datasets([(ref, itemInfo)])

        if self._transaction is not None:
            self._transaction.registerUndo("put", self.remove, ref)

    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        Always uses "mem://" URI prefix.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`, optional
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """

        # If this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
            name = "{}#predicted".format(ref.datasetType.name)
        else:
            realID, _ = self._get_dataset_info(ref)
            name = "{}".format(id(self.datasets[realID]))
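
        # Example of the returned form (illustrative values): an existing
        # dataset yields something like "mem://140231360352400", using the
        # ``id()`` of the stored object, while a predicted URI looks like
        # "mem://calexp#predicted" for a dataset type named "calexp".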

        return "mem://{}".format(name)

    def trash(self, ref, ignore_errors=False):
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """

        log.debug("Trash %s in datastore %s", ref, self.name)

        # Check that this dataset is known to datastore
        try:
            self._get_dataset_info(ref)

            # Move datasets to trash table
            self._move_to_trash_in_registry(ref)
        except Exception as e:
            if ignore_errors:
                log.warning("Error encountered moving dataset %s to trash in datastore %s: %s",
                            ref, self.name, e)
            else:
                raise

    def emptyTrash(self, ignore_errors=False):
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Ignore errors.

        Notes
        -----
        The internal tracking of datasets is affected by this method and
        transaction handling is not supported if there is a problem before
        the datasets themselves are deleted.

        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        with self._bridge.emptyTrash() as trashed:
            for ref in trashed:
                try:
                    realID, _ = self._get_dataset_info(ref)
                except Exception as e:
                    if ignore_errors:
                        log.warning("Emptying trash in datastore %s but encountered an "
                                    "error with dataset %s: %s",
                                    self.name, ref.id, e)
                        continue
                    else:
                        raise

                # Determine whether all references to this dataset have been
                # removed and we can delete the dataset itself
                allRefs = self.related[realID]
                theseRefs = {r.id for r in ref.flatten([ref])}
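                # ``ref.flatten`` is assumed here to expand the ref into
                # itself plus any component refs, so every record sharing
                # this parent ID is accounted for before the stored object
                # is dropped.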

                remainingRefs = allRefs - theseRefs
                if not remainingRefs:
                    log.debug("Removing artifact %s from datastore %s", realID, self.name)
                    del self.datasets[realID]

                # Remove this entry
                self.removeStoredItemInfo(ref)

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration.  Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method is a no-op.
        """
        return

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring is inherited from base class
        return transfer

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        return

    def getLookupKeys(self):
        # Docstring is inherited from base class
        return self.constraints.getLookupKeys()