# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""In-memory datastore."""

__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")

import time
import logging
from dataclasses import dataclass
from typing import Dict, Optional, Any

from lsst.daf.butler import StoredDatastoreItemInfo, StorageClass
from .genericDatastore import GenericBaseDatastore

log = logging.getLogger(__name__)

@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal InMemoryDatastore metadata associated with a stored
    DatasetRef.
    """
    __slots__ = {"timestamp", "storageClass", "parentID"}

    timestamp: float
    """Unix timestamp indicating the time the dataset was stored."""

    storageClass: StorageClass
    """StorageClass associated with the dataset."""

    parentID: Optional[int]
    """ID of the parent `DatasetRef` if this entry is a concrete
    composite. Not used if the dataset being stored is not a
    virtual component of a composite.
    """


class InMemoryDatastore(GenericBaseDatastore):
    """Basic Datastore for writing to an in-memory cache.

    This datastore is ephemeral in that the contents of the datastore
    disappear when the Python process completes. This also means that
    other processes cannot access this datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.
    registry : `Registry`, optional
        Unused parameter.
    butlerRoot : `str`, optional
        Unused parameter.

    Notes
    -----
    InMemoryDatastore does not support any file-based ingest.
    """

    defaultConfigFile = "datastores/inMemoryDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    isEphemeral = True
    """A new datastore is created every time and datasets disappear when
    the process shuts down."""

    datasets: Dict[int, Any]
    """Internal storage of datasets indexed by dataset ID."""

    records: Dict[int, StoredMemoryItemInfo]
    """Internal records about stored datasets."""

    def __init__(self, config, registry=None, butlerRoot=None):
        super().__init__(config, registry)

        # Name ourselves with the timestamp the datastore
        # was created.
        self.name = "{}@{}".format(type(self).__name__, time.time())
        log.debug("Creating datastore %s", self.name)

        # Storage of datasets, keyed by dataset_id
        self.datasets = {}

        # Records is distinct in order to track concrete composite components
        # where we register multiple components for a single dataset.
        self.records = {}

        # Related records that share the same parent
        self.related = {}
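        # ``related`` maps a parent dataset ID to the set of dataset IDs
        # (the parent and any of its components) that refer to the same
        # stored object; emptyTrash() uses it to decide when that object
        # can finally be deleted.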

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return

    def addStoredItemInfo(self, refs, infos):
        # Docstring inherited from GenericBaseDatastore.
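        # Each ref is also grouped under its parent ID so that all refs
        # sharing the same stored object can be found again later.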

        for ref, info in zip(refs, infos):
            self.records[ref.id] = info
            self.related.setdefault(info.parentID, set()).add(ref.id)

    def getStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        return self.records[ref.id]

    def removeStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this
        # so also ignore here.
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)
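        # ``related`` must stay in sync with ``records`` so that emptyTrash()
        # can tell when the stored object is no longer referenced by any ref.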

    def _get_dataset_info(self, ref):
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`.

        Returns
        -------
        realID : `int`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self.getStoredItemInfo(ref)
        except KeyError:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
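        # A component shares the Python object stored for its parent, so
        # resolve the lookup to the parent ID whenever one is recorded.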

        realID = ref.id
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}")

        return realID, storedItemInfo

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        try:
            self._get_dataset_info(ref)
        except FileNotFoundError:
            return False
        return True

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """

        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        realID, storedItemInfo = self._get_dataset_info(ref)

        # We have a write storage class and a read storage class and they
        # can be different for concrete composites.
        readStorageClass = ref.datasetType.storageClass
        writeStorageClass = storedItemInfo.storageClass

        # Check that the supplied parameters are suitable for the type read
        readStorageClass.validateParameters(parameters)

        inMemoryDataset = self.datasets[realID]

        component = ref.datasetType.component()

        # Different storage classes imply a component request
        if readStorageClass != writeStorageClass:

            if component is None:
                raise ValueError("Storage class inconsistency ({} vs {}) but no"
                                 " component requested".format(readStorageClass.name,
                                                               writeStorageClass.name))

            # Concrete composite written as a single object (we hope)
            inMemoryDataset = writeStorageClass.assembler().getComponent(inMemoryDataset, component)

        # Since there is no formatter to process parameters, they all must be
        # passed to the assembler.
        return self._post_process_get(inMemoryDataset, readStorageClass, parameters,
                                      isComponent=component is not None)

    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """

        self._validate_put_parameters(inMemoryDataset, ref)

        self.datasets[ref.id] = inMemoryDataset
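        # The object is stored by reference rather than copied, so the caller
        # and the datastore share the same Python instance.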

        log.debug("Store %s in %s", ref, self.name)

        # Store time we received this content, to allow us to optionally
        # expire it. Instead of storing a filename here, we include the
        # ID of this datasetRef so we can find it from components.
        itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
                                        parentID=ref.id)

        # We have to register this content with registry.
        # Currently this assumes we have a file so we need to use stub entries
        # TODO: Add to ephemeral part of registry
        self._register_datasets([(ref, itemInfo)])

        if self._transaction is not None:
            self._transaction.registerUndo("put", self.remove, ref)

    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        Always uses "mem://" URI prefix.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned for datasets that have not
            yet been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI, the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """

        # if this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
            name = "{}#predicted".format(ref.datasetType.name)
        else:
            name = '{}'.format(id(self.datasets[ref.id]))
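            # The id() of the stored object serves purely as an opaque token;
            # the resulting URI cannot be dereferenced outside this process.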

        return "mem://{}".format(name)

    def trash(self, ref, ignore_errors=False):
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to this process
        and the registry only changes rows associated with this process.
        """

        log.debug("Trash %s in datastore %s", ref, self.name)

        # Check that this dataset is known to datastore
        try:
            self._get_dataset_info(ref)

            # Move datasets to trash table
            self._move_to_trash_in_registry(ref)
        except Exception as e:
            if ignore_errors:
                log.warning("Error encountered moving dataset %s to trash in datastore %s: %s",
                            ref, self.name, e)
            else:
                raise

    def emptyTrash(self, ignore_errors=False):
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Ignore errors.

        Notes
        -----
        The internal tracking of datasets is affected by this method and
        transaction handling is not supported if there is a problem before
        the datasets themselves are deleted.

        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to this process
        and the registry only changes rows associated with this process.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        trashed = self.registry.getTrashedDatasets(self.name)
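        # ``trashed`` holds the refs that trash() previously moved to the
        # registry's trash table for this datastore.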

        for ref in trashed:
            try:
                realID, _ = self._get_dataset_info(ref)
            except Exception as e:
                if ignore_errors:
                    log.warning("Emptying trash in datastore %s but encountered an error with dataset %s: %s",
                                self.name, ref.id, e)
                    continue
                else:
                    raise

            # Determine whether all references to this dataset have been
            # removed and we can delete the dataset itself
            allRefs = self.related[realID]
            theseRefs = {r.id for r in ref.flatten([ref])}
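            # flatten() expands the ref so that the IDs of any attached
            # component refs are discounted along with the parent itself.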

            remainingRefs = allRefs - theseRefs
            if not remainingRefs:
                log.debug("Removing artifact %s from datastore %s", realID, self.name)
                del self.datasets[realID]

            # Remove this entry
            self.removeStoredItemInfo(ref)

        # Inform registry that we have handled these items
        # This should work even if another process is clearing out those rows
        self.registry.emptyDatasetLocationsTrash(self.name, trashed)

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method is a no-op.
        """
        return

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring is inherited from base class
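        # Transfer modes have no meaning for an in-memory store, so the
        # requested value is passed back unchanged.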

        return transfer

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        return

    def getLookupKeys(self):
        # Docstring is inherited from base class
        return self.constraints.getLookupKeys()