# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""In-memory datastore."""

__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")

import time
import logging
from dataclasses import dataclass
from typing import Dict, Optional, Any

from lsst.daf.butler import StoredDatastoreItemInfo, StorageClass
from .genericDatastore import GenericBaseDatastore

log = logging.getLogger(__name__)


@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal InMemoryDatastore metadata associated with a stored
    DatasetRef.
    """

    __slots__ = {"timestamp", "storageClass", "parentID"}

    timestamp: float
    """Unix timestamp indicating the time the dataset was stored."""

    storageClass: StorageClass
    """StorageClass associated with the dataset."""

    parentID: Optional[int]
    """ID of the parent `DatasetRef` if this entry is a concrete
    composite. Not used if the dataset being stored is not a
    virtual component of a composite.
    """

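# A minimal sketch of how a StoredMemoryItemInfo record is constructed,
# mirroring what ``put()`` does below; ``ref`` is assumed to be a resolved
# `DatasetRef` carrying an integer ``id``:
#
#     info = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
#                                 parentID=ref.id)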

class InMemoryDatastore(GenericBaseDatastore):
    """Basic Datastore for writing to an in-memory cache.

    This datastore is ephemeral in that the contents of the datastore
    disappear when the Python process completes. This also means that
    other processes cannot access this datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.
    registry : `Registry`, optional
        Unused parameter.
    butlerRoot : `str`, optional
        Unused parameter.

    Notes
    -----
    InMemoryDatastore does not support any file-based ingest.
    """


    defaultConfigFile = "datastores/inMemoryDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    isEphemeral = True
    """A new datastore is created every time and datasets disappear when
    the process shuts down."""

    datasets: Dict[int, Any]
    """Internal storage of datasets indexed by dataset ID."""

    records: Dict[int, StoredMemoryItemInfo]
    """Internal records about stored datasets."""


    def __init__(self, config, registry=None, butlerRoot=None):
        super().__init__(config, registry)

        # Name ourselves with the timestamp the datastore
        # was created.
        self.name = "{}@{}".format(type(self).__name__, time.time())
        log.debug("Creating datastore %s", self.name)

        # Storage of datasets, keyed by dataset_id
        self.datasets = {}

        # Records is distinct in order to track concrete composite components
        # where we register multiple components for a single dataset.
        self.records = {}

        # Related records that share the same parent
        self.related = {}

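    # Illustrative sketch (with invented IDs) of the internal bookkeeping
    # after a single ``put()`` of a dataset whose ``ref.id`` is 42:
    #
    #     self.datasets == {42: <the stored Python object>}
    #     self.records == {42: StoredMemoryItemInfo(..., parentID=42)}
    #     self.related == {42: {42}}
    #
    # ``related`` groups the IDs of all refs that share a parent so that the
    # underlying object is only deleted once no related refs remain.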

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return


    def addStoredItemInfo(self, refs, infos):
        # Docstring inherited from GenericBaseDatastore.
        for ref, info in zip(refs, infos):
            self.records[ref.id] = info
            self.related.setdefault(info.parentID, set()).add(ref.id)

    def getStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        return self.records[ref.id]

    def removeStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this
        # so also ignore here
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)


    def _get_dataset_info(self, ref):
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`.

        Returns
        -------
        realID : `int`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self.getStoredItemInfo(ref)
        except KeyError:
            raise FileNotFoundError(f"No such dataset in memory: {ref}") from None
        realID = ref.id
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such dataset in memory: {ref}")

        return realID, storedItemInfo


    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        try:
            self._get_dataset_info(ref)
        except FileNotFoundError:
            return False
        return True


    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """

        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        realID, storedItemInfo = self._get_dataset_info(ref)

        # We have a write storage class and a read storage class and they
        # can be different for concrete composites.
        readStorageClass = ref.datasetType.storageClass
        writeStorageClass = storedItemInfo.storageClass

        # Check that the supplied parameters are suitable for the type read
        readStorageClass.validateParameters(parameters)

        inMemoryDataset = self.datasets[realID]

        # Different storage classes imply a component request
        if readStorageClass != writeStorageClass:

            component = ref.datasetType.component()

            if component is None:
                raise ValueError("Storage class inconsistency ({} vs {}) but no"
                                 " component requested".format(readStorageClass.name,
                                                               writeStorageClass.name))

            # Concrete composite written as a single object (we hope)
            inMemoryDataset = writeStorageClass.assembler().getComponent(inMemoryDataset, component)

        # Since there is no formatter to process parameters, they all must be
        # passed to the assembler.
        return self._post_process_get(inMemoryDataset, readStorageClass, parameters)

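    # A hedged illustration of the component branch above: if a composite was
    # put with write storage class "Exposure" and the caller's ref requests
    # the "wcs" component, the read storage class differs, so the write
    # storage class's assembler extracts just that component from the stored
    # object. (The storage class and component names here are illustrative
    # assumptions, not taken from this module.)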

    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """

        self._validate_put_parameters(inMemoryDataset, ref)

        self.datasets[ref.id] = inMemoryDataset
        log.debug("Store %s in %s", ref, self.name)

        # Store time we received this content, to allow us to optionally
        # expire it. Instead of storing a filename here, we include the
        # ID of this datasetRef so we can find it from components.
        itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
                                        parentID=ref.id)

        # We have to register this content with registry.
        # Currently this assumes we have a file so we need to use stub entries
        # TODO: Add to ephemeral part of registry
        self._register_datasets([(ref, itemInfo)])

        if self._transaction is not None:
            self._transaction.registerUndo("put", self.remove, ref)


    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        Always uses "mem://" URI prefix.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`, optional
            If `True`, allow URIs to be returned for datasets that have not
            yet been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """

        # If this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
            name = "{}#predicted".format(ref.datasetType.name)
        else:
            name = "{}".format(id(self.datasets[ref.id]))

        return "mem://{}".format(name)

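    # Hedged examples of the strings getUri() can return (the object id and
    # dataset type name below are invented for illustration):
    #
    #     store.getUri(ref)                 -> "mem://140399522620816"
    #     store.getUri(ref, predict=True)   -> "mem://deepCoadd#predicted"
    #
    # The second form applies only when the dataset has not been stored.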

    def trash(self, ref, ignore_errors=False):
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """

        log.debug("Trash %s in datastore %s", ref, self.name)

        # Check that this dataset is known to datastore
        try:
            self._get_dataset_info(ref)

            # Move datasets to trash table
            self._move_to_trash_in_registry(ref)
        except Exception as e:
            if ignore_errors:
                log.warning("Error encountered moving dataset %s to trash in datastore %s: %s",
                            ref, self.name, e)
            else:
                raise


    def emptyTrash(self, ignore_errors=False):
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Ignore errors.

        Notes
        -----
        The internal tracking of datasets is affected by this method and
        transaction handling is not supported if there is a problem before
        the datasets themselves are deleted.

        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        trashed = self.registry.getTrashedDatasets(self.name)

        for ref in trashed:
            try:
                realID, _ = self._get_dataset_info(ref)
            except Exception as e:
                if ignore_errors:
                    log.warning("Emptying trash in datastore %s but encountered an error with dataset %s: %s",
                                self.name, ref.id, e)
                    continue
                else:
                    raise

            # Determine whether all references to this dataset have been
            # removed and we can delete the dataset itself
            allRefs = self.related[realID]
            theseRefs = {r.id for r in ref.flatten([ref])}
            remainingRefs = allRefs - theseRefs
            if not remainingRefs:
                log.debug("Removing artifact %s from datastore %s", realID, self.name)
                del self.datasets[realID]

            # Remove this entry
            self.removeStoredItemInfo(ref)

        # Inform registry that we have handled these items
        # This should work even if another process is clearing out those rows
        self.registry.emptyDatasetLocationsTrash(self.name, trashed)

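    # Sketch of the related-refs bookkeeping above, with invented IDs: if a
    # parent dataset 7 was stored and component refs 8 and 9 were registered
    # against it, self.related[7] == {7, 8, 9}. Trashing only ref 8 leaves
    # remaining refs {7, 9}, so the object under self.datasets[7] is kept
    # until every related ref has been trashed.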

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method is a no-op.
        """
        return


    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring is inherited from base class
        return transfer

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        return

    def getLookupKeys(self):
        # Docstring is inherited from base class
        return self.constraints.getLookupKeys()
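
# A hedged end-to-end usage sketch; ``config``, ``registry``, ``ref`` and
# ``my_object`` are assumptions for illustration, not part of this module:
#
#     store = InMemoryDatastore(config, registry=registry)
#     store.put(my_object, ref)      # cached in memory under ref.id
#     assert store.exists(ref)
#     retrieved = store.get(ref)
#     store.trash(ref)
#     store.emptyTrash()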