# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""In-memory datastore."""

__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")

import time
import logging
from dataclasses import dataclass
from typing import Dict, Optional, Any

from lsst.daf.butler import StoredDatastoreItemInfo, StorageClass
from .genericDatastore import GenericBaseDatastore

log = logging.getLogger(__name__)


@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal InMemoryDatastore metadata associated with a stored
    DatasetRef.
    """
    __slots__ = {"timestamp", "storageClass", "parentID"}

    timestamp: float
    """Unix timestamp indicating the time the dataset was stored."""

    storageClass: StorageClass
    """StorageClass associated with the dataset."""

    parentID: Optional[int]
    """ID of the parent `DatasetRef` if this entry is a concrete
    composite. Not used if the dataset being stored is not a
    virtual component of a composite.
    """

class InMemoryDatastore(GenericBaseDatastore):
    """Basic Datastore for writing to an in-memory cache.

    This datastore is ephemeral in that the contents of the datastore
    disappear when the Python process completes. This also means that
    other processes cannot access this datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.
    registry : `Registry`, optional
        Unused parameter.
    butlerRoot : `str`, optional
        Unused parameter.

    Notes
    -----
    InMemoryDatastore does not support any file-based ingest.
    """

    defaultConfigFile = "datastores/inMemoryDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    isEphemeral = True
    """A new datastore is created every time and datasets disappear when
    the process shuts down."""

    datasets: Dict[int, Any]
    """Internal storage of datasets indexed by dataset ID."""

    records: Dict[int, StoredMemoryItemInfo]
    """Internal records about stored datasets."""

    def __init__(self, config, registry=None, butlerRoot=None):
        super().__init__(config, registry)

        # Name ourselves with the timestamp the datastore
        # was created.
        self.name = "{}@{}".format(type(self).__name__, time.time())
        log.debug("Creating datastore %s", self.name)

        # Storage of datasets, keyed by dataset_id
        self.datasets = {}

        # Records is distinct in order to track concrete composite components
        # where we register multiple components for a single dataset.
        self.records = {}

        # Related records that share the same parent
        self.related = {}

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return

    def addStoredItemInfo(self, refs, infos):
        # Docstring inherited from GenericBaseDatastore.
        for ref, info in zip(refs, infos):
            self.records[ref.id] = info
            self.related.setdefault(info.parentID, set()).add(ref.id)
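            # ``related`` maps a parent dataset ID to the set of ref IDs
            # registered against it, so the stored object for a composite is
            # only discarded once every associated ref has been removed.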

    def getStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        return self.records[ref.id]

    def getStoredItemsInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        return [self.getStoredItemInfo(ref)]

    def removeStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this
        # so also ignore here
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)
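        # Dropping the ref from ``related`` as well keeps the bookkeeping
        # used by ``emptyTrash`` consistent with ``records``.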

    def _get_dataset_info(self, ref):
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`

        Returns
        -------
        realID : `int`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self.getStoredItemInfo(ref)
        except KeyError:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
        realID = ref.id
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID
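            # Components share the in-memory object stored for their parent
            # composite, so lookups are redirected to the parent's dataset ID.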

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}")

        return realID, storedItemInfo

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        try:
            self._get_dataset_info(ref)
        except FileNotFoundError:
            return False
        return True

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """

        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        realID, storedItemInfo = self._get_dataset_info(ref)

        # We have a write storage class and a read storage class and they
        # can be different for concrete composites.
        readStorageClass = ref.datasetType.storageClass
        writeStorageClass = storedItemInfo.storageClass

        # Check that the supplied parameters are suitable for the type read
        readStorageClass.validateParameters(parameters)

        inMemoryDataset = self.datasets[realID]

        component = ref.datasetType.component()

        # Differing storage classes imply a component request
        if readStorageClass != writeStorageClass:

            if component is None:
                raise ValueError("Storage class inconsistency ({} vs {}) but no"
                                 " component requested".format(readStorageClass.name,
                                                               writeStorageClass.name))

            # Concrete composite written as a single object (we hope)
            inMemoryDataset = writeStorageClass.assembler().getComponent(inMemoryDataset, component)

        # Since there is no formatter to process parameters, they all must be
        # passed to the assembler.
        return self._post_process_get(inMemoryDataset, readStorageClass, parameters,
                                      isComponent=component is not None)

    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """

        self._validate_put_parameters(inMemoryDataset, ref)

        self.datasets[ref.id] = inMemoryDataset
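        # The object is stored by reference; this datastore performs no
        # copying or serialization.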

        log.debug("Store %s in %s", ref, self.name)

        # Store time we received this content, to allow us to optionally
        # expire it. Instead of storing a filename here, we include the
        # ID of this datasetRef so we can find it from components.
        itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
                                        parentID=ref.id)

        # We have to register this content with registry.
        # Currently this assumes we have a file so we need to use stub entries
        # TODO: Add to ephemeral part of registry
        self._register_datasets([(ref, itemInfo)])

        if self._transaction is not None:
            self._transaction.registerUndo("put", self.remove, ref)
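            # If the enclosing transaction is rolled back, this undo handler
            # calls ``self.remove`` to take the dataset back out of the store.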

    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        Always uses "mem://" URI prefix.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI, the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """

        # If this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
            name = "{}#predicted".format(ref.datasetType.name)
        else:
            realID, _ = self._get_dataset_info(ref)
            name = '{}'.format(id(self.datasets[realID]))
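            # There is no file behind an in-memory dataset, so the id() of
            # the stored Python object serves as a unique token in the URI.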

        return "mem://{}".format(name)

    def trash(self, ref, ignore_errors=False):
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to this process,
        and the registry only changes rows associated with this process.
        """

        log.debug("Trash %s in datastore %s", ref, self.name)

        # Check that this dataset is known to datastore
        try:
            self._get_dataset_info(ref)

            # Move datasets to trash table
            self._move_to_trash_in_registry(ref)
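            # Nothing is deleted yet; the stored object is only discarded
            # when ``emptyTrash`` is called.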

        except Exception as e:
            if ignore_errors:
                log.warning("Error encountered moving dataset %s to trash in datastore %s: %s",
                            ref, self.name, e)
            else:
                raise

    def emptyTrash(self, ignore_errors=False):
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Ignore errors.

        Notes
        -----
        This method modifies the internal tracking of datasets; transactional
        rollback is not supported if a problem occurs before the datasets
        themselves are deleted.

        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to this process,
        and the registry only changes rows associated with this process.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        trashed = self.registry.getTrashedDatasets(self.name)
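        # ``trashed`` holds the refs previously handed to ``trash`` and
        # recorded in the registry's trash table for this datastore.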

        for ref in trashed:
            try:
                realID, _ = self._get_dataset_info(ref)
            except Exception as e:
                if ignore_errors:
                    log.warning("Emptying trash in datastore %s but encountered an error with dataset %s: %s",
                                self.name, ref.id, e)
                    continue
                else:
                    raise

            # Determine whether all references to this dataset have been
            # removed and we can delete the dataset itself
            allRefs = self.related[realID]
            theseRefs = {r.id for r in ref.flatten([ref])}
            remainingRefs = allRefs - theseRefs
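            # Any refs left over (for example other components registered
            # against the same parent) still point at the stored object.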

            if not remainingRefs:
                log.debug("Removing artifact %s from datastore %s", realID, self.name)
                del self.datasets[realID]

            # Remove this entry
            self.removeStoredItemInfo(ref)

        # Inform registry that we have handled these items
        # This should work even if another process is clearing out those rows
        self.registry.emptyDatasetLocationsTrash(self.name, trashed)

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method is a no-op.
        """
        return

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring is inherited from base class
        return transfer

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        return

    def getLookupKeys(self):
        # Docstring is inherited from base class
        return self.constraints.getLookupKeys()