from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

from typing import (
    Any,
    Dict,
    Iterator,
    Optional,
    Tuple,
    Type,
    TYPE_CHECKING,
)

import copy
import sqlalchemy

from lsst.daf.butler import (
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    DimensionUniverse,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError
from lsst.daf.butler.registry.interfaces import (
    DatasetRecordStorage,
    DatasetRecordStorageManager,
    VersionTuple,
)

from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)
from .summaries import CollectionSummaryManager
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from ...summaries import CollectionSummary

if TYPE_CHECKING:
    from lsst.daf.butler.registry.interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple

# These have to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)
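# The integer-keyed and UUID-keyed managers defined at the bottom of this
# module carry separate version constants so that their schemas can evolve
# independently, even though both currently sit at 1.0.0.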


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table
    for each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least)
    to try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but
    it is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """

    def __init__(
        self, *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(cls, collections: Type[CollectionManager],
                             universe: DimensionUniverse) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(collections, universe=universe,
                                    dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement)
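    # For illustration only: a call mirroring the one in ``initialize``
    # above, where ``collections`` and ``dimensions`` are assumed to be the
    # managers supplied by the surrounding Registry setup:
    #
    #     specs = cls.makeStaticTableSpecs(type(collections),
    #                                      universe=dimensions.universe)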

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *, name: str = "dataset",
                             constraint: bool = True, onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete,
                                    constraint=constraint, **kwargs)

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        # Rebuild the in-memory indexes (by name and by dataset type ID)
        # from the dataset_type table.
        byName = {}
        byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).fetchall():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(name, dimensions, row[c.storage_class],
                                      isCalibration=(calibTableName is not None))
            # The dynamic tag (and, for calibrations, calib) tables already
            # exist; look them up by the names stored in the dataset_type row.
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()))
            if calibTableName is not None:
                calibs = self._db.getExistingTable(row[c.calibration_association_table],
                                                   makeCalibTableSpec(datasetType, type(self._collections),
                                                                      self._db.getTimespanRepresentation(),
                                                                      self.getIdColumnType()))
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        # Swap in the new indexes and re-key the collection summaries by
        # dataset type ID.
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(f"Dataset type {name} cannot be removed."
                                      " It is associated with datasets that must be removed first.") from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
        if storage is not None and componentName is not None:
            # Components share the parent composite's records; hand back a
            # shallow copy of the composite's storage with the component
            # dataset type swapped in.
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage

    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError("Component dataset types cannot be stored in the registry."
                             f" Rejecting {datasetType.name}")
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (makeCalibTableName(datasetType, dimensionsKey)
                              if datasetType.isCalibration() else None)
            # Insert the dataset_type row if it is missing, comparing the
            # dimensions and storage class against any existing definition.
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            # Create the dynamic tag (and, if needed, calib) tables if they
            # do not already exist.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(datasetType, type(self._collections),
                                       self._db.getTimespanRepresentation(), self.getIdColumnType()),
                )
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(f"Given dataset type {datasetType} is inconsistent "
                                                 f"with database definition {storage.datasetType}.")
            inserted = False
        return storage, inserted

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = sqlalchemy.sql.select(
            [
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            ]
        ).select_from(
            self._static.dataset
        ).where(
            self._static.dataset.columns.id == id
        )
        row = self._db.query(sql).fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:
            # The dataset type was registered by another client since our
            # last refresh; reload the caches and try again.
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: Type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If `True` then the primary key column of the dataset table is
    auto-increment."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses an auto-increment integer for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_INT
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses a UUID for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID
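
# A minimal usage sketch, for illustration only (not executed here): ``db``,
# ``context``, ``collections``, and ``dimensions`` are assumed to be supplied
# by the surrounding Registry construction machinery, and ``flatType`` is a
# hypothetical non-component `DatasetType`.
#
#     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#         db, context, collections=collections, dimensions=dimensions)
#     storage, inserted = manager.register(flatType)
#     ref = manager.getDatasetRef(someDatasetId)  # None if the ID is unknown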