from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

from typing import (
    Any,
    Dict,
    Iterator,
    Optional,
    Tuple,
    Type,
    TYPE_CHECKING,
)

import copy
import sqlalchemy

from lsst.daf.butler import (
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    DimensionUniverse,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError
from lsst.daf.butler.registry.interfaces import (
    DatasetIdGenEnum,
    DatasetRecordStorage,
    DatasetRecordStorageManager,
    VersionTuple,
)

from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)
from .summaries import CollectionSummaryManager
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from ...summaries import CollectionSummary


if TYPE_CHECKING:
    from lsst.daf.butler.registry.interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but
    it is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """
    def __init__(
        self, *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
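        # In-memory caches of the per-dataset-type storage objects, keyed
        # both by dataset type name and by the surrogate dataset type ID;
        # refresh() rebuilds both from the database.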
        self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
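        # Collection-summary tables are defined and maintained by a helper
        # manager of their own, created alongside the static dataset tables.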
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static,
                   summaries=summaries)

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(cls, collections: Type[CollectionManager],
                             universe: DimensionUniverse) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(collections, universe=universe,
                                    dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement)

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *, name: str = "dataset",
                             constraint: bool = True, onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete,
                                    constraint=constraint, **kwargs)

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
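        # Rebuild both caches from scratch: each row in the static
        # dataset_type table becomes one per-type storage object, with its
        # dynamic tag (and optional calibration) table looked up by the
        # name recorded in that row.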
        byName = {}
        byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).mappings():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(name, dimensions, row[c.storage_class],
                                      isCalibration=(calibTableName is not None))
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()))
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(datasetType, type(self._collections),
                                       self._db.getTimespanRepresentation(),
                                       self.getIdColumnType()))
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(f"Dataset type {name} cannot be removed."
                                      " It is associated with datasets that must be removed first.") from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
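        # Component dataset types are not stored separately; serve them
        # via a shallow copy of the parent's storage with the dataset type
        # swapped for the component's.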
        if storage is not None and componentName is not None:
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage

    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError("Component dataset types cannot be stored in the registry."
                             f" Rejecting {datasetType.name}")
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (makeCalibTableName(datasetType, dimensionsKey)
                              if datasetType.isCalibration() else None)
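            # Database.sync inserts the row if it does not exist; if it
            # does, the `compared` fields are checked against the stored
            # definition, and `inserted` reports which case occurred.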
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
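            # The tag (and optional calibration) tables are dynamic:
            # created on first registration of a dataset type with these
            # dimensions and reused for later ones.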
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(datasetType, type(self._collections),
                                       self._db.getTimespanRepresentation(), self.getIdColumnType()),
                )
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(f"Given dataset type {datasetType} is inconsistent "
                                                 f"with database definition {storage.datasetType}.")
            inserted = False
        return storage, bool(inserted)

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorageManager.
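        # Look up the dataset type ID and run collection from the static
        # dataset table first; the per-type storage object then expands
        # the data ID.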
        sql = sqlalchemy.sql.select(
            self._static.dataset.columns.dataset_type_id,
            self._static.dataset.columns[self._collections.getRunForeignKeyName()],
        ).select_from(
            self._static.dataset
        ).where(
            self._static.dataset.columns.id == id
        )
        row = self._db.query(sql).mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
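        # A cache miss here means some other client registered a new
        # dataset type since our last refresh; reload the caches and
        # try again.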
        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: Type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If `True`, the primary key column of the dataset table is
    auto-increment."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that
    uses an auto-increment integer for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_INT
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that
    uses a UUID for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
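

# Illustrative usage sketch (hypothetical `db`, `context`, `collections`,
# and `dimensions` objects; a Registry normally constructs these managers
# itself as part of schema setup):
#
#     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#         db, context, collections=collections, dimensions=dimensions)
#     storage, inserted = manager.register(datasetType)
#     ref = manager.getDatasetRef(datasetId)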