from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

from typing import (
    Any,
    Dict,
    Iterator,
    Optional,
    Tuple,
    Type,
    TYPE_CHECKING,
)

import copy
import sqlalchemy

from lsst.daf.butler import (
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    DimensionUniverse,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError
from lsst.daf.butler.registry.interfaces import (
    DatasetIdGenEnum,
    DatasetRecordStorage,
    DatasetRecordStorageManager,
    VersionTuple,
)

from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)
from .summaries import CollectionSummaryManager
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from ...summaries import CollectionSummary


if TYPE_CHECKING:
    from lsst.daf.butler.registry.interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple

# These have to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)
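# The integer-keyed and UUID-keyed schemas are versioned independently; each
# concrete manager subclass below pins one of these constants as its _version.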

class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """

    def __init__(
        self, *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static,
                   summaries=summaries)

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(cls, collections: Type[CollectionManager],
                             universe: DimensionUniverse) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """

        return makeStaticTableSpecs(collections, universe=universe,
                                    dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement)

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *, name: str = "dataset",
                             constraint: bool = True, onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete,
                                    constraint=constraint, **kwargs)

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName = {}
        byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
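        # Rebuild both caches from scratch: the static dataset_type table has
        # one row per registered dataset type, each naming the dynamically
        # created tag table (and calibration table, if any) that holds its
        # dataset-collection associations.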

        for row in self._db.query(self._static.dataset_type.select()).fetchall():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(name, dimensions, row[c.storage_class],
                                      isCalibration=(calibTableName is not None))
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()))
            if calibTableName is not None:
                calibs = self._db.getExistingTable(row[c.calibration_association_table],
                                                   makeCalibTableSpec(datasetType, type(self._collections),
                                                                      self._db.getTimespanRepresentation(),
                                                                      self.getIdColumnType()))
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(f"Dataset type {name} can not be removed."
                                      " It is associated with datasets that must be removed first.") from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
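        # A component dataset type (e.g. "raw.wcs") has no storage of its own;
        # it is served by the parent composite's storage object with the
        # component's DatasetType swapped in via a shallow copy.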

        if storage is not None and componentName is not None:
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage

    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError("Component dataset types can not be stored in registry."
                             f" Rejecting {datasetType.name}")
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (makeCalibTableName(datasetType, dimensionsKey)
                              if datasetType.isCalibration() else None)
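            # Database.sync behaves like an upsert: `keys` identify the row,
            # `compared` values must be consistent with any existing row,
            # `extra` values are only written when a new row is inserted, and
            # the second return value reports whether an insert happened.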

            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(datasetType, type(self._collections),
                                       self._db.getTimespanRepresentation(), self.getIdColumnType()),
                )
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(f"Given dataset type {datasetType} is inconsistent "
                                                 f"with database definition {storage.datasetType}.")
            inserted = False
        return storage, bool(inserted)

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = sqlalchemy.sql.select(
            [
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            ]
        ).select_from(
            self._static.dataset
        ).where(
            self._static.dataset.columns.id == id
        )
        row = self._db.query(sql).fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
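        # A cache miss means another client registered this dataset type
        # after our last refresh; reload the caches and look again.  The
        # foreign key constraint guarantees the row exists after a refresh.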

        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: Type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If `True`, the primary-key column of the dataset table is
    auto-incremented."""

    _idColumnType: type
    """Type of the dataset column used to store the dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses an auto-incremented integer for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_INT
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses a UUID for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True