Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorageManager",) 

4 

5from typing import ( 

6 Any, 

7 Dict, 

8 Iterator, 

9 Optional, 

10 Tuple, 

11 TYPE_CHECKING, 

12) 

13 

14import copy 

15import sqlalchemy 

16 

17from lsst.daf.butler import ( 

18 DatasetRef, 

19 DatasetType, 

20 ddl, 

21) 

22from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError 

23from lsst.daf.butler.registry.interfaces import ( 

24 DatasetRecordStorage, 

25 DatasetRecordStorageManager, 

26 VersionTuple 

27) 

28 

29from .tables import ( 

30 addDatasetForeignKey, 

31 CollectionSummaryTables, 

32 makeCalibTableName, 

33 makeCalibTableSpec, 

34 makeStaticTableSpecs, 

35 makeTagTableName, 

36 makeTagTableSpec, 

37) 

38from ._storage import ByDimensionsDatasetRecordStorage 

39 

if TYPE_CHECKING:
    # These names are needed only for type annotations; importing them under
    # TYPE_CHECKING avoids paying for (or cycling on) the imports at runtime.
    from lsst.daf.butler.registry.interfaces import (
        CollectionManager,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple

48 

49 

# Schema version reported by currentVersion().
# This has to be updated on every schema change.
_VERSION = VersionTuple(1, 0, 0)

52 

53 

class ByDimensionsDatasetRecordStorageManager(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryTables`
        Structure containing tables that summarize the contents of collections.
    """
    def __init__(
        self, *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryTables,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        # In-memory caches of per-dataset-type storage objects, keyed by
        # dataset type name and by its surrogate integer ID respectively.
        # Populated by refresh() and register().
        self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: Dict[int, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        # Declare the static (per-repository) dataset tables and the
        # collection-summary tables, then build the manager around them.
        specs = makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryTables.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *, name: str = "dataset",
                             constraint: bool = True, onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        # Thin delegation to the module-level helper in .tables.
        return addDatasetForeignKey(tableSpec, name=name, onDelete=onDelete, constraint=constraint, **kwargs)

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        # Rebuild both caches from scratch and swap them in only at the end,
        # so a failure partway through leaves the previous state intact.
        byName = {}
        byId = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).fetchall():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            # A non-NULL calibration table name is what marks this dataset
            # type as a calibration dataset type.
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(name, dimensions, row[c.storage_class],
                                      isCalibration=(calibTableName is not None))
            tags = self._db.getExistingTable(row[c.tag_association_table],
                                             makeTagTableSpec(datasetType, type(self._collections)))
            if calibTableName is not None:
                calibs = self._db.getExistingTable(row[c.calibration_association_table],
                                                   makeCalibTableSpec(datasetType, type(self._collections),
                                                                      self._db.getTimespanRepresentation()))
            else:
                calibs = None
            storage = ByDimensionsDatasetRecordStorage(db=self._db, datasetType=datasetType,
                                                       static=self._static, summaries=self._summaries,
                                                       tags=tags, calibs=calibs,
                                                       dataset_type_id=row["id"],
                                                       collections=self._collections)
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        # Only parent (composite) dataset types may be removed; a component
        # name like "type.component" is rejected up front.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            # A foreign-key violation here means datasets of this type still
            # exist; surface that as the registry's OrphanedRecordError.
            raise OrphanedRecordError(f"Dataset type {name} can not be removed."
                                      " It is associated with datasets that must be removed first.") from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
        if storage is not None and componentName is not None:
            # Component dataset types are not stored separately: hand back a
            # shallow copy of the parent's storage whose datasetType has been
            # replaced by the component's.
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage

    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError("Component dataset types can not be stored in registry."
                             f" Rejecting {datasetType.name}")
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            # Only calibration dataset types get the extra calibration
            # association table.
            calibTableName = (makeCalibTableName(datasetType, dimensionsKey)
                              if datasetType.isCalibration() else None)
            # sync() reports via `inserted` whether a new dataset_type row was
            # created or an existing one with the same name was found.
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            # Create the dynamic per-dimensions tables if they don't exist.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections)),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(datasetType, type(self._collections),
                                       self._db.getTimespanRepresentation()),
                )
            else:
                calibs = None
            storage = ByDimensionsDatasetRecordStorage(db=self._db, datasetType=datasetType,
                                                       static=self._static, summaries=self._summaries,
                                                       tags=tags, calibs=calibs,
                                                       dataset_type_id=row["id"],
                                                       collections=self._collections)
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            # Already cached: the given definition must match the stored one.
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(f"Given dataset type {datasetType} is inconsistent "
                                                 f"with database definition {storage.datasetType}.")
            inserted = False
        return storage, inserted

    def __iter__(self) -> Iterator[DatasetType]:
        # Iterate over all dataset types currently in the in-memory cache.
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: int) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorageManager.
        # Look up the dataset row to find its dataset type and run collection.
        sql = sqlalchemy.sql.select(
            [
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            ]
        ).select_from(
            self._static.dataset
        ).where(
            self._static.dataset.columns.id == id
        )
        row = self._db.query(sql).fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:
            # Cache miss: presumably another client registered the dataset
            # type since we last looked; refresh and retry once.
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name
        )

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return _VERSION

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        # Digest covers only the static tables; dynamic tag/calib tables are
        # excluded.
        return self._defaultSchemaDigest(self._static, self._db.dialect)