from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorageManager",)

from typing import (
    Any,
    Dict,
    Iterator,
    Optional,
    Tuple,
    TYPE_CHECKING,
)

import copy
import sqlalchemy

from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    ddl,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError
from lsst.daf.butler.registry.interfaces import (
    DatasetRecordStorage,
    DatasetRecordStorageManager,
    VersionTuple,
)

from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)
from .summaries import CollectionSummaryManager
from ._storage import ByDimensionsDatasetRecordStorage
from ...summaries import CollectionSummary


if TYPE_CHECKING:
    from lsst.daf.butler.registry.interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change.
_VERSION = VersionTuple(1, 0, 0)


class ByDimensionsDatasetRecordStorageManager(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of collections.
    """
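    # A minimal usage sketch (``db``, ``context``, ``collections``, and
    # ``dimensions`` are stand-ins for objects the registry normally
    # provides when it loads its managers):
    #
    #     manager = ByDimensionsDatasetRecordStorageManager.initialize(
    #         db, context, collections=collections, dimensions=dimensions)
    #     storage, inserted = manager.register(datasetType)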

    def __init__(
        self, *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: Dict[int, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = makeStaticTableSpecs(type(collections), universe=dimensions.universe)
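        # Declaring the static tables through the shared ``context`` lets
        # them be created together with the rest of the registry schema,
        # rather than table-by-table.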

        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *, name: str = "dataset",
                             constraint: bool = True, onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(tableSpec, name=name, onDelete=onDelete, constraint=constraint, **kwargs)

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName = {}
        byId = {}
        c = self._static.dataset_type.columns
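        # Each row of the static dataset_type table defines one dataset type
        # and records the names of the dynamic tag (and, for calibrations,
        # calib) tables that hold its dataset-collection associations;
        # rebuild both in-memory caches from those rows.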

        for row in self._db.query(self._static.dataset_type.select()).fetchall():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(name, dimensions, row[c.storage_class],
                                      isCalibration=(calibTableName is not None))
            tags = self._db.getExistingTable(row[c.tag_association_table],
                                             makeTagTableSpec(datasetType, type(self._collections)))
            if calibTableName is not None:
                calibs = self._db.getExistingTable(row[c.calibration_association_table],
                                                   makeCalibTableSpec(datasetType, type(self._collections),
                                                                      self._db.getTimespanRepresentation()))
            else:
                calibs = None
            storage = ByDimensionsDatasetRecordStorage(db=self._db, datasetType=datasetType,
                                                       static=self._static, summaries=self._summaries,
                                                       tags=tags, calibs=calibs,
                                                       dataset_type_id=row["id"],
                                                       collections=self._collections)
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(f"Dataset type {name} cannot be removed."
                                      " It is associated with datasets that must be removed first.") from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
        if storage is not None and componentName is not None:
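            # Component dataset types are not stored separately; synthesize
            # storage for one by shallow-copying the composite's storage and
            # swapping in the component dataset type.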

            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage

    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError("Component dataset types cannot be stored in the registry."
                             f" Rejecting {datasetType.name}")
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (makeCalibTableName(datasetType, dimensionsKey)
                              if datasetType.isCalibration() else None)
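            # ``Database.sync`` inserts the row if it is missing; if it
            # already exists, the ``compared`` fields must match the stored
            # definition, so conflicting re-registrations fail rather than
            # silently winning.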

            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections)),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(datasetType, type(self._collections),
                                       self._db.getTimespanRepresentation()),
                )
            else:
                calibs = None
            storage = ByDimensionsDatasetRecordStorage(db=self._db, datasetType=datasetType,
                                                       static=self._static, summaries=self._summaries,
                                                       tags=tags, calibs=calibs,
                                                       dataset_type_id=row["id"],
                                                       collections=self._collections)
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(f"Given dataset type {datasetType} is inconsistent "
                                                 f"with database definition {storage.datasetType}.")
            inserted = False
        return storage, inserted

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: int) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorageManager.
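        # Fetch only the dataset type ID and run key from the static dataset
        # table; the data ID and dataset type themselves come from the
        # per-dataset-type storage object once we know which one to ask.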

        sql = sqlalchemy.sql.select(
            [
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            ]
        ).select_from(
            self._static.dataset
        ).where(
            self._static.dataset.columns.id == id
        )
        row = self._db.query(sql).fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
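        # A cache miss here means another client registered this dataset
        # type after our last refresh; reload the caches and retry before
        # relying on the foreign-key guarantee below.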

        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return _VERSION

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)