Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 91%

145 statements  

from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

from typing import (
    Any,
    Dict,
    Iterator,
    Optional,
    Tuple,
    Type,
    TYPE_CHECKING,
)

import copy
import sqlalchemy

from lsst.daf.butler import (
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    DimensionUniverse,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError
from lsst.daf.butler.registry.interfaces import (
    DatasetIdGenEnum,
    DatasetRecordStorage,
    DatasetRecordStorageManager,
    VersionTuple,
)

from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)
from .summaries import CollectionSummaryManager
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from ...summaries import CollectionSummary


if TYPE_CHECKING:  # coverage: condition never true at run time
    from lsst.daf.butler.registry.interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# These have to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table
    for each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least)
    to try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign
      key directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever
      forms of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but
    it is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """

    def __init__(
        self, *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static,
                   summaries=summaries)

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(cls, collections: Type[CollectionManager],
                             universe: DimensionUniverse) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do
        not depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this
            `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(collections, universe=universe,
                                    dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement)

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *, name: str = "dataset",
                             constraint: bool = True, onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete,
                                    constraint=constraint, **kwargs)

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName = {}
        byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).mappings():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(name, dimensions, row[c.storage_class],
                                      isCalibration=(calibTableName is not None))
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()))
            if tags is None:  # coverage: branch never taken
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(row[c.calibration_association_table],
                                                   makeCalibTableSpec(datasetType, type(self._collections),
                                                                      self._db.getTimespanRepresentation(),
                                                                      self.getIdColumnType()))
                if calibs is None:  # coverage: branch never taken
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(f"Dataset type {name} cannot be removed."
                                      " It is associated with datasets that must be removed first.") from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
        if storage is not None and componentName is not None:
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage

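    # Illustrative example (the name is hypothetical): find("calexp.wcs")
    # splits into the parent "calexp" and the component "wcs", then returns
    # a shallow copy of the parent's storage whose datasetType has been
    # replaced by the component dataset type.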

    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():  # coverage: branch never taken
            raise ValueError("Component dataset types cannot be stored in the registry."
                             f" Rejecting {datasetType.name}")
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (makeCalibTableName(datasetType, dimensionsKey)
                              if datasetType.isCalibration() else None)
            # The order is important here: we want to create the tables
            # first and only register them if that operation succeeds.  We
            # cannot wrap this in a transaction because the database class
            # assumes that DDL is not transaction-safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(datasetType, type(self._collections),
                                       self._db.getTimespanRepresentation(), self.getIdColumnType()),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(f"Given dataset type {datasetType} is inconsistent "
                                                 f"with database definition {storage.datasetType}.")
            inserted = False
        return storage, bool(inserted)

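    # Illustrative note: calling register() again with the same dataset
    # type returns the existing storage with inserted=False, while a
    # mismatched definition for an existing name raises
    # ConflictingDefinitionError.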

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = sqlalchemy.sql.select(
            self._static.dataset.columns.dataset_type_id,
            self._static.dataset.columns[self._collections.getRunForeignKeyName()],
        ).select_from(
            self._static.dataset
        ).where(
            self._static.dataset.columns.id == id
        )
        row = self._db.query(sql).mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:  # coverage: branch never taken
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: Type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True, the primary key column of the dataset table is
    auto-increment."""

    _idColumnType: type
    """Type of the dataset column used to store the dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that
    uses an auto-increment integer for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_INT
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that
    uses a UUID for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
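
As a quick illustration of the behavioral difference between the two
concrete managers, the following sketch (an illustration assuming
lsst.daf.butler is importable; no database connection is needed for these
class-level checks) queries which dataset-ID generation modes each manager
supports:

    from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum
    from lsst.daf.butler.registry.datasets.byDimensions._manager import (
        ByDimensionsDatasetRecordStorageManager,
        ByDimensionsDatasetRecordStorageManagerUUID,
    )

    # Print each mode alongside the integer- and UUID-keyed answers.
    for mode in DatasetIdGenEnum:
        print(
            mode.name,
            ByDimensionsDatasetRecordStorageManager.supportsIdGenerationMode(mode),
            ByDimensionsDatasetRecordStorageManagerUUID.supportsIdGenerationMode(mode),
        )

Per the source above, the integer-keyed manager reports support only for
UNIQUE, while the UUID-keyed manager reports support for every mode.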