Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 92%

151 statements  

coverage.py v7.2.5, created at 2023-05-02 18:18 -0700

1from __future__ import annotations 

2 

3__all__ = ( 

4 "ByDimensionsDatasetRecordStorageManager", 

5 "ByDimensionsDatasetRecordStorageManagerUUID", 

6) 

7 

8import copy 

9from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type 

10 

11import sqlalchemy 

12from lsst.daf.butler import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl 

13from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError 

14from lsst.daf.butler.registry.interfaces import ( 

15 DatasetIdGenEnum, 

16 DatasetRecordStorage, 

17 DatasetRecordStorageManager, 

18 VersionTuple, 

19) 

20 

21from ...summaries import CollectionSummary 

22from ._storage import ( 

23 ByDimensionsDatasetRecordStorage, 

24 ByDimensionsDatasetRecordStorageInt, 

25 ByDimensionsDatasetRecordStorageUUID, 

26) 

27from .summaries import CollectionSummaryManager 

28from .tables import ( 

29 addDatasetForeignKey, 

30 makeCalibTableName, 

31 makeCalibTableSpec, 

32 makeStaticTableSpecs, 

33 makeTagTableName, 

34 makeTagTableSpec, 

35) 

36 

37if TYPE_CHECKING:    37 ↛ 38   line 37 didn't jump to line 38, because the condition on line 37 was never true

38 from lsst.daf.butler.registry.interfaces import ( 

39 CollectionManager, 

40 CollectionRecord, 

41 Database, 

42 DimensionRecordStorageManager, 

43 StaticTablesContext, 

44 ) 

45 

46 from .tables import StaticDatasetTablesTuple 

47 

48 

49# This has to be updated on every schema change 

50_VERSION_INT = VersionTuple(1, 0, 0) 

51_VERSION_UUID = VersionTuple(1, 0, 0) 

52 

53 

54class MissingDatabaseTableError(RuntimeError): 

55 """Exception raised when a table is not found in a database.""" 

56 

57 

58class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

59 """A manager class for datasets that uses one dataset-collection table for 

60 each group of dataset types that share the same dimensions. 

61 

62 In addition to the table organization, this class makes a number of 

63 other design choices that would have been cumbersome (to say the least) to 

64 try to pack into its name: 

65 

66 - It uses a private surrogate integer autoincrement field to identify 

67 dataset types, instead of using the name as the primary and foreign key 

68 directly. 

69 

70 - It aggressively loads all DatasetTypes into memory instead of fetching 

71 them from the database only when needed or attempting more clever forms 

72 of caching. 

73 

74 Alternative implementations that make different choices for these while 

75 keeping the same general table organization might be reasonable as well. 

76 

77 This class provides a complete implementation of the manager logic, but it is 

78 parametrized by a few class attributes that must be defined by 

79 subclasses. 

80 

81 Parameters 

82 ---------- 

83 db : `Database` 

84 Interface to the underlying database engine and namespace. 

85 collections : `CollectionManager` 

86 Manager object for the collections in this `Registry`. 

87 dimensions : `DimensionRecordStorageManager` 

88 Manager object for the dimensions in this `Registry`. 

89 static : `StaticDatasetTablesTuple` 

90 Named tuple of `sqlalchemy.schema.Table` instances for all static 

91 tables used by this class. 

92 summaries : `CollectionSummaryManager` 

93 Structure containing tables that summarize the contents of collections. 

94 """ 

95 

96 def __init__( 

97 self, 

98 *, 

99 db: Database, 

100 collections: CollectionManager, 

101 dimensions: DimensionRecordStorageManager, 

102 static: StaticDatasetTablesTuple, 

103 summaries: CollectionSummaryManager, 

104 ): 

105 self._db = db 

106 self._collections = collections 

107 self._dimensions = dimensions 

108 self._static = static 

109 self._summaries = summaries 

110 self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {} 

111 self._byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {} 

112 

113 @classmethod 

114 def initialize( 

115 cls, 

116 db: Database, 

117 context: StaticTablesContext, 

118 *, 

119 collections: CollectionManager, 

120 dimensions: DimensionRecordStorageManager, 

121 ) -> DatasetRecordStorageManager: 

122 # Docstring inherited from DatasetRecordStorageManager. 

123 specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe) 

124 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

125 summaries = CollectionSummaryManager.initialize( 

126 db, 

127 context, 

128 collections=collections, 

129 dimensions=dimensions, 

130 ) 

131 return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries) 

132 

133 @classmethod 

134 def currentVersion(cls) -> Optional[VersionTuple]: 

135 # Docstring inherited from VersionedExtension. 

136 return cls._version 

137 

138 @classmethod 

139 def makeStaticTableSpecs( 

140 cls, collections: Type[CollectionManager], universe: DimensionUniverse 

141 ) -> StaticDatasetTablesTuple: 

142 """Construct all static tables used by the classes in this package. 

143 

144 Static tables are those that are present in all Registries and do not 

145 depend on what DatasetTypes have been registered. 

146 

147 Parameters 

148 ---------- 

149 collections : `CollectionManager` 

150 Manager object for the collections in this `Registry`. 

151 universe : `DimensionUniverse` 

152 Universe graph containing all dimensions known to this `Registry`. 

153 

154 Returns 

155 ------- 

156 specs : `StaticDatasetTablesTuple` 

157 A named tuple containing `ddl.TableSpec` instances. 

158 """ 

159 return makeStaticTableSpecs( 

160 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement 

161 ) 

162 

163 @classmethod 

164 def getIdColumnType(cls) -> type: 

165 # Docstring inherited from base class. 

166 return cls._idColumnType 

167 

168 @classmethod 

169 def addDatasetForeignKey( 

170 cls, 

171 tableSpec: ddl.TableSpec, 

172 *, 

173 name: str = "dataset", 

174 constraint: bool = True, 

175 onDelete: Optional[str] = None, 

176 **kwargs: Any, 

177 ) -> ddl.FieldSpec: 

178 # Docstring inherited from DatasetRecordStorageManager. 

179 return addDatasetForeignKey( 

180 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

181 ) 

182 

183 def refresh(self) -> None: 

184 # Docstring inherited from DatasetRecordStorageManager. 

185 byName = {} 

186 byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {} 

187 c = self._static.dataset_type.columns 

188 with self._db.query(self._static.dataset_type.select()) as sql_result: 

189 sql_rows = sql_result.mappings().fetchall() 

190 for row in sql_rows: 

191 name = row[c.name] 

192 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) 

193 calibTableName = row[c.calibration_association_table] 

194 datasetType = DatasetType( 

195 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) 

196 ) 

197 tags = self._db.getExistingTable( 

198 row[c.tag_association_table], 

199 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

200 ) 

201 if tags is None:    201 ↛ 202   line 201 didn't jump to line 202, because the condition on line 201 was never true

202 raise MissingDatabaseTableError( 

203 f"Table {row[c.tag_association_table]} is missing from database schema." 

204 ) 

205 if calibTableName is not None: 

206 calibs = self._db.getExistingTable( 

207 row[c.calibration_association_table], 

208 makeCalibTableSpec( 

209 datasetType, 

210 type(self._collections), 

211 self._db.getTimespanRepresentation(), 

212 self.getIdColumnType(), 

213 ), 

214 ) 

215 if calibs is None:    215 ↛ 216   line 215 didn't jump to line 216, because the condition on line 215 was never true

216 raise MissingDatabaseTableError( 

217 f"Table {row[c.calibration_association_table]} is missing from database schema." 

218 ) 

219 else: 

220 calibs = None 

221 storage = self._recordStorageType( 

222 db=self._db, 

223 datasetType=datasetType, 

224 static=self._static, 

225 summaries=self._summaries, 

226 tags=tags, 

227 calibs=calibs, 

228 dataset_type_id=row["id"], 

229 collections=self._collections, 

230 ) 

231 byName[datasetType.name] = storage 

232 byId[storage._dataset_type_id] = storage 

233 self._byName = byName 

234 self._byId = byId 

235 self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType) 

236 

237 def remove(self, name: str) -> None: 

238 # Docstring inherited from DatasetRecordStorageManager. 

239 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

240 if componentName is not None: 

241 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

242 

243 # Delete the row 

244 try: 

245 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

246 except sqlalchemy.exc.IntegrityError as e: 

247 raise OrphanedRecordError( 

248 f"Dataset type {name} can not be removed." 

249 " It is associated with datasets that must be removed first." 

250 ) from e 

251 

252 # Now refresh everything -- removal is rare enough that this does 

253 # not need to be fast. 

254 self.refresh() 

255 

256 def find(self, name: str) -> Optional[DatasetRecordStorage]: 

257 # Docstring inherited from DatasetRecordStorageManager. 

258 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

259 storage = self._byName.get(compositeName) 

260 if storage is not None and componentName is not None: 

261 componentStorage = copy.copy(storage) 

262 componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName) 

263 return componentStorage 

264 else: 

265 return storage 

266 

267 def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]: 

268 # Docstring inherited from DatasetRecordStorageManager. 

269 if datasetType.isComponent():    269 ↛ 270   line 269 didn't jump to line 270, because the condition on line 269 was never true

270 raise ValueError( 

271 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

272 ) 

273 storage = self._byName.get(datasetType.name) 

274 if storage is None: 

275 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) 

276 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

277 calibTableName = ( 

278 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

279 ) 

280 # The order is important here, we want to create tables first and 

281 # only register them if this operation is successful. We cannot 

282 wrap it in a transaction because the database class assumes that 

283 # DDL is not transaction safe in general. 

284 tags = self._db.ensureTableExists( 

285 tagTableName, 

286 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

287 ) 

288 if calibTableName is not None: 

289 calibs = self._db.ensureTableExists( 

290 calibTableName, 

291 makeCalibTableSpec( 

292 datasetType, 

293 type(self._collections), 

294 self._db.getTimespanRepresentation(), 

295 self.getIdColumnType(), 

296 ), 

297 ) 

298 else: 

299 calibs = None 

300 row, inserted = self._db.sync( 

301 self._static.dataset_type, 

302 keys={"name": datasetType.name}, 

303 compared={ 

304 "dimensions_key": dimensionsKey, 

305 # Force the storage class to be loaded to ensure it 

306 # exists and there is no typo in the name. 

307 "storage_class": datasetType.storageClass.name, 

308 }, 

309 extra={ 

310 "tag_association_table": tagTableName, 

311 "calibration_association_table": calibTableName, 

312 }, 

313 returning=["id", "tag_association_table"], 

314 ) 

315 assert row is not None 

316 storage = self._recordStorageType( 

317 db=self._db, 

318 datasetType=datasetType, 

319 static=self._static, 

320 summaries=self._summaries, 

321 tags=tags, 

322 calibs=calibs, 

323 dataset_type_id=row["id"], 

324 collections=self._collections, 

325 ) 

326 self._byName[datasetType.name] = storage 

327 self._byId[storage._dataset_type_id] = storage 

328 else: 

329 if datasetType != storage.datasetType: 

330 raise ConflictingDefinitionError( 

331 f"Given dataset type {datasetType} is inconsistent " 

332 f"with database definition {storage.datasetType}." 

333 ) 

334 inserted = False 

335 return storage, bool(inserted) 

336 

337 def __iter__(self) -> Iterator[DatasetType]: 

338 for storage in self._byName.values(): 

339 yield storage.datasetType 

340 

341 def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]: 

342 # Docstring inherited from DatasetRecordStorageManager. 

343 sql = ( 

344 sqlalchemy.sql.select( 

345 self._static.dataset.columns.dataset_type_id, 

346 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

347 ) 

348 .select_from(self._static.dataset) 

349 .where(self._static.dataset.columns.id == id) 

350 ) 

351 with self._db.query(sql) as sql_result: 

352 row = sql_result.mappings().fetchone() 

353 if row is None: 

354 return None 

355 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

356 if recordsForType is None:    356 ↛ 357   line 356 didn't jump to line 357, because the condition on line 356 was never true

357 self.refresh() 

358 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

359 assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 

360 return DatasetRef( 

361 recordsForType.datasetType, 

362 dataId=recordsForType.getDataId(id=id), 

363 id=id, 

364 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

365 ) 

366 

367 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

368 # Docstring inherited from DatasetRecordStorageManager. 

369 return self._summaries.get(collection) 

370 

371 def schemaDigest(self) -> Optional[str]: 

372 # Docstring inherited from VersionedExtension. 

373 return self._defaultSchemaDigest(self._static, self._db.dialect) 

374 

375 _version: VersionTuple 

376 """Schema version for this class.""" 

377 

378 _recordStorageType: Type[ByDimensionsDatasetRecordStorage] 

379 """Type of the storage class returned by this manager.""" 

380 

381 _autoincrement: bool 

382 """If True then PK column of the dataset table is auto-increment.""" 

383 

384 _idColumnType: type 

385 """Type of dataset column used to store dataset ID.""" 

386 

387 
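The two concrete managers that follow differ only in the four class attributes declared at the end of the base class, plus `supportsIdGenerationMode`. As a rough illustration of that pattern (a hypothetical class, not part of this module), a new dataset-ID backend would be declared the same way:

class ByDimensionsDatasetRecordStorageManagerExample(ByDimensionsDatasetRecordStorageManagerBase):
    """Hypothetical subclass shown only to illustrate the parametrization
    pattern; the real implementations are the two classes below."""

    # Schema version advertised for this backend.
    _version: VersionTuple = VersionTuple(1, 0, 0)
    # Per-dataset-type storage class instantiated by refresh()/register().
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    # Whether the primary key column of the dataset table is auto-incrementing.
    _autoincrement: bool = False
    # Column type used to store dataset IDs.
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Which DatasetIdGenEnum modes this backend accepts.
        return True
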

388class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase): 

389 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

390 an auto-incrementing integer for the dataset primary key. 

391 """ 

392 

393 _version: VersionTuple = _VERSION_INT 

394 _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt 

395 _autoincrement: bool = True 

396 _idColumnType: type = sqlalchemy.BigInteger 

397 

398 @classmethod 

399 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

400 # Docstring inherited from DatasetRecordStorageManager. 

401 # MyPy seems confused about enum value types here. 

402 return mode is mode.UNIQUE # type: ignore 

403 

404 

405class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

406 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

407 a UUID for the dataset primary key. 

408 """ 

409 

410 _version: VersionTuple = _VERSION_UUID 

411 _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

412 _autoincrement: bool = False 

413 _idColumnType: type = ddl.GUID 

414 

415 @classmethod 

416 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

417 # Docstring inherited from DatasetRecordStorageManager. 

418 return True
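
For orientation, the sketch below shows roughly how a `Registry` implementation might drive one of these managers. It is illustrative only: `db`, `context`, `collections`, `dimensions`, `pvi_type`, and `some_dataset_id` are hypothetical names for objects assumed to be constructed elsewhere, and the import path assumes the package re-exports the manager named in `__all__` above.

from lsst.daf.butler.registry.datasets.byDimensions import (
    ByDimensionsDatasetRecordStorageManagerUUID,
)

# Assumed to exist already (hypothetical names):
#   db          -- a Database instance
#   context     -- the StaticTablesContext used while declaring the schema
#   collections -- a CollectionManager
#   dimensions  -- a DimensionRecordStorageManager
#   pvi_type    -- a DatasetType instance defined by the caller

manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
    db, context, collections=collections, dimensions=dimensions
)

# Create (or look up) the per-dataset-type storage; `inserted` reports
# whether the dataset type was new to this registry.
storage, inserted = manager.register(pvi_type)

# Later lookups go through the in-memory caches populated by refresh().
same_storage = manager.find(pvi_type.name)

# Resolve a dataset ID back to a DatasetRef; returns None if unknown.
ref = manager.getDatasetRef(some_dataset_id)

As the two `supportsIdGenerationMode` implementations above show, the UUID-based manager accepts every `DatasetIdGenEnum` mode, while the integer-based manager only supports `UNIQUE`.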