Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 92%

149 statements — coverage.py v6.4.4, created at 2022-09-27 01:59 -0700

from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

import copy
from collections.abc import Iterator
from typing import TYPE_CHECKING, Any

import sqlalchemy

from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, OrphanedRecordError
from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table
    for each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least)
    to try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign
      key directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever
      forms of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but
    it is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> VersionTuple | None:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(
        cls, collections: type[CollectionManager], universe: DimensionUniverse
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do
        not depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this
            `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(
            collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
        )
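
    # A minimal usage sketch for ``makeStaticTableSpecs`` (names here are
    # hypothetical; this mirrors the call made in ``initialize`` above):
    #
    #     specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
    #     static = context.addTableTuple(specs)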

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).mappings():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name}).")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} cannot be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
        if storage is not None and componentName is not None:
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage
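
    # Example of the component handling above (dataset type names are
    # hypothetical): ``find("calexp.wcs")`` looks up the storage for the
    # composite "calexp" and, if found, returns a shallow copy whose
    # ``datasetType`` is the "wcs" component dataset type; ``find("calexp")``
    # returns the composite's storage directly.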

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError(
                f"Component dataset types cannot be stored in registry. Rejecting {datasetType.name}."
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here: we want to create the tables
            # first and only register them if that operation succeeds.
            # We cannot wrap it in a transaction because the database class
            # assumes that DDL is not transaction-safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)
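
    # A minimal usage sketch for ``register`` (hypothetical dataset type;
    # the storage class name must be one known to this registry):
    #
    #     datasetType = DatasetType("my_dataset", dimensions, "StructuredDataDict")
    #     storage, inserted = manager.register(datasetType)
    #
    # ``inserted`` is `True` only if this call created the dataset type;
    # registering an identical definition again returns the cached storage
    # with ``inserted`` set to `False`, and a conflicting definition raises
    # `ConflictingDefinitionError`.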

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        row = self._db.query(sql).mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )
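
    # Behavior note (summarizing the method above): ``getDatasetRef``
    # returns `None` for an unknown ``id``; on a cache miss for the dataset
    # type it calls ``refresh`` once, relying on the foreign key constraint
    # to guarantee the type exists after the reload.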

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> str | None:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If `True`, the primary key column of the dataset table is
    auto-increment."""

    _idColumnType: type
    """Type of the dataset column used to store the dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that
    uses an auto-incrementing integer for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_INT
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that
    uses a UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
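

# A minimal end-to-end usage sketch (hypothetical setup; in practice the
# Registry wires these managers together during schema construction):
#
#     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#         db, context, collections=collections, dimensions=dimensions
#     )
#     manager.refresh()
#     storage, inserted = manager.register(datasetType)
#     ref = manager.getDatasetRef(dataset_id)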