Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 92%

148 statements  

coverage.py v6.5.0, created at 2022-11-06 12:40 -0800

1from __future__ import annotations 

2 

3__all__ = ( 

4 "ByDimensionsDatasetRecordStorageManager", 

5 "ByDimensionsDatasetRecordStorageManagerUUID", 

6) 

7 

8import copy 

9from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type 

10 

11import sqlalchemy 

12from lsst.daf.butler import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl 

13from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError 

14from lsst.daf.butler.registry.interfaces import ( 

15 DatasetIdGenEnum, 

16 DatasetRecordStorage, 

17 DatasetRecordStorageManager, 

18 VersionTuple, 

19) 

20 

21from ...summaries import CollectionSummary 

22from ._storage import ( 

23 ByDimensionsDatasetRecordStorage, 

24 ByDimensionsDatasetRecordStorageInt, 

25 ByDimensionsDatasetRecordStorageUUID, 

26) 

27from .summaries import CollectionSummaryManager 

28from .tables import ( 

29 addDatasetForeignKey, 

30 makeCalibTableName, 

31 makeCalibTableSpec, 

32 makeStaticTableSpecs, 

33 makeTagTableName, 

34 makeTagTableSpec, 

35) 

36 

37if TYPE_CHECKING:  # coverage: 37 ↛ 38 (line 37 didn't jump to line 38, because the condition on line 37 was never true)

38 from lsst.daf.butler.registry.interfaces import ( 

39 CollectionManager, 

40 CollectionRecord, 

41 Database, 

42 DimensionRecordStorageManager, 

43 StaticTablesContext, 

44 ) 

45 

46 from .tables import StaticDatasetTablesTuple 

47 

48 

49# This has to be updated on every schema change 

50_VERSION_INT = VersionTuple(1, 0, 0) 

51_VERSION_UUID = VersionTuple(1, 0, 0) 

52 

53 

54class MissingDatabaseTableError(RuntimeError): 

55 """Exception raised when a table is not found in a database.""" 

56 

57 

58class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

59 """A manager class for datasets that uses one dataset-collection table for 

60 each group of dataset types that share the same dimensions. 

61 

62 In addition to the table organization, this class makes a number of 

63 other design choices that would have been cumbersome (to say the least) to 

64 try to pack into its name: 

65 

66 - It uses a private surrogate integer autoincrement field to identify 

67 dataset types, instead of using the name as the primary and foreign key 

68 directly. 

69 

70 - It aggressively loads all DatasetTypes into memory instead of fetching 

71 them from the database only when needed or attempting more clever forms 

72 of caching. 

73 

74 Alternative implementations that make different choices for these while 

75 keeping the same general table organization might be reasonable as well. 

76 

77 This class provides a complete implementation of the manager logic, but it

78 is parametrized by a few class attributes that have to be defined by

79 subclasses.

80 

81 Parameters 

82 ---------- 

83 db : `Database` 

84 Interface to the underlying database engine and namespace. 

85 collections : `CollectionManager` 

86 Manager object for the collections in this `Registry`. 

87 dimensions : `DimensionRecordStorageManager` 

88 Manager object for the dimensions in this `Registry`. 

89 static : `StaticDatasetTablesTuple` 

90 Named tuple of `sqlalchemy.schema.Table` instances for all static 

91 tables used by this class. 

92 summaries : `CollectionSummaryManager` 

93 Structure containing tables that summarize the contents of collections. 

94 """ 

95 

96 def __init__( 

97 self, 

98 *, 

99 db: Database, 

100 collections: CollectionManager, 

101 dimensions: DimensionRecordStorageManager, 

102 static: StaticDatasetTablesTuple, 

103 summaries: CollectionSummaryManager, 

104 ): 

105 self._db = db 

106 self._collections = collections 

107 self._dimensions = dimensions 

108 self._static = static 

109 self._summaries = summaries 

110 self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {} 

111 self._byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {} 

112 

113 @classmethod 

114 def initialize( 

115 cls, 

116 db: Database, 

117 context: StaticTablesContext, 

118 *, 

119 collections: CollectionManager, 

120 dimensions: DimensionRecordStorageManager, 

121 ) -> DatasetRecordStorageManager: 

122 # Docstring inherited from DatasetRecordStorageManager. 

123 specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe) 

124 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

125 summaries = CollectionSummaryManager.initialize( 

126 db, 

127 context, 

128 collections=collections, 

129 dimensions=dimensions, 

130 ) 

131 return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries) 

132 

133 @classmethod 

134 def currentVersion(cls) -> Optional[VersionTuple]: 

135 # Docstring inherited from VersionedExtension. 

136 return cls._version 

137 

138 @classmethod 

139 def makeStaticTableSpecs( 

140 cls, collections: Type[CollectionManager], universe: DimensionUniverse 

141 ) -> StaticDatasetTablesTuple: 

142 """Construct all static tables used by the classes in this package. 

143 

144 Static tables are those that are present in all Registries and do not 

145 depend on what DatasetTypes have been registered. 

146 

147 Parameters 

148 ---------- 

149 collections : `type` [`CollectionManager`]

150 Type of the manager used for collections in this `Registry`.

151 universe : `DimensionUniverse` 

152 Universe graph containing all dimensions known to this `Registry`. 

153 

154 Returns 

155 ------- 

156 specs : `StaticDatasetTablesTuple` 

157 A named tuple containing `ddl.TableSpec` instances. 

158 """ 

159 return makeStaticTableSpecs( 

160 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement 

161 ) 

162 

163 @classmethod 

164 def getIdColumnType(cls) -> type: 

165 # Docstring inherited from base class. 

166 return cls._idColumnType 

167 

168 @classmethod 

169 def addDatasetForeignKey( 

170 cls, 

171 tableSpec: ddl.TableSpec, 

172 *, 

173 name: str = "dataset", 

174 constraint: bool = True, 

175 onDelete: Optional[str] = None, 

176 **kwargs: Any, 

177 ) -> ddl.FieldSpec: 

178 # Docstring inherited from DatasetRecordStorageManager. 

179 return addDatasetForeignKey( 

180 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

181 ) 

182 

183 def refresh(self) -> None: 

184 # Docstring inherited from DatasetRecordStorageManager. 

185 byName = {} 

186 byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {} 

187 c = self._static.dataset_type.columns 

188 for row in self._db.query(self._static.dataset_type.select()).mappings(): 

189 name = row[c.name] 

190 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) 

191 calibTableName = row[c.calibration_association_table] 

192 datasetType = DatasetType( 

193 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) 

194 ) 

195 tags = self._db.getExistingTable( 

196 row[c.tag_association_table], 

197 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

198 ) 

199 if tags is None:  # coverage: 199 ↛ 200 (line 199 didn't jump to line 200, because the condition on line 199 was never true)

200 raise MissingDatabaseTableError( 

201 f"Table {row[c.tag_association_table]} is missing from database schema." 

202 ) 

203 if calibTableName is not None: 

204 calibs = self._db.getExistingTable( 

205 row[c.calibration_association_table], 

206 makeCalibTableSpec( 

207 datasetType, 

208 type(self._collections), 

209 self._db.getTimespanRepresentation(), 

210 self.getIdColumnType(), 

211 ), 

212 ) 

213 if calibs is None:  # coverage: 213 ↛ 214 (line 213 didn't jump to line 214, because the condition on line 213 was never true)

214 raise MissingDatabaseTableError( 

215 f"Table {row[c.calibration_association_table]} is missing from database schema." 

216 ) 

217 else: 

218 calibs = None 

219 storage = self._recordStorageType( 

220 db=self._db, 

221 datasetType=datasetType, 

222 static=self._static, 

223 summaries=self._summaries, 

224 tags=tags, 

225 calibs=calibs, 

226 dataset_type_id=row["id"], 

227 collections=self._collections, 

228 ) 

229 byName[datasetType.name] = storage 

230 byId[storage._dataset_type_id] = storage 

231 self._byName = byName 

232 self._byId = byId 

233 self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType) 

234 

235 def remove(self, name: str) -> None: 

236 # Docstring inherited from DatasetRecordStorageManager. 

237 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

238 if componentName is not None: 

239 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

240 

241 # Delete the row 

242 try: 

243 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

244 except sqlalchemy.exc.IntegrityError as e: 

245 raise OrphanedRecordError( 

246 f"Dataset type {name} can not be removed." 

247 " It is associated with datasets that must be removed first." 

248 ) from e 

249 

250 # Now refresh everything -- removal is rare enough that this does 

251 # not need to be fast. 

252 self.refresh() 

253 

254 def find(self, name: str) -> Optional[DatasetRecordStorage]: 

255 # Docstring inherited from DatasetRecordStorageManager. 

256 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 
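# For example, a component name such as "calexp.wcs" splits into the composite
# "calexp" and the component "wcs"; a non-component name yields (name, None).
# (Illustrative names only.)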

257 storage = self._byName.get(compositeName) 

258 if storage is not None and componentName is not None: 

259 componentStorage = copy.copy(storage) 

260 componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName) 

261 return componentStorage 

262 else: 

263 return storage 

264 

265 def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]: 

266 # Docstring inherited from DatasetRecordStorageManager. 

267 if datasetType.isComponent():  # coverage: 267 ↛ 268 (line 267 didn't jump to line 268, because the condition on line 267 was never true)

268 raise ValueError( 

269 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

270 ) 

271 storage = self._byName.get(datasetType.name) 

272 if storage is None: 

273 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) 

274 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

275 calibTableName = ( 

276 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

277 ) 

278 # The order is important here: we want to create the tables first and

279 # only register them if that operation succeeds. We cannot wrap it in

280 # a transaction because the Database class assumes that DDL is not

281 # transaction-safe in general.
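# If a later step fails, any tables created here are left behind, but that
# should be harmless: their names are deterministic, so a subsequent attempt
# will simply pick them up again via ensureTableExists.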

282 tags = self._db.ensureTableExists( 

283 tagTableName, 

284 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

285 ) 

286 if calibTableName is not None: 

287 calibs = self._db.ensureTableExists( 

288 calibTableName, 

289 makeCalibTableSpec( 

290 datasetType, 

291 type(self._collections), 

292 self._db.getTimespanRepresentation(), 

293 self.getIdColumnType(), 

294 ), 

295 ) 

296 else: 

297 calibs = None 

298 row, inserted = self._db.sync( 

299 self._static.dataset_type, 

300 keys={"name": datasetType.name}, 

301 compared={ 

302 "dimensions_key": dimensionsKey, 

303 # Force the storage class to be loaded to ensure it 

304 # exists and there is no typo in the name. 

305 "storage_class": datasetType.storageClass.name, 

306 }, 

307 extra={ 

308 "tag_association_table": tagTableName, 

309 "calibration_association_table": calibTableName, 

310 }, 

311 returning=["id", "tag_association_table"], 

312 ) 

313 assert row is not None 
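# sync() should return the requested "returning" columns whether the row was
# just inserted or already present; the assert narrows the Optional for mypy.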

314 storage = self._recordStorageType( 

315 db=self._db, 

316 datasetType=datasetType, 

317 static=self._static, 

318 summaries=self._summaries, 

319 tags=tags, 

320 calibs=calibs, 

321 dataset_type_id=row["id"], 

322 collections=self._collections, 

323 ) 

324 self._byName[datasetType.name] = storage 

325 self._byId[storage._dataset_type_id] = storage 

326 else: 

327 if datasetType != storage.datasetType: 

328 raise ConflictingDefinitionError( 

329 f"Given dataset type {datasetType} is inconsistent " 

330 f"with database definition {storage.datasetType}." 

331 ) 

332 inserted = False 

333 return storage, bool(inserted) 

334 

335 def __iter__(self) -> Iterator[DatasetType]: 

336 for storage in self._byName.values(): 

337 yield storage.datasetType 

338 

339 def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]: 

340 # Docstring inherited from DatasetRecordStorageManager. 

341 sql = ( 

342 sqlalchemy.sql.select( 

343 self._static.dataset.columns.dataset_type_id, 

344 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

345 ) 

346 .select_from(self._static.dataset) 

347 .where(self._static.dataset.columns.id == id) 

348 ) 

349 row = self._db.query(sql).mappings().fetchone() 

350 if row is None: 

351 return None 

352 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

353 if recordsForType is None:  # coverage: 353 ↛ 354 (line 353 didn't jump to line 354, because the condition on line 353 was never true)

354 self.refresh() 

355 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

356 assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 

357 return DatasetRef( 

358 recordsForType.datasetType, 

359 dataId=recordsForType.getDataId(id=id), 

360 id=id, 

361 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

362 ) 

363 

364 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

365 # Docstring inherited from DatasetRecordStorageManager. 

366 return self._summaries.get(collection) 

367 

368 def schemaDigest(self) -> Optional[str]: 

369 # Docstring inherited from VersionedExtension. 

370 return self._defaultSchemaDigest(self._static, self._db.dialect) 

371 

372 _version: VersionTuple 

373 """Schema version for this class.""" 

374 

375 _recordStorageType: Type[ByDimensionsDatasetRecordStorage] 

376 """Type of the storage class returned by this manager.""" 

377 

378 _autoincrement: bool 

379 """If True then PK column of the dataset table is auto-increment.""" 

380 

381 _idColumnType: type 

382 """Type of dataset column used to store dataset ID.""" 

383 

384 

385class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase): 

386 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

387 an auto-increment integer for the dataset primary key.

388 """ 

389 

390 _version: VersionTuple = _VERSION_INT 

391 _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt 

392 _autoincrement: bool = True 

393 _idColumnType: type = sqlalchemy.BigInteger 

394 

395 @classmethod 

396 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

397 # Docstring inherited from DatasetRecordStorageManager. 
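# Integer dataset IDs are assigned by the database autoincrement column, so
# only UNIQUE generation is supported here; the UUID manager below accepts
# all generation modes.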

398 # MyPy seems confused about enum value types here. 

399 return mode is mode.UNIQUE # type: ignore 

400 

401 

402class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

403 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

404 a UUID for the dataset primary key.

405 """ 

406 

407 _version: VersionTuple = _VERSION_UUID 

408 _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

409 _autoincrement: bool = False 

410 _idColumnType: type = ddl.GUID 

411 

412 @classmethod 

413 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

414 # Docstring inherited from DatasetRecordStorageManager. 

415 return True
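

# A minimal usage sketch, illustrative only and not part of the original
# module: it assumes the caller (normally the enclosing Registry) supplies the
# database, the static-table context, and the companion managers, and that
# ``datasetType`` is any non-component DatasetType. The function name is
# hypothetical.
def _example_manager_usage(
    db: Database,
    context: StaticTablesContext,
    collections: CollectionManager,
    dimensions: DimensionRecordStorageManager,
    datasetType: DatasetType,
) -> None:
    # Declare the static tables (if needed) and construct the manager; the
    # UUID-keyed variant is shown here.
    manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
        db, context, collections=collections, dimensions=dimensions
    )
    # Load any dataset types already registered in the database.
    manager.refresh()
    # Register a dataset type; ``inserted`` reports whether its row was new.
    storage, inserted = manager.register(datasetType)
    # A registered type can be looked up again by name; component names such
    # as "<composite>.<component>" resolve through their composite type.
    assert manager.find(datasetType.name) is storage
    # ``inserted`` is True only on the call that actually created the row.
    assert isinstance(inserted, bool)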