Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%

202 statements  

coverage.py v6.5.0, created at 2022-11-15 01:58 -0800

from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

import logging
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Any

import sqlalchemy
from lsst.utils.ellipsis import Ellipsis

from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ...wildcards import DatasetTypeWildcard
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:  # coverage: branch never taken at runtime (type-checking only)
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# These have to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)

_LOG = logging.getLogger(__name__)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """
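
    # A minimal usage sketch (illustrative only; ``manager`` stands for an
    # initialized instance of a concrete subclass and ``dataset_type`` for a
    # parent `DatasetType`):
    #
    #     storage, inserted = manager.register(dataset_type)
    #     same_storage = manager.find(dataset_type.name)
    #
    # ``register`` creates the dynamic tag/calibration tables on first use,
    # while ``find`` only consults the in-memory cache populated by
    # ``refresh`` (and by ``register`` itself).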

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
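
    # ``initialize`` is the entry point used while the registry schema is
    # being constructed: it declares the static tables through the
    # ``StaticTablesContext``, sets up the collection-summary tables, and only
    # then instantiates the manager.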

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> VersionTuple | None:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(
        cls, collections: type[CollectionManager], universe: DimensionUniverse
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `type` [`CollectionManager`]
            Type of the manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(
            collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
        )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )
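
    # ``refresh`` rebuilds the in-memory caches (``_byName`` and ``_byId``)
    # from the static ``dataset_type`` table, reattaching each dataset type to
    # its dynamic tag table and, for calibrations, its calibration table.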

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).mappings():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:  # coverage: branch never taken in tests
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:  # coverage: branch never taken in tests
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)
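
    # ``remove`` only deletes the ``dataset_type`` row; the foreign-key
    # constraint turns an attempt to remove a type that still has datasets
    # into an ``OrphanedRecordError``.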

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} can not be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._byName.get(name)
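
    # ``register`` either returns the existing storage for a dataset type
    # (raising ``ConflictingDefinitionError`` if the definitions disagree) or
    # creates the dynamic tag/calibration tables and then syncs the
    # ``dataset_type`` row; the returned flag reports whether a new row was
    # inserted.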

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():  # coverage: branch never taken in tests
            raise ValueError(
                f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here: we want to create tables first and
            # only register them if this operation is successful. We cannot
            # wrap it into a transaction because the database class assumes
            # that DDL is not transaction safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)
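
    # ``resolve_wildcard`` expands a dataset-type expression into a mapping
    # from parent `DatasetType` to the component names requested for it, with
    # ``None`` standing for the parent itself.  A hedged, illustrative sketch
    # (the dataset type names are hypothetical):
    #
    #     manager.resolve_wildcard("calexp")      # -> {calexp: [None]}
    #     manager.resolve_wildcard("calexp.wcs")  # -> {calexp: ["wcs"]} (deprecated path)
    #     manager.resolve_wildcard(...)           # every registered parent type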

    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> dict[DatasetType, list[str | None]]:
        wildcard = DatasetTypeWildcard.from_expression(expression)
        result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
        # This message can be transformed into an error on DM-36303 after v26,
        # and the components argument here (and in all callers) can be removed
        # entirely on DM-36457 after v27.
        deprecation_message = (
            "Querying for component datasets via Registry query methods is deprecated in favor of using "
            "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
            "after v26, and the components argument will be removed after v27."
        )
        for name, dataset_type in wildcard.values.items():
            parent_name, component_name = DatasetType.splitDatasetTypeName(name)
            if component_name is not None:
                warnings.warn(deprecation_message, FutureWarning)
            if (found_storage := self.find(parent_name)) is not None:
                found_parent = found_storage.datasetType
                if component_name is not None:
                    found = found_parent.makeComponentDatasetType(component_name)
                else:
                    found = found_parent
                if dataset_type is not None:
                    if dataset_type.is_compatible_with(found):  # coverage: condition never false in tests
                        # Prefer the given dataset type to enable storage class
                        # conversions.
                        if component_name is not None:  # coverage: branch never taken in tests
                            found_parent = dataset_type.makeCompositeDatasetType()
                        else:
                            found_parent = dataset_type
                    else:
                        raise DatasetTypeError(
                            f"Dataset type definition in query expression {dataset_type} is "
                            f"not compatible with the registered type {found}."
                        )
                result[found_parent].add(component_name)
            elif missing is not None:  # coverage: condition never false in tests
                missing.append(name)
        already_warned = False
        if wildcard.patterns is Ellipsis:
            if explicit_only:
                raise TypeError(
                    "Universal wildcard '...' is not permitted for dataset types in this context."
                )
            for storage in self._byName.values():
                result[storage.datasetType].add(None)
                if components:
                    try:
                        result[storage.datasetType].update(
                            storage.datasetType.storageClass.allComponents().keys()
                        )
                        if storage.datasetType.storageClass.allComponents() and not already_warned:
                            warnings.warn(deprecation_message, FutureWarning)
                            already_warned = True
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results.",
                        )
        elif wildcard.patterns:
            if explicit_only:
                # After v26 this should raise DatasetTypeExpressionError, to
                # be implemented on DM-36303.
                warnings.warn(
                    "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
                    FutureWarning,
                )
            for storage in self._byName.values():
                if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
                    result[storage.datasetType].add(None)
            if components is not False:
                for storage in self._byName.values():
                    if components is None and storage.datasetType in result:
                        continue
                    try:
                        components_for_parent = storage.datasetType.storageClass.allComponents().keys()
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
                        continue
                    for component_name in components_for_parent:
                        if any(
                            p.fullmatch(
                                DatasetType.nameWithComponent(storage.datasetType.name, component_name)
                            )
                            for p in wildcard.patterns
                        ):
                            result[storage.datasetType].add(component_name)
                            if not already_warned:
                                warnings.warn(deprecation_message, FutureWarning)
                                already_warned = True
        return {k: list(v) for k, v in result.items()}
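
    # ``getDatasetRef`` looks up the dataset row to obtain its dataset type id
    # and run, then asks the per-type storage object for the data ID; the
    # caches are refreshed once if the dataset type is not yet known locally.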

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        row = self._db.query(sql).mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:  # coverage: branch never taken in tests
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> str | None:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)
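
    # Concrete subclasses configure the manager by defining the class
    # attributes below (see the two implementations at the end of this
    # module).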

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True then the PK column of the dataset table is auto-incremented."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses an auto-incremented integer for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_INT
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore
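
# Of the two concrete managers, the integer-based one above only supports
# DatasetIdGenEnum.UNIQUE id generation, while the UUID-based one below
# accepts any generation mode.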

class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True