Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 94%

207 statements  

coverage.py v6.5.0, created at 2023-03-04 02:04 -0800

from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

import logging
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Any

import sqlalchemy
from deprecated.sphinx import deprecated
from lsst.utils.ellipsis import Ellipsis

from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ...wildcards import DatasetTypeWildcard
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:  # coverage: condition was never true at runtime
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)

_LOG = logging.getLogger(__name__)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that have to be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """

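    # Two in-memory caches mirror the ``dataset_type`` table: ``_byName`` maps
    # a dataset type name to its storage object, and ``_byId`` maps the
    # surrogate dataset type ID to the same object.  Both are rebuilt by
    # ``refresh()`` and updated incrementally by ``register()``.  As described
    # in the class docstring, dataset types that share the same dimensions
    # also share dynamically-created "tags" (and, for calibrations, "calibs")
    # tables whose names are derived from the saved dimensions key.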

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

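    # ``initialize()`` is the construction entry point used while the static
    # registry schema is being declared: it adds the static dataset table
    # specs through the shared ``StaticTablesContext`` and builds the
    # companion ``CollectionSummaryManager`` before instantiating the manager.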

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> VersionTuple | None:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(
        cls, collections: type[CollectionManager], universe: DimensionUniverse
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `type` [ `CollectionManager` ]
            Manager class for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(
            collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
        )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )

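    # ``refresh()`` rebuilds both caches from scratch by scanning the static
    # ``dataset_type`` table: each row is turned back into a ``DatasetType``
    # and paired with its existing tags/calibs tables.  A missing dynamic
    # table means the database schema is inconsistent with the registry
    # contents, which is reported as ``MissingDatabaseTableError``.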

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        with self._db.query(self._static.dataset_type.select()) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:  # coverage: partial branch, never true in tests
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:  # coverage: partial branch, never true in tests
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

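    # ``remove()`` deletes only the dataset type definition row; if datasets
    # of that type still exist, the database-level constraints reject the
    # delete and the resulting ``IntegrityError`` is re-raised as
    # ``OrphanedRecordError``.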

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} can not be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._byName.get(name)

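    # ``register()`` is idempotent: the dynamic tags/calibs tables are created
    # first (DDL is assumed not to be transaction-safe, so it cannot be rolled
    # back), and only then is the ``dataset_type`` row synced.  The boolean in
    # the returned tuple reports whether a new row was actually inserted; a
    # definition that conflicts with the one already registered raises
    # ``ConflictingDefinitionError``.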

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():  # coverage: partial branch, never true in tests
            raise ValueError(
                f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here, we want to create tables first and
            # only register them if this operation is successful. We cannot
            # wrap it into a transaction because database class assumes that
            # DDL is not transaction safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)

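    # ``resolve_wildcard()`` accepts three kinds of expressions and maps each
    # to the registered parent dataset types: explicit names (optionally with
    # a component suffix), the universal wildcard ``...`` (all registered
    # types), and regular-expression patterns.  Component resolution is on a
    # deprecation path (see the DM-36303 / DM-36457 notes below); matching
    # components trigger a ``FutureWarning``.  Names that match nothing are
    # appended to ``missing`` when that list is provided.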

    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        wildcard = DatasetTypeWildcard.from_expression(expression)
        result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
        # This message can be transformed into an error on DM-36303 after v26,
        # and the components and components_deprecated arguments can be merged
        # into one on DM-36457 after v27.
        deprecation_message = (
            "Querying for component datasets via Registry query methods is deprecated in favor of using "
            "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
            "after v26, and the components argument will be removed after v27."
        )
        for name, dataset_type in wildcard.values.items():
            parent_name, component_name = DatasetType.splitDatasetTypeName(name)
            if component_name is not None and components_deprecated:
                warnings.warn(deprecation_message, FutureWarning)
            if (found_storage := self.find(parent_name)) is not None:
                found_parent = found_storage.datasetType
                if component_name is not None:
                    found = found_parent.makeComponentDatasetType(component_name)
                else:
                    found = found_parent
                if dataset_type is not None:
                    if dataset_type.is_compatible_with(found):  # coverage: never false in tests
                        # Prefer the given dataset type to enable storage class
                        # conversions.
                        if component_name is not None:
                            found_parent = dataset_type.makeCompositeDatasetType()
                        else:
                            found_parent = dataset_type
                    else:
                        raise DatasetTypeError(
                            f"Dataset type definition in query expression {dataset_type} is "
                            f"not compatible with the registered type {found}."
                        )
                result[found_parent].add(component_name)
            elif missing is not None:
                missing.append(name)
        already_warned = False
        if wildcard.patterns is Ellipsis:
            if explicit_only:
                raise TypeError(
                    "Universal wildcard '...' is not permitted for dataset types in this context."
                )
            for storage in self._byName.values():
                result[storage.datasetType].add(None)
                if components:
                    try:
                        result[storage.datasetType].update(
                            storage.datasetType.storageClass.allComponents().keys()
                        )
                        if (
                            storage.datasetType.storageClass.allComponents()
                            and not already_warned
                            and components_deprecated
                        ):
                            warnings.warn(deprecation_message, FutureWarning)
                            already_warned = True
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results.",
                        )
        elif wildcard.patterns:
            if explicit_only:
                # After v26 this should raise DatasetTypeExpressionError, to
                # be implemented on DM-36303.
                warnings.warn(
                    "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
                    FutureWarning,
                )
            for storage in self._byName.values():
                if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
                    result[storage.datasetType].add(None)
            if components is not False:
                for storage in self._byName.values():
                    if components is None and storage.datasetType in result:
                        continue
                    try:
                        components_for_parent = storage.datasetType.storageClass.allComponents().keys()
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
                        continue
                    for component_name in components_for_parent:
                        if any(
                            p.fullmatch(
                                DatasetType.nameWithComponent(storage.datasetType.name, component_name)
                            )
                            for p in wildcard.patterns
                        ):
                            result[storage.datasetType].add(component_name)
                            if not already_warned and components_deprecated:
                                warnings.warn(deprecation_message, FutureWarning)
                                already_warned = True
        return {k: list(v) for k, v in result.items()}

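    # ``getDatasetRef()`` reconstructs a full ``DatasetRef`` from a dataset ID
    # by reading the static ``dataset`` table row and combining it with the
    # cached dataset type and the run collection name.  If the dataset type is
    # not yet in ``_byId`` (presumably registered after the last refresh), the
    # cache is refreshed once and the lookup retried.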

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        with self._db.query(sql) as sql_result:
            row = sql_result.mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:  # coverage: partial branch, never true in tests
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> str | None:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True then PK column of the dataset table is auto-increment."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


@deprecated(
    "Integer dataset IDs are deprecated in favor of UUIDs; support will be removed after v26. "
    "Please migrate or re-create this data repository.",
    version="v25.0",
    category=FutureWarning,
)
class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses an auto-incrementing integer for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_INT
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses a UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
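
# Illustrative usage sketch, not part of this module: the ``db``, ``context``,
# ``collections`` and ``dimensions`` objects, and the ``my_dataset_type`` /
# ``dataset_id`` values, are assumed to be supplied by the enclosing Registry
# machinery; only the manager calls shown are defined in this file.
#
#     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#         db, context, collections=collections, dimensions=dimensions
#     )
#     manager.refresh()
#     storage, inserted = manager.register(my_dataset_type)
#     if (found := manager.find(my_dataset_type.name)) is not None:
#         ref = manager.getDatasetRef(dataset_id)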