Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%

204 statements  

coverage.py v6.5.0, created at 2022-12-15 02:03 -0800

1from __future__ import annotations 

2 

3__all__ = ( 

4 "ByDimensionsDatasetRecordStorageManager", 

5 "ByDimensionsDatasetRecordStorageManagerUUID", 

6) 

7 

8import logging 

9import warnings 

10from collections import defaultdict 

11from typing import TYPE_CHECKING, Any 

12 

13import sqlalchemy 

14from deprecated.sphinx import deprecated 

15from lsst.utils.ellipsis import Ellipsis 

16 

17from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl 

18from ..._collection_summary import CollectionSummary 

19from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError 

20from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple 

21from ...wildcards import DatasetTypeWildcard 

22from ._storage import ( 

23 ByDimensionsDatasetRecordStorage, 

24 ByDimensionsDatasetRecordStorageInt, 

25 ByDimensionsDatasetRecordStorageUUID, 

26) 

27from .summaries import CollectionSummaryManager 

28from .tables import ( 

29 addDatasetForeignKey, 

30 makeCalibTableName, 

31 makeCalibTableSpec, 

32 makeStaticTableSpecs, 

33 makeTagTableName, 

34 makeTagTableSpec, 

35) 

36 

37 if TYPE_CHECKING:  [branch 37 ↛ 38 not taken: the condition on line 37 was never true]

38 from ...interfaces import ( 

39 CollectionManager, 

40 CollectionRecord, 

41 Database, 

42 DimensionRecordStorageManager, 

43 StaticTablesContext, 

44 ) 

45 from .tables import StaticDatasetTablesTuple 

46 

47 

48# This has to be updated on every schema change 

49_VERSION_INT = VersionTuple(1, 0, 0) 

50_VERSION_UUID = VersionTuple(1, 0, 0) 

51 

52_LOG = logging.getLogger(__name__) 

53 

54 

55class MissingDatabaseTableError(RuntimeError): 

56 """Exception raised when a table is not found in a database.""" 

57 

58 

59class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

60 """A manager class for datasets that uses one dataset-collection table for 

61 each group of dataset types that share the same dimensions. 

62 

63 In addition to the table organization, this class makes a number of 

64 other design choices that would have been cumbersome (to say the least) to 

65 try to pack into its name: 

66 

67 - It uses a private surrogate integer autoincrement field to identify 

68 dataset types, instead of using the name as the primary and foreign key 

69 directly. 

70 

71 - It aggressively loads all DatasetTypes into memory instead of fetching 

72 them from the database only when needed or attempting more clever forms 

73 of caching. 

74 

75 Alternative implementations that make different choices for these while 

76 keeping the same general table organization might be reasonable as well. 

77 

78 This class provides a complete implementation of the manager logic, but it is 

79 parametrized by a few class attributes that must be defined by 

80 subclasses. 

81 

82 Parameters 

83 ---------- 

84 db : `Database` 

85 Interface to the underlying database engine and namespace. 

86 collections : `CollectionManager` 

87 Manager object for the collections in this `Registry`. 

88 dimensions : `DimensionRecordStorageManager` 

89 Manager object for the dimensions in this `Registry`. 

90 static : `StaticDatasetTablesTuple` 

91 Named tuple of `sqlalchemy.schema.Table` instances for all static 

92 tables used by this class. 

93 summaries : `CollectionSummaryManager` 

94 Structure containing tables that summarize the contents of collections. 

95 """ 

96 

97 def __init__( 

98 self, 

99 *, 

100 db: Database, 

101 collections: CollectionManager, 

102 dimensions: DimensionRecordStorageManager, 

103 static: StaticDatasetTablesTuple, 

104 summaries: CollectionSummaryManager, 

105 ): 

106 self._db = db 

107 self._collections = collections 

108 self._dimensions = dimensions 

109 self._static = static 

110 self._summaries = summaries 

111 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

112 self._byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {} 

113 

114 @classmethod 

115 def initialize( 

116 cls, 

117 db: Database, 

118 context: StaticTablesContext, 

119 *, 

120 collections: CollectionManager, 

121 dimensions: DimensionRecordStorageManager, 

122 ) -> DatasetRecordStorageManager: 

123 # Docstring inherited from DatasetRecordStorageManager. 

124 specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe) 

125 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

126 summaries = CollectionSummaryManager.initialize( 

127 db, 

128 context, 

129 collections=collections, 

130 dimensions=dimensions, 

131 ) 

132 return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries) 

133 

134 @classmethod 

135 def currentVersion(cls) -> VersionTuple | None: 

136 # Docstring inherited from VersionedExtension. 

137 return cls._version 

138 

139 @classmethod 

140 def makeStaticTableSpecs( 

141 cls, collections: type[CollectionManager], universe: DimensionUniverse 

142 ) -> StaticDatasetTablesTuple: 

143 """Construct all static tables used by the classes in this package. 

144 

145 Static tables are those that are present in all Registries and do not 

146 depend on what DatasetTypes have been registered. 

147 

148 Parameters 

149 ---------- 

150 collections : `CollectionManager` 

151 Manager object for the collections in this `Registry`. 

152 universe : `DimensionUniverse` 

153 Universe graph containing all dimensions known to this `Registry`. 

154 

155 Returns 

156 ------- 

157 specs : `StaticDatasetTablesTuple` 

158 A named tuple containing `ddl.TableSpec` instances. 

159 """ 

160 return makeStaticTableSpecs( 

161 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement 

162 ) 

163 

164 @classmethod 

165 def getIdColumnType(cls) -> type: 

166 # Docstring inherited from base class. 

167 return cls._idColumnType 

168 

169 @classmethod 

170 def addDatasetForeignKey( 

171 cls, 

172 tableSpec: ddl.TableSpec, 

173 *, 

174 name: str = "dataset", 

175 constraint: bool = True, 

176 onDelete: str | None = None, 

177 **kwargs: Any, 

178 ) -> ddl.FieldSpec: 

179 # Docstring inherited from DatasetRecordStorageManager. 

180 return addDatasetForeignKey( 

181 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

182 ) 
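# Illustrative sketch, not part of the original module: one way a caller might use
# addDatasetForeignKey() to tie its own table to the dataset table.  The table and
# field names below, and the exact ddl.TableSpec/FieldSpec construction, are
# assumptions made for the sake of the example.
import sqlalchemy
from lsst.daf.butler import ddl

spec = ddl.TableSpec(
    fields=[ddl.FieldSpec("payload", dtype=sqlalchemy.String, length=64, primaryKey=True)]
)
# Appends a dataset-ID field (typed via getIdColumnType()) and, because
# constraint=True by default, a foreign-key constraint onto the static dataset
# table; onDelete="CASCADE" drops dependent rows along with the dataset.
dataset_field = ByDimensionsDatasetRecordStorageManagerUUID.addDatasetForeignKey(
    spec, onDelete="CASCADE"
)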

183 

184 def refresh(self) -> None: 

185 # Docstring inherited from DatasetRecordStorageManager. 

186 byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

187 byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {} 

188 c = self._static.dataset_type.columns 

189 for row in self._db.query(self._static.dataset_type.select()).mappings(): 

190 name = row[c.name] 

191 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) 

192 calibTableName = row[c.calibration_association_table] 

193 datasetType = DatasetType( 

194 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) 

195 ) 

196 tags = self._db.getExistingTable( 

197 row[c.tag_association_table], 

198 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

199 ) 

200 if tags is None:  [branch 200 ↛ 201 not taken: the condition on line 200 was never true]

201 raise MissingDatabaseTableError( 

202 f"Table {row[c.tag_association_table]} is missing from database schema." 

203 ) 

204 if calibTableName is not None: 

205 calibs = self._db.getExistingTable( 

206 row[c.calibration_association_table], 

207 makeCalibTableSpec( 

208 datasetType, 

209 type(self._collections), 

210 self._db.getTimespanRepresentation(), 

211 self.getIdColumnType(), 

212 ), 

213 ) 

214 if calibs is None:  [branch 214 ↛ 215 not taken: the condition on line 214 was never true]

215 raise MissingDatabaseTableError( 

216 f"Table {row[c.calibration_association_table]} is missing from database schema." 

217 ) 

218 else: 

219 calibs = None 

220 storage = self._recordStorageType( 

221 db=self._db, 

222 datasetType=datasetType, 

223 static=self._static, 

224 summaries=self._summaries, 

225 tags=tags, 

226 calibs=calibs, 

227 dataset_type_id=row["id"], 

228 collections=self._collections, 

229 ) 

230 byName[datasetType.name] = storage 

231 byId[storage._dataset_type_id] = storage 

232 self._byName = byName 

233 self._byId = byId 

234 self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType) 

235 

236 def remove(self, name: str) -> None: 

237 # Docstring inherited from DatasetRecordStorageManager. 

238 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

239 if componentName is not None: 

240 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

241 

242 # Delete the row 

243 try: 

244 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

245 except sqlalchemy.exc.IntegrityError as e: 

246 raise OrphanedRecordError( 

247 f"Dataset type {name} can not be removed." 

248 " It is associated with datasets that must be removed first." 

249 ) from e 

250 

251 # Now refresh everything -- removal is rare enough that this does 

252 # not need to be fast. 

253 self.refresh() 
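# Illustrative sketch, not part of the original module: removal fails while datasets
# of the given type still exist, so a caller typically handles OrphanedRecordError.
# The manager instance and the dataset type name are hypothetical.
try:
    manager.remove("example_dataset_type")
except OrphanedRecordError:
    # Datasets of this type still exist; remove them first, then retry.
    ...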

254 

255 def find(self, name: str) -> DatasetRecordStorage | None: 

256 # Docstring inherited from DatasetRecordStorageManager. 

257 return self._byName.get(name) 

258 

259 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]: 

260 # Docstring inherited from DatasetRecordStorageManager. 

261 if datasetType.isComponent():  [branch 261 ↛ 262 not taken: the condition on line 261 was never true]

262 raise ValueError( 

263 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

264 ) 

265 storage = self._byName.get(datasetType.name) 

266 if storage is None: 

267 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) 

268 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

269 calibTableName = ( 

270 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

271 ) 

272 # The order is important here: we want to create the tables first and 

273 # only register them if that operation succeeds. We cannot wrap it 

274 # in a transaction because the database class assumes that DDL is 

275 # not transaction-safe in general. 

276 tags = self._db.ensureTableExists( 

277 tagTableName, 

278 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

279 ) 

280 if calibTableName is not None: 

281 calibs = self._db.ensureTableExists( 

282 calibTableName, 

283 makeCalibTableSpec( 

284 datasetType, 

285 type(self._collections), 

286 self._db.getTimespanRepresentation(), 

287 self.getIdColumnType(), 

288 ), 

289 ) 

290 else: 

291 calibs = None 

292 row, inserted = self._db.sync( 

293 self._static.dataset_type, 

294 keys={"name": datasetType.name}, 

295 compared={ 

296 "dimensions_key": dimensionsKey, 

297 # Force the storage class to be loaded to ensure it 

298 # exists and there is no typo in the name. 

299 "storage_class": datasetType.storageClass.name, 

300 }, 

301 extra={ 

302 "tag_association_table": tagTableName, 

303 "calibration_association_table": calibTableName, 

304 }, 

305 returning=["id", "tag_association_table"], 

306 ) 

307 assert row is not None 

308 storage = self._recordStorageType( 

309 db=self._db, 

310 datasetType=datasetType, 

311 static=self._static, 

312 summaries=self._summaries, 

313 tags=tags, 

314 calibs=calibs, 

315 dataset_type_id=row["id"], 

316 collections=self._collections, 

317 ) 

318 self._byName[datasetType.name] = storage 

319 self._byId[storage._dataset_type_id] = storage 

320 else: 

321 if datasetType != storage.datasetType: 

322 raise ConflictingDefinitionError( 

323 f"Given dataset type {datasetType} is inconsistent " 

324 f"with database definition {storage.datasetType}." 

325 ) 

326 inserted = False 

327 return storage, bool(inserted) 
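# Illustrative sketch, not part of the original module: register() is idempotent.
# The first call creates the tag/calib tables and reports inserted=True; repeating
# it with an identical definition returns the existing storage with inserted=False,
# and a conflicting definition raises ConflictingDefinitionError.  The manager,
# universe, and DatasetType arguments below are assumptions for the example.
dataset_type = DatasetType(
    "example_type",
    dimensions=["instrument", "detector"],
    storageClass="StructuredDataDict",
    universe=universe,
)
storage, inserted = manager.register(dataset_type)
if inserted:
    _LOG.debug("Registered new dataset type %s", dataset_type.name)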

328 

329 def resolve_wildcard( 

330 self, 

331 expression: Any, 

332 components: bool | None = None, 

333 missing: list[str] | None = None, 

334 explicit_only: bool = False, 

335 ) -> dict[DatasetType, list[str | None]]: 

336 wildcard = DatasetTypeWildcard.from_expression(expression) 

337 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set) 

338 # This message can be transformed into an error on DM-36303 after v26, 

339 # and the components argument here (and in all callers) can be removed 

340 # entirely on DM-36457 after v27. 

341 deprecation_message = ( 

342 "Querying for component datasets via Registry query methods is deprecated in favor of using " 

343 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported " 

344 "after v26, and the components argument will be removed after v27." 

345 ) 

346 for name, dataset_type in wildcard.values.items(): 

347 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

348 if component_name is not None: 

349 warnings.warn(deprecation_message, FutureWarning) 

350 if (found_storage := self.find(parent_name)) is not None: 

351 found_parent = found_storage.datasetType 

352 if component_name is not None: 

353 found = found_parent.makeComponentDatasetType(component_name) 

354 else: 

355 found = found_parent 

356 if dataset_type is not None: 

357 if dataset_type.is_compatible_with(found):  [branch 357 ↛ 365 not taken: the condition on line 357 was never false]

358 # Prefer the given dataset type to enable storage class 

359 # conversions. 

360 if component_name is not None:  [branch 360 ↛ 361 not taken: the condition on line 360 was never true]

361 found_parent = dataset_type.makeCompositeDatasetType() 

362 else: 

363 found_parent = dataset_type 

364 else: 

365 raise DatasetTypeError( 

366 f"Dataset type definition in query expression {dataset_type} is " 

367 f"not compatible with the registered type {found}." 

368 ) 

369 result[found_parent].add(component_name) 

370 elif missing is not None:  [branch 370 ↛ 346 not taken: the condition on line 370 was never false]

371 missing.append(name) 

372 already_warned = False 

373 if wildcard.patterns is Ellipsis: 

374 if explicit_only: 

375 raise TypeError( 

376 "Universal wildcard '...' is not permitted for dataset types in this context." 

377 ) 

378 for storage in self._byName.values(): 

379 result[storage.datasetType].add(None) 

380 if components: 

381 try: 

382 result[storage.datasetType].update( 

383 storage.datasetType.storageClass.allComponents().keys() 

384 ) 

385 if storage.datasetType.storageClass.allComponents() and not already_warned: 

386 warnings.warn(deprecation_message, FutureWarning) 

387 already_warned = True 

388 except KeyError as err: 

389 _LOG.warning( 

390 f"Could not load storage class {err} for {storage.datasetType.name}; " 

391 "if it has components they will not be included in query results.", 

392 ) 

393 elif wildcard.patterns: 

394 if explicit_only: 

395 # After v26 this should raise DatasetTypeExpressionError, to 

396 # be implemented on DM-36303. 

397 warnings.warn( 

398 "Passing wildcard patterns here is deprecated and will be prohibited after v26.", 

399 FutureWarning, 

400 ) 

401 for storage in self._byName.values(): 

402 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns): 

403 result[storage.datasetType].add(None) 

404 if components is not False: 

405 for storage in self._byName.values(): 

406 if components is None and storage.datasetType in result: 

407 continue 

408 try: 

409 components_for_parent = storage.datasetType.storageClass.allComponents().keys() 

410 except KeyError as err: 

411 _LOG.warning( 

412 f"Could not load storage class {err} for {storage.datasetType.name}; " 

413 "if it has components they will not be included in query results." 

414 ) 

415 continue 

416 for component_name in components_for_parent: 

417 if any( 

418 p.fullmatch( 

419 DatasetType.nameWithComponent(storage.datasetType.name, component_name) 

420 ) 

421 for p in wildcard.patterns 

422 ): 

423 result[storage.datasetType].add(component_name) 

424 if not already_warned: 

425 warnings.warn(deprecation_message, FutureWarning) 

426 already_warned = True 

427 return {k: list(v) for k, v in result.items()} 
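# Illustrative sketch, not part of the original module: resolve_wildcard() accepts
# explicit names, DatasetType instances, re.Pattern objects, or the universal
# wildcard "...", and maps each matched parent dataset type to the component names
# requested for it (None standing for the parent itself).  The names and the
# manager instance here are hypothetical.
import re

missing: list[str] = []
matches = manager.resolve_wildcard(
    ["example_type", re.compile(r"example_.*_metadata")],
    components=False,
    missing=missing,
)
for parent_type, component_names in matches.items():
    print(parent_type.name, component_names)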

428 

429 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

430 # Docstring inherited from DatasetRecordStorageManager. 

431 sql = ( 

432 sqlalchemy.sql.select( 

433 self._static.dataset.columns.dataset_type_id, 

434 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

435 ) 

436 .select_from(self._static.dataset) 

437 .where(self._static.dataset.columns.id == id) 

438 ) 

439 row = self._db.query(sql).mappings().fetchone() 

440 if row is None: 

441 return None 

442 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

443 if recordsForType is None:  [branch 443 ↛ 444 not taken: the condition on line 443 was never true]

444 self.refresh() 

445 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

446 assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 

447 return DatasetRef( 

448 recordsForType.datasetType, 

449 dataId=recordsForType.getDataId(id=id), 

450 id=id, 

451 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

452 ) 
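# Illustrative sketch, not part of the original module: looking up a ref by dataset
# ID returns None for unknown IDs; otherwise the DatasetRef carries the dataset
# type, its data ID, and the run collection name.  The UUID value and the manager
# instance are hypothetical.
import uuid

ref = manager.getDatasetRef(uuid.UUID("00000000-0000-0000-0000-000000000000"))
if ref is None:
    print("no such dataset")
else:
    print(ref.datasetType.name, ref.dataId, ref.run)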

453 

454 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

455 # Docstring inherited from DatasetRecordStorageManager. 

456 return self._summaries.get(collection) 

457 

458 def schemaDigest(self) -> str | None: 

459 # Docstring inherited from VersionedExtension. 

460 return self._defaultSchemaDigest(self._static, self._db.dialect) 

461 

462 _version: VersionTuple 

463 """Schema version for this class.""" 

464 

465 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

466 """Type of the storage class returned by this manager.""" 

467 

468 _autoincrement: bool 

469 """If True then PK column of the dataset table is auto-increment.""" 

470 

471 _idColumnType: type 

472 """Type of dataset column used to store dataset ID.""" 

473 

474 

475@deprecated( 

476 "Integer dataset IDs are deprecated in favor of UUIDs; support will be removed after v26. " 

477 "Please migrate or re-create this data repository.", 

478 version="v25.0", 

479 category=FutureWarning, 

480) 

481class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase): 

482 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

483 auto-incremental integer for dataset primary key. 

484 """ 

485 

486 _version: VersionTuple = _VERSION_INT 

487 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt 

488 _autoincrement: bool = True 

489 _idColumnType: type = sqlalchemy.BigInteger 

490 

491 @classmethod 

492 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

493 # Docstring inherited from DatasetRecordStorageManager. 

494 # MyPy seems confused about enum value types here. 

495 return mode is mode.UNIQUE # type: ignore 

496 

497 

498class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

499 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

500 UUID for dataset primary key. 

501 """ 

502 

503 _version: VersionTuple = _VERSION_UUID 

504 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

505 _autoincrement: bool = False 

506 _idColumnType: type = ddl.GUID 

507 

508 @classmethod 

509 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

510 # Docstring inherited from DatasetRecordStorageManager. 

511 return True
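# Illustrative sketch, not part of the original module: the two concrete managers
# differ in which dataset-ID generation modes they accept.  The integer manager
# only supports UNIQUE (autoincrement) IDs, while the UUID manager accepts any
# DatasetIdGenEnum mode; DATAID_TYPE is used below only as an example and is
# assumed to be a member of that enum.
assert ByDimensionsDatasetRecordStorageManager.supportsIdGenerationMode(DatasetIdGenEnum.UNIQUE)
assert not ByDimensionsDatasetRecordStorageManager.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE)
assert ByDimensionsDatasetRecordStorageManagerUUID.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE)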