Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 95%

194 statements  

coverage.py v6.5.0, created at 2023-04-04 02:05 -0700

1 from __future__ import annotations

2

3 __all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)

4

5 import logging

6 import warnings

7 from collections import defaultdict

8 from typing import TYPE_CHECKING, Any

9

10 import sqlalchemy

11 from lsst.utils.ellipsis import Ellipsis

12

13 from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl

14 from ..._collection_summary import CollectionSummary

15 from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError

16 from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple

17 from ...wildcards import DatasetTypeWildcard

18 from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID

19 from .summaries import CollectionSummaryManager

20 from .tables import (

21 addDatasetForeignKey, 

22 makeCalibTableName, 

23 makeCalibTableSpec, 

24 makeStaticTableSpecs, 

25 makeTagTableName, 

26 makeTagTableSpec, 

27) 

28 

29 if TYPE_CHECKING:

30 from ...interfaces import ( 

31 CollectionManager, 

32 CollectionRecord, 

33 Database, 

34 DimensionRecordStorageManager, 

35 StaticTablesContext, 

36 ) 

37 from .tables import StaticDatasetTablesTuple 

38 

39 

40 # This has to be updated on every schema change

41 _VERSION_UUID = VersionTuple(1, 0, 0)

42

43 _LOG = logging.getLogger(__name__)

44 

45 

46 class MissingDatabaseTableError(RuntimeError):

47 """Exception raised when a table is not found in a database.""" 

48 

49 

50 class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):

51 """A manager class for datasets that uses one dataset-collection table for 

52 each group of dataset types that share the same dimensions. 

53 

54 In addition to the table organization, this class makes a number of 

55 other design choices that would have been cumbersome (to say the least) to 

56 try to pack into its name: 

57 

58 - It uses a private surrogate integer autoincrement field to identify 

59 dataset types, instead of using the name as the primary and foreign key 

60 directly. 

61 

62 - It aggressively loads all DatasetTypes into memory instead of fetching 

63 them from the database only when needed or attempting more clever forms 

64 of caching. 

65 

66 Alternative implementations that make different choices for these while 

67 keeping the same general table organization might be reasonable as well. 

68 

69 This class provides a complete implementation of the manager logic, but it

70 is parametrized by a few class attributes that have to be defined by

71 subclasses.

72 

73 Parameters 

74 ---------- 

75 db : `Database` 

76 Interface to the underlying database engine and namespace. 

77 collections : `CollectionManager` 

78 Manager object for the collections in this `Registry`. 

79 dimensions : `DimensionRecordStorageManager` 

80 Manager object for the dimensions in this `Registry`. 

81 static : `StaticDatasetTablesTuple` 

82 Named tuple of `sqlalchemy.schema.Table` instances for all static 

83 tables used by this class. 

84 summaries : `CollectionSummaryManager` 

85 Structure containing tables that summarize the contents of collections. 

86 """ 

87 

88 def __init__( 

89 self, 

90 *, 

91 db: Database, 

92 collections: CollectionManager, 

93 dimensions: DimensionRecordStorageManager, 

94 static: StaticDatasetTablesTuple, 

95 summaries: CollectionSummaryManager, 

96 registry_schema_version: VersionTuple | None = None, 

97 ): 

98 super().__init__(registry_schema_version=registry_schema_version) 

99 self._db = db 

100 self._collections = collections 

101 self._dimensions = dimensions 

102 self._static = static 

103 self._summaries = summaries 

104 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

105 self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

106 

107 @classmethod 

108 def initialize( 

109 cls, 

110 db: Database, 

111 context: StaticTablesContext, 

112 *, 

113 collections: CollectionManager, 

114 dimensions: DimensionRecordStorageManager, 

115 registry_schema_version: VersionTuple | None = None, 

116 ) -> DatasetRecordStorageManager: 

117 # Docstring inherited from DatasetRecordStorageManager. 

118 specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe) 

119 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

120 summaries = CollectionSummaryManager.initialize( 

121 db, 

122 context, 

123 collections=collections, 

124 dimensions=dimensions, 

125 ) 

126 return cls( 

127 db=db, 

128 collections=collections, 

129 dimensions=dimensions, 

130 static=static, 

131 summaries=summaries, 

132 registry_schema_version=registry_schema_version, 

133 ) 

134 

135 @classmethod 

136 def currentVersions(cls) -> list[VersionTuple]: 

137 # Docstring inherited from VersionedExtension. 

138 return [cls._version] 

139 

140 @classmethod 

141 def makeStaticTableSpecs( 

142 cls, collections: type[CollectionManager], universe: DimensionUniverse 

143 ) -> StaticDatasetTablesTuple: 

144 """Construct all static tables used by the classes in this package. 

145 

146 Static tables are those that are present in all Registries and do not 

147 depend on what DatasetTypes have been registered. 

148 

149 Parameters 

150 ---------- 

151 collections : `type` [ `CollectionManager` ]

152 Manager class (not instance) used for the collections in this `Registry`.

153 universe : `DimensionUniverse` 

154 Universe graph containing all dimensions known to this `Registry`. 

155 

156 Returns 

157 ------- 

158 specs : `StaticDatasetTablesTuple` 

159 A named tuple containing `ddl.TableSpec` instances. 

160 """ 

161 return makeStaticTableSpecs( 

162 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement 

163 ) 

164 

165 @classmethod 

166 def getIdColumnType(cls) -> type: 

167 # Docstring inherited from base class. 

168 return cls._idColumnType 

169 

170 @classmethod 

171 def addDatasetForeignKey( 

172 cls, 

173 tableSpec: ddl.TableSpec, 

174 *, 

175 name: str = "dataset", 

176 constraint: bool = True, 

177 onDelete: str | None = None, 

178 **kwargs: Any, 

179 ) -> ddl.FieldSpec: 

180 # Docstring inherited from DatasetRecordStorageManager. 

181 return addDatasetForeignKey( 

182 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

183 ) 

184 

185 def refresh(self) -> None: 

186 # Docstring inherited from DatasetRecordStorageManager. 

187 byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

188 byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

189 dataset_types: dict[int, DatasetType] = {} 

190 c = self._static.dataset_type.columns 

191 with self._db.query(self._static.dataset_type.select()) as sql_result: 

192 sql_rows = sql_result.mappings().fetchall() 

193 for row in sql_rows: 

194 name = row[c.name] 

195 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) 

196 calibTableName = row[c.calibration_association_table] 

197 datasetType = DatasetType( 

198 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) 

199 ) 

200 tags = self._db.getExistingTable( 

201 row[c.tag_association_table], 

202 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

203 ) 

204 if tags is None:    204 ↛ 205 (line 204 didn't jump to line 205, because the condition on line 204 was never true)

205 raise MissingDatabaseTableError( 

206 f"Table {row[c.tag_association_table]} is missing from database schema." 

207 ) 

208 if calibTableName is not None: 

209 calibs = self._db.getExistingTable( 

210 row[c.calibration_association_table], 

211 makeCalibTableSpec( 

212 datasetType, 

213 type(self._collections), 

214 self._db.getTimespanRepresentation(), 

215 self.getIdColumnType(), 

216 ), 

217 ) 

218 if calibs is None:    218 ↛ 219 (line 218 didn't jump to line 219, because the condition on line 218 was never true)

219 raise MissingDatabaseTableError( 

220 f"Table {row[c.calibration_association_table]} is missing from database schema." 

221 ) 

222 else: 

223 calibs = None 

224 storage = self._recordStorageType( 

225 db=self._db, 

226 datasetType=datasetType, 

227 static=self._static, 

228 summaries=self._summaries, 

229 tags=tags, 

230 calibs=calibs, 

231 dataset_type_id=row["id"], 

232 collections=self._collections, 

233 ) 

234 byName[datasetType.name] = storage 

235 byId[storage._dataset_type_id] = storage 

236 dataset_types[row["id"]] = datasetType 

237 self._byName = byName 

238 self._byId = byId 

239 self._summaries.refresh(dataset_types) 

240 

241 def remove(self, name: str) -> None: 

242 # Docstring inherited from DatasetRecordStorageManager. 

243 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

244 if componentName is not None: 

245 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

246 

247 # Delete the row 

248 try: 

249 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

250 except sqlalchemy.exc.IntegrityError as e: 

251 raise OrphanedRecordError( 

252 f"Dataset type {name} can not be removed." 

253 " It is associated with datasets that must be removed first." 

254 ) from e 

255 

256 # Now refresh everything -- removal is rare enough that this does 

257 # not need to be fast. 

258 self.refresh() 

259 

260 def find(self, name: str) -> DatasetRecordStorage | None: 

261 # Docstring inherited from DatasetRecordStorageManager. 

262 return self._byName.get(name) 

263 

264 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]: 

265 # Docstring inherited from DatasetRecordStorageManager. 

266 if datasetType.isComponent():    266 ↛ 267 (line 266 didn't jump to line 267, because the condition on line 266 was never true)

267 raise ValueError( 

268 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

269 ) 

270 storage = self._byName.get(datasetType.name) 

271 if storage is None: 

272 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) 

273 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

274 calibTableName = ( 

275 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

276 ) 

277 # The order is important here: we want to create the tables first and

278 # only register them if that operation succeeds. We cannot wrap this

279 # in a transaction because the database class assumes that DDL is not

280 # transaction-safe in general.

281 tags = self._db.ensureTableExists( 

282 tagTableName, 

283 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

284 ) 

285 if calibTableName is not None: 

286 calibs = self._db.ensureTableExists( 

287 calibTableName, 

288 makeCalibTableSpec( 

289 datasetType, 

290 type(self._collections), 

291 self._db.getTimespanRepresentation(), 

292 self.getIdColumnType(), 

293 ), 

294 ) 

295 else: 

296 calibs = None 

297 row, inserted = self._db.sync( 

298 self._static.dataset_type, 

299 keys={"name": datasetType.name}, 

300 compared={ 

301 "dimensions_key": dimensionsKey, 

302 # Force the storage class to be loaded to ensure it 

303 # exists and there is no typo in the name. 

304 "storage_class": datasetType.storageClass.name, 

305 }, 

306 extra={ 

307 "tag_association_table": tagTableName, 

308 "calibration_association_table": calibTableName, 

309 }, 

310 returning=["id", "tag_association_table"], 

311 ) 

312 assert row is not None 

313 storage = self._recordStorageType( 

314 db=self._db, 

315 datasetType=datasetType, 

316 static=self._static, 

317 summaries=self._summaries, 

318 tags=tags, 

319 calibs=calibs, 

320 dataset_type_id=row["id"], 

321 collections=self._collections, 

322 ) 

323 self._byName[datasetType.name] = storage 

324 self._byId[storage._dataset_type_id] = storage 

325 else: 

326 if datasetType != storage.datasetType: 

327 raise ConflictingDefinitionError( 

328 f"Given dataset type {datasetType} is inconsistent " 

329 f"with database definition {storage.datasetType}." 

330 ) 

331 inserted = False 

332 return storage, bool(inserted) 

333 

334 def resolve_wildcard( 

335 self, 

336 expression: Any, 

337 components: bool | None = None, 

338 missing: list[str] | None = None, 

339 explicit_only: bool = False, 

340 components_deprecated: bool = True, 

341 ) -> dict[DatasetType, list[str | None]]: 

342 wildcard = DatasetTypeWildcard.from_expression(expression) 

343 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set) 

344 # This message can be transformed into an error on DM-36303 after v26, 

345 # and the components and components_deprecated arguments can be merged 

346 # into one on DM-36457 after v27. 

347 deprecation_message = ( 

348 "Querying for component datasets via Registry query methods is deprecated in favor of using " 

349 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported " 

350 "after v26, and the components argument will be removed after v27." 

351 ) 

352 for name, dataset_type in wildcard.values.items(): 

353 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

354 if component_name is not None and components_deprecated: 

355 warnings.warn(deprecation_message, FutureWarning) 

356 if (found_storage := self.find(parent_name)) is not None: 

357 found_parent = found_storage.datasetType 

358 if component_name is not None: 

359 found = found_parent.makeComponentDatasetType(component_name) 

360 else: 

361 found = found_parent 

362 if dataset_type is not None: 

363 if dataset_type.is_compatible_with(found):    363 ↛ 371 (line 363 didn't jump to line 371, because the condition on line 363 was never false)

364 # Prefer the given dataset type to enable storage class 

365 # conversions. 

366 if component_name is not None: 

367 found_parent = dataset_type.makeCompositeDatasetType() 

368 else: 

369 found_parent = dataset_type 

370 else: 

371 raise DatasetTypeError( 

372 f"Dataset type definition in query expression {dataset_type} is " 

373 f"not compatible with the registered type {found}." 

374 ) 

375 result[found_parent].add(component_name) 

376 elif missing is not None: 

377 missing.append(name) 

378 already_warned = False 

379 if wildcard.patterns is Ellipsis: 

380 if explicit_only: 

381 raise TypeError( 

382 "Universal wildcard '...' is not permitted for dataset types in this context." 

383 ) 

384 for storage in self._byName.values(): 

385 result[storage.datasetType].add(None) 

386 if components: 

387 try: 

388 result[storage.datasetType].update( 

389 storage.datasetType.storageClass.allComponents().keys() 

390 ) 

391 if ( 

392 storage.datasetType.storageClass.allComponents() 

393 and not already_warned 

394 and components_deprecated 

395 ): 

396 warnings.warn(deprecation_message, FutureWarning) 

397 already_warned = True 

398 except KeyError as err: 

399 _LOG.warning( 

400 f"Could not load storage class {err} for {storage.datasetType.name}; " 

401 "if it has components they will not be included in query results.", 

402 ) 

403 elif wildcard.patterns: 

404 if explicit_only: 

405 # After v26 this should raise DatasetTypeExpressionError, to 

406 # be implemented on DM-36303. 

407 warnings.warn( 

408 "Passing wildcard patterns here is deprecated and will be prohibited after v26.", 

409 FutureWarning, 

410 ) 

411 for storage in self._byName.values(): 

412 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns): 

413 result[storage.datasetType].add(None) 

414 if components is not False: 

415 for storage in self._byName.values(): 

416 if components is None and storage.datasetType in result: 

417 continue 

418 try: 

419 components_for_parent = storage.datasetType.storageClass.allComponents().keys() 

420 except KeyError as err: 

421 _LOG.warning( 

422 f"Could not load storage class {err} for {storage.datasetType.name}; " 

423 "if it has components they will not be included in query results." 

424 ) 

425 continue 

426 for component_name in components_for_parent: 

427 if any( 

428 p.fullmatch( 

429 DatasetType.nameWithComponent(storage.datasetType.name, component_name) 

430 ) 

431 for p in wildcard.patterns 

432 ): 

433 result[storage.datasetType].add(component_name) 

434 if not already_warned and components_deprecated: 

435 warnings.warn(deprecation_message, FutureWarning) 

436 already_warned = True 

437 return {k: list(v) for k, v in result.items()} 

438 

439 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

440 # Docstring inherited from DatasetRecordStorageManager. 

441 sql = ( 

442 sqlalchemy.sql.select( 

443 self._static.dataset.columns.dataset_type_id, 

444 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

445 ) 

446 .select_from(self._static.dataset) 

447 .where(self._static.dataset.columns.id == id) 

448 ) 

449 with self._db.query(sql) as sql_result: 

450 row = sql_result.mappings().fetchone() 

451 if row is None: 

452 return None 

453 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

454 if recordsForType is None:    454 ↛ 455 (line 454 didn't jump to line 455, because the condition on line 454 was never true)

455 self.refresh() 

456 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

457 assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 

458 return DatasetRef( 

459 recordsForType.datasetType, 

460 dataId=recordsForType.getDataId(id=id), 

461 id=id, 

462 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

463 ) 

464 

465 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

466 # Docstring inherited from DatasetRecordStorageManager. 

467 return self._summaries.get(collection) 

468 

469 _version: VersionTuple 

470 """Schema version for this class.""" 

471 

472 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

473 """Type of the storage class returned by this manager.""" 

474 

475 _autoincrement: bool 

476 """If True then PK column of the dataset table is auto-increment.""" 

477 

478 _idColumnType: type 

479 """Type of dataset column used to store dataset ID.""" 

480 

481 

482 class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):

483 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

484 UUID for dataset primary key. 

485 """ 

486 

487 _version: VersionTuple = _VERSION_UUID 

488 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

489 _autoincrement: bool = False 

490 _idColumnType: type = ddl.GUID 

491 

492 @classmethod 

493 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

494 # Docstring inherited from DatasetRecordStorageManager. 

495 return True
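
The manager listed above is normally driven by `Registry` internals rather than by user code. As a rough, hedged sketch (not part of the covered module) of the registration and lookup flow its methods expose, assume `manager` has already been built by the registry via `ByDimensionsDatasetRecordStorageManagerUUID.initialize(...)` and `dataset_type` is a parent (non-component) `DatasetType`:

def register_and_lookup(manager, dataset_type):
    # register() creates the per-dimensions tag/calib tables on first use and
    # returns the storage object plus a flag saying whether a new row was inserted.
    storage, inserted = manager.register(dataset_type)

    # find() serves later lookups from the in-memory by-name cache populated above.
    assert manager.find(dataset_type.name) is storage

    # resolve_wildcard() maps a dataset-type expression to registered parent types.
    resolved = manager.resolve_wildcard(dataset_type.name, components=False)
    return storage, inserted, resolved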