Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 94%

197 statements  

coverage.py v6.5.0, created at 2023-03-11 02:06 -0800

1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",) 

4 

5import logging 

6import warnings 

7from collections import defaultdict 

8from typing import TYPE_CHECKING, Any 

9 

10import sqlalchemy 

11from lsst.utils.ellipsis import Ellipsis 

12 

13from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl 

14from ..._collection_summary import CollectionSummary 

15from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError 

16from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple 

17from ...wildcards import DatasetTypeWildcard 

18from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID 

19from .summaries import CollectionSummaryManager 

20from .tables import ( 

21 addDatasetForeignKey, 

22 makeCalibTableName, 

23 makeCalibTableSpec, 

24 makeStaticTableSpecs, 

25 makeTagTableName, 

26 makeTagTableSpec, 

27) 

28 

29if TYPE_CHECKING:    [29 ↛ 30] line 29 didn't jump to line 30, because the condition on line 29 was never true

30 from ...interfaces import ( 

31 CollectionManager, 

32 CollectionRecord, 

33 Database, 

34 DimensionRecordStorageManager, 

35 StaticTablesContext, 

36 ) 

37 from .tables import StaticDatasetTablesTuple 

38 

39 

40# These version numbers have to be updated on every schema change.

41_VERSION_INT = VersionTuple(1, 0, 0) 

42_VERSION_UUID = VersionTuple(1, 0, 0) 

43 

44_LOG = logging.getLogger(__name__) 

45 

46 

47class MissingDatabaseTableError(RuntimeError): 

48 """Exception raised when a table is not found in a database.""" 

49 

50 

51class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

52 """A manager class for datasets that uses one dataset-collection table for 

53 each group of dataset types that share the same dimensions. 

54 

55 In addition to the table organization, this class makes a number of 

56 other design choices that would have been cumbersome (to say the least) to 

57 try to pack into its name: 

58 

59 - It uses a private surrogate integer autoincrement field to identify 

60 dataset types, instead of using the name as the primary and foreign key 

61 directly. 

62 

63 - It aggressively loads all DatasetTypes into memory instead of fetching 

64 them from the database only when needed or attempting more clever forms 

65 of caching. 

66 

67 Alternative implementations that make different choices for these while 

68 keeping the same general table organization might be reasonable as well. 

69 

70 This class provides a complete implementation of the manager logic, but it

71 is parametrized by a few class attributes that must be defined by

72 subclasses.

73 

74 Parameters 

75 ---------- 

76 db : `Database` 

77 Interface to the underlying database engine and namespace. 

78 collections : `CollectionManager` 

79 Manager object for the collections in this `Registry`. 

80 dimensions : `DimensionRecordStorageManager` 

81 Manager object for the dimensions in this `Registry`. 

82 static : `StaticDatasetTablesTuple` 

83 Named tuple of `sqlalchemy.schema.Table` instances for all static 

84 tables used by this class. 

85 summaries : `CollectionSummaryManager` 

86 Structure containing tables that summarize the contents of collections. 

87 """ 

88 

89 def __init__( 

90 self, 

91 *, 

92 db: Database, 

93 collections: CollectionManager, 

94 dimensions: DimensionRecordStorageManager, 

95 static: StaticDatasetTablesTuple, 

96 summaries: CollectionSummaryManager, 

97 ): 

98 self._db = db 

99 self._collections = collections 

100 self._dimensions = dimensions 

101 self._static = static 

102 self._summaries = summaries 

103 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

104 self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

105 

106 @classmethod 

107 def initialize( 

108 cls, 

109 db: Database, 

110 context: StaticTablesContext, 

111 *, 

112 collections: CollectionManager, 

113 dimensions: DimensionRecordStorageManager, 

114 ) -> DatasetRecordStorageManager: 

115 # Docstring inherited from DatasetRecordStorageManager. 

116 specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe) 

117 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

118 summaries = CollectionSummaryManager.initialize( 

119 db, 

120 context, 

121 collections=collections, 

122 dimensions=dimensions, 

123 ) 

124 return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries) 
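
# A hedged construction sketch: the enclosing Registry normally calls
# ``initialize`` while declaring its static tables; the variable names below
# are illustrative, not defined in this module.
#
#     with db.declareStaticTables(create=True) as context:
#         datasets = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#             db, context, collections=collections, dimensions=dimensions
#         )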

125 

126 @classmethod 

127 def currentVersion(cls) -> VersionTuple | None: 

128 # Docstring inherited from VersionedExtension. 

129 return cls._version 

130 

131 @classmethod 

132 def makeStaticTableSpecs( 

133 cls, collections: type[CollectionManager], universe: DimensionUniverse 

134 ) -> StaticDatasetTablesTuple: 

135 """Construct specifications for all static tables used by this package.

136 

137 Static tables are those that are present in all Registries and do not 

138 depend on what DatasetTypes have been registered. 

139 

140 Parameters 

141 ---------- 

142 collections : `type` [ `CollectionManager` ]

143 Manager class for the collections in this `Registry`.

144 universe : `DimensionUniverse` 

145 Universe graph containing all dimensions known to this `Registry`. 

146 

147 Returns 

148 ------- 

149 specs : `StaticDatasetTablesTuple` 

150 A named tuple containing `ddl.TableSpec` instances. 

151 """ 

152 return makeStaticTableSpecs( 

153 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement 

154 ) 

155 

156 @classmethod 

157 def getIdColumnType(cls) -> type: 

158 # Docstring inherited from base class. 

159 return cls._idColumnType 

160 

161 @classmethod 

162 def addDatasetForeignKey( 

163 cls, 

164 tableSpec: ddl.TableSpec, 

165 *, 

166 name: str = "dataset", 

167 constraint: bool = True, 

168 onDelete: str | None = None, 

169 **kwargs: Any, 

170 ) -> ddl.FieldSpec: 

171 # Docstring inherited from DatasetRecordStorageManager. 

172 return addDatasetForeignKey( 

173 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

174 ) 
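
# A hedged sketch of giving another table a foreign key into the dataset
# table; ``example_spec`` is illustrative and not part of this module.
#
#     example_spec = ddl.TableSpec(fields=[])
#     dataset_id_field = ByDimensionsDatasetRecordStorageManagerUUID.addDatasetForeignKey(
#         example_spec, onDelete="CASCADE"
#     )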

175 

176 def refresh(self) -> None: 

177 # Docstring inherited from DatasetRecordStorageManager. 

178 byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

179 byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

180 c = self._static.dataset_type.columns 

181 with self._db.query(self._static.dataset_type.select()) as sql_result: 

182 sql_rows = sql_result.mappings().fetchall() 
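
# Each row of the static dataset_type table describes one registered dataset
# type and names its dynamic "tags" and (for calibrations) "calibs"
# association tables, which are looked up by name below.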

183 for row in sql_rows: 

184 name = row[c.name] 

185 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) 

186 calibTableName = row[c.calibration_association_table] 

187 datasetType = DatasetType( 

188 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) 

189 ) 

190 tags = self._db.getExistingTable( 

191 row[c.tag_association_table], 

192 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

193 ) 

194 if tags is None:    [194 ↛ 195] line 194 didn't jump to line 195, because the condition on line 194 was never true

195 raise MissingDatabaseTableError( 

196 f"Table {row[c.tag_association_table]} is missing from database schema." 

197 ) 

198 if calibTableName is not None: 

199 calibs = self._db.getExistingTable( 

200 row[c.calibration_association_table], 

201 makeCalibTableSpec( 

202 datasetType, 

203 type(self._collections), 

204 self._db.getTimespanRepresentation(), 

205 self.getIdColumnType(), 

206 ), 

207 ) 

208 if calibs is None:    [208 ↛ 209] line 208 didn't jump to line 209, because the condition on line 208 was never true

209 raise MissingDatabaseTableError( 

210 f"Table {row[c.calibration_association_table]} is missing from database schema." 

211 ) 

212 else: 

213 calibs = None 

214 storage = self._recordStorageType( 

215 db=self._db, 

216 datasetType=datasetType, 

217 static=self._static, 

218 summaries=self._summaries, 

219 tags=tags, 

220 calibs=calibs, 

221 dataset_type_id=row["id"], 

222 collections=self._collections, 

223 ) 

224 byName[datasetType.name] = storage 

225 byId[storage._dataset_type_id] = storage 

226 self._byName = byName 

227 self._byId = byId 

228 self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType) 

229 

230 def remove(self, name: str) -> None: 

231 # Docstring inherited from DatasetRecordStorageManager. 

232 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

233 if componentName is not None: 

234 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

235 

236 # Delete the row 

237 try: 

238 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

239 except sqlalchemy.exc.IntegrityError as e: 

240 raise OrphanedRecordError( 

241 f"Dataset type {name} can not be removed." 

242 " It is associated with datasets that must be removed first." 

243 ) from e 

244 

245 # Now refresh everything -- removal is rare enough that this does 

246 # not need to be fast. 

247 self.refresh() 

248 

249 def find(self, name: str) -> DatasetRecordStorage | None: 

250 # Docstring inherited from DatasetRecordStorageManager. 

251 return self._byName.get(name) 

252 

253 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]: 

254 # Docstring inherited from DatasetRecordStorageManager. 

255 if datasetType.isComponent():    [255 ↛ 256] line 255 didn't jump to line 256, because the condition on line 255 was never true

256 raise ValueError( 

257 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

258 ) 

259 storage = self._byName.get(datasetType.name) 

260 if storage is None: 

261 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) 

262 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

263 calibTableName = ( 

264 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

265 ) 

266 # The order is important here: we want to create the tables first and

267 # only register them if that operation succeeds. We cannot wrap this in

268 # a transaction because the database class assumes that DDL is not

269 # transaction-safe in general.

270 tags = self._db.ensureTableExists( 

271 tagTableName, 

272 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

273 ) 

274 if calibTableName is not None: 

275 calibs = self._db.ensureTableExists( 

276 calibTableName, 

277 makeCalibTableSpec( 

278 datasetType, 

279 type(self._collections), 

280 self._db.getTimespanRepresentation(), 

281 self.getIdColumnType(), 

282 ), 

283 ) 

284 else: 

285 calibs = None 

286 row, inserted = self._db.sync( 

287 self._static.dataset_type, 

288 keys={"name": datasetType.name}, 

289 compared={ 

290 "dimensions_key": dimensionsKey, 

291 # Force the storage class to be loaded to ensure it 

292 # exists and there is no typo in the name. 

293 "storage_class": datasetType.storageClass.name, 

294 }, 

295 extra={ 

296 "tag_association_table": tagTableName, 

297 "calibration_association_table": calibTableName, 

298 }, 

299 returning=["id", "tag_association_table"], 

300 ) 

301 assert row is not None 

302 storage = self._recordStorageType( 

303 db=self._db, 

304 datasetType=datasetType, 

305 static=self._static, 

306 summaries=self._summaries, 

307 tags=tags, 

308 calibs=calibs, 

309 dataset_type_id=row["id"], 

310 collections=self._collections, 

311 ) 

312 self._byName[datasetType.name] = storage 

313 self._byId[storage._dataset_type_id] = storage 

314 else: 

315 if datasetType != storage.datasetType: 

316 raise ConflictingDefinitionError( 

317 f"Given dataset type {datasetType} is inconsistent " 

318 f"with database definition {storage.datasetType}." 

319 ) 

320 inserted = False 

321 return storage, bool(inserted) 
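
# A hedged usage sketch; ``manager`` and the dataset type are illustrative,
# and ``inserted`` is True only when the dataset type was newly registered:
#
#     storage, inserted = manager.register(
#         DatasetType("raw", dimensions, "Exposure")
#     )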

322 

323 def resolve_wildcard( 

324 self, 

325 expression: Any, 

326 components: bool | None = None, 

327 missing: list[str] | None = None, 

328 explicit_only: bool = False, 

329 components_deprecated: bool = True, 

330 ) -> dict[DatasetType, list[str | None]]: 

331 wildcard = DatasetTypeWildcard.from_expression(expression) 

332 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set) 

333 # This message can be transformed into an error on DM-36303 after v26, 

334 # and the components and components_deprecated arguments can be merged 

335 # into one on DM-36457 after v27. 

336 deprecation_message = ( 

337 "Querying for component datasets via Registry query methods is deprecated in favor of using " 

338 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported " 

339 "after v26, and the components argument will be removed after v27." 

340 ) 

341 for name, dataset_type in wildcard.values.items(): 

342 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

343 if component_name is not None and components_deprecated: 

344 warnings.warn(deprecation_message, FutureWarning) 

345 if (found_storage := self.find(parent_name)) is not None: 

346 found_parent = found_storage.datasetType 

347 if component_name is not None: 

348 found = found_parent.makeComponentDatasetType(component_name) 

349 else: 

350 found = found_parent 

351 if dataset_type is not None: 

352 if dataset_type.is_compatible_with(found):    [352 ↛ 360] line 352 didn't jump to line 360, because the condition on line 352 was never false

353 # Prefer the given dataset type to enable storage class 

354 # conversions. 

355 if component_name is not None: 

356 found_parent = dataset_type.makeCompositeDatasetType() 

357 else: 

358 found_parent = dataset_type 

359 else: 

360 raise DatasetTypeError( 

361 f"Dataset type definition in query expression {dataset_type} is " 

362 f"not compatible with the registered type {found}." 

363 ) 

364 result[found_parent].add(component_name) 

365 elif missing is not None: 

366 missing.append(name) 

367 already_warned = False 

368 if wildcard.patterns is Ellipsis: 

369 if explicit_only: 

370 raise TypeError( 

371 "Universal wildcard '...' is not permitted for dataset types in this context." 

372 ) 

373 for storage in self._byName.values(): 

374 result[storage.datasetType].add(None) 

375 if components: 

376 try: 

377 result[storage.datasetType].update( 

378 storage.datasetType.storageClass.allComponents().keys() 

379 ) 

380 if ( 

381 storage.datasetType.storageClass.allComponents() 

382 and not already_warned 

383 and components_deprecated 

384 ): 

385 warnings.warn(deprecation_message, FutureWarning) 

386 already_warned = True 

387 except KeyError as err: 

388 _LOG.warning( 

389 f"Could not load storage class {err} for {storage.datasetType.name}; " 

390 "if it has components they will not be included in query results.", 

391 ) 

392 elif wildcard.patterns: 

393 if explicit_only: 

394 # After v26 this should raise DatasetTypeExpressionError, to 

395 # be implemented on DM-36303. 

396 warnings.warn( 

397 "Passing wildcard patterns here is deprecated and will be prohibited after v26.", 

398 FutureWarning, 

399 ) 

400 for storage in self._byName.values(): 

401 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns): 

402 result[storage.datasetType].add(None) 

403 if components is not False: 

404 for storage in self._byName.values(): 

405 if components is None and storage.datasetType in result: 

406 continue 

407 try: 

408 components_for_parent = storage.datasetType.storageClass.allComponents().keys() 

409 except KeyError as err: 

410 _LOG.warning( 

411 f"Could not load storage class {err} for {storage.datasetType.name}; " 

412 "if it has components they will not be included in query results." 

413 ) 

414 continue 

415 for component_name in components_for_parent: 

416 if any( 

417 p.fullmatch( 

418 DatasetType.nameWithComponent(storage.datasetType.name, component_name) 

419 ) 

420 for p in wildcard.patterns 

421 ): 

422 result[storage.datasetType].add(component_name) 

423 if not already_warned and components_deprecated: 

424 warnings.warn(deprecation_message, FutureWarning) 

425 already_warned = True 

426 return {k: list(v) for k, v in result.items()} 
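
# The mapping returned above associates each matched parent dataset type with
# the component names requested for it; a None entry stands for the parent
# (composite) dataset type itself.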

427 

428 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

429 # Docstring inherited from DatasetRecordStorageManager. 

430 sql = ( 

431 sqlalchemy.sql.select( 

432 self._static.dataset.columns.dataset_type_id, 

433 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

434 ) 

435 .select_from(self._static.dataset) 

436 .where(self._static.dataset.columns.id == id) 

437 ) 

438 with self._db.query(sql) as sql_result: 

439 row = sql_result.mappings().fetchone() 

440 if row is None: 

441 return None 

442 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

443 if recordsForType is None:    [443 ↛ 444] line 443 didn't jump to line 444, because the condition on line 443 was never true

444 self.refresh() 

445 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

446 assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 

447 return DatasetRef( 

448 recordsForType.datasetType, 

449 dataId=recordsForType.getDataId(id=id), 

450 id=id, 

451 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

452 ) 
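
# A hedged usage sketch; ``manager`` and the UUID value are illustrative:
#
#     ref = manager.getDatasetRef(uuid.UUID("00000000-0000-0000-0000-000000000000"))
#     if ref is None:
#         ...  # no dataset with this ID is known to the registry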

453 

454 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

455 # Docstring inherited from DatasetRecordStorageManager. 

456 return self._summaries.get(collection) 

457 

458 def schemaDigest(self) -> str | None: 

459 # Docstring inherited from VersionedExtension. 

460 return self._defaultSchemaDigest(self._static, self._db.dialect) 

461 

462 _version: VersionTuple 

463 """Schema version for this class.""" 

464 

465 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

466 """Type of the storage class returned by this manager.""" 

467 

468 _autoincrement: bool 

469 """If `True` then the primary key column of the dataset table is auto-increment."""

470 

471 _idColumnType: type 

472 """Type of dataset column used to store dataset ID.""" 

473 

474 

475class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

476 """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that

477 uses UUID for the dataset primary key.

478 """ 

479 

480 _version: VersionTuple = _VERSION_UUID 

481 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

482 _autoincrement: bool = False 

483 _idColumnType: type = ddl.GUID 

484 

485 @classmethod 

486 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

487 # Docstring inherited from DatasetRecordStorageManager. 

488 return True