Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 94%

197 statements  

coverage.py v6.5.0, created at 2023-03-23 02:06 -0700

1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",) 

4 

5import logging 

6import warnings 

7from collections import defaultdict 

8from typing import TYPE_CHECKING, Any 

9 

10import sqlalchemy 

11from lsst.utils.ellipsis import Ellipsis 

12 

13from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl 

14from ..._collection_summary import CollectionSummary 

15from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError 

16from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple 

17from ...wildcards import DatasetTypeWildcard 

18from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID 

19from .summaries import CollectionSummaryManager 

20from .tables import ( 

21 addDatasetForeignKey, 

22 makeCalibTableName, 

23 makeCalibTableSpec, 

24 makeStaticTableSpecs, 

25 makeTagTableName, 

26 makeTagTableSpec, 

27) 

28 

29if TYPE_CHECKING:    [29 ↛ 30] line 29 didn't jump to line 30, because the condition on line 29 was never true

30 from ...interfaces import ( 

31 CollectionManager, 

32 CollectionRecord, 

33 Database, 

34 DimensionRecordStorageManager, 

35 StaticTablesContext, 

36 ) 

37 from .tables import StaticDatasetTablesTuple 

38 

39 

40# This has to be updated on every schema change 

41_VERSION_INT = VersionTuple(1, 0, 0) 

42_VERSION_UUID = VersionTuple(1, 0, 0) 

43 

44_LOG = logging.getLogger(__name__) 

45 

46 

47class MissingDatabaseTableError(RuntimeError): 

48 """Exception raised when a table is not found in a database.""" 

49 

50 

51class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

52 """A manager class for datasets that uses one dataset-collection table for 

53 each group of dataset types that share the same dimensions. 

54 

55 In addition to the table organization, this class makes a number of 

56 other design choices that would have been cumbersome (to say the least) to 

57 try to pack into its name: 

58 

59 - It uses a private surrogate integer autoincrement field to identify 

60 dataset types, instead of using the name as the primary and foreign key 

61 directly. 

62 

63 - It aggressively loads all DatasetTypes into memory instead of fetching 

64 them from the database only when needed or attempting more clever forms 

65 of caching. 

66 

67 Alternative implementations that make different choices for these while 

68 keeping the same general table organization might be reasonable as well. 

69 

70 This class provides a complete implementation of the manager logic, but it

71 is parametrized by a few class attributes that have to be defined by

72 sub-classes.

73 

74 Parameters 

75 ---------- 

76 db : `Database` 

77 Interface to the underlying database engine and namespace. 

78 collections : `CollectionManager` 

79 Manager object for the collections in this `Registry`. 

80 dimensions : `DimensionRecordStorageManager` 

81 Manager object for the dimensions in this `Registry`. 

82 static : `StaticDatasetTablesTuple` 

83 Named tuple of `sqlalchemy.schema.Table` instances for all static 

84 tables used by this class. 

85 summaries : `CollectionSummaryManager` 

86 Structure containing tables that summarize the contents of collections. 

87 """ 

88 

89 def __init__( 

90 self, 

91 *, 

92 db: Database, 

93 collections: CollectionManager, 

94 dimensions: DimensionRecordStorageManager, 

95 static: StaticDatasetTablesTuple, 

96 summaries: CollectionSummaryManager, 

97 ): 

98 self._db = db 

99 self._collections = collections 

100 self._dimensions = dimensions 

101 self._static = static 

102 self._summaries = summaries 

103 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

104 self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

105 

106 @classmethod 

107 def initialize( 

108 cls, 

109 db: Database, 

110 context: StaticTablesContext, 

111 *, 

112 collections: CollectionManager, 

113 dimensions: DimensionRecordStorageManager, 

114 ) -> DatasetRecordStorageManager: 

115 # Docstring inherited from DatasetRecordStorageManager. 

116 specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe) 

117 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

118 summaries = CollectionSummaryManager.initialize( 

119 db, 

120 context, 

121 collections=collections, 

122 dimensions=dimensions, 

123 ) 

124 return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries) 

125 

126 @classmethod 

127 def currentVersion(cls) -> VersionTuple | None: 

128 # Docstring inherited from VersionedExtension. 

129 return cls._version 

130 

131 @classmethod 

132 def makeStaticTableSpecs( 

133 cls, collections: type[CollectionManager], universe: DimensionUniverse 

134 ) -> StaticDatasetTablesTuple: 

135 """Construct all static tables used by the classes in this package. 

136 

137 Static tables are those that are present in all Registries and do not 

138 depend on what DatasetTypes have been registered. 

139 

140 Parameters 

141 ---------- 

142 collections : `CollectionManager`

143 Manager object for the collections in this `Registry`. 

144 universe : `DimensionUniverse` 

145 Universe graph containing all dimensions known to this `Registry`. 

146 

147 Returns 

148 ------- 

149 specs : `StaticDatasetTablesTuple` 

150 A named tuple containing `ddl.TableSpec` instances. 

151 """ 

152 return makeStaticTableSpecs( 

153 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement 

154 ) 

155 

156 @classmethod 

157 def getIdColumnType(cls) -> type: 

158 # Docstring inherited from base class. 

159 return cls._idColumnType 

160 

161 @classmethod 

162 def addDatasetForeignKey( 

163 cls, 

164 tableSpec: ddl.TableSpec, 

165 *, 

166 name: str = "dataset", 

167 constraint: bool = True, 

168 onDelete: str | None = None, 

169 **kwargs: Any, 

170 ) -> ddl.FieldSpec: 

171 # Docstring inherited from DatasetRecordStorageManager. 

172 return addDatasetForeignKey( 

173 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

174 ) 

175 

176 def refresh(self) -> None: 

177 # Docstring inherited from DatasetRecordStorageManager. 

178 byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

179 byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

180 dataset_types: dict[int, DatasetType] = {} 

181 c = self._static.dataset_type.columns 

182 with self._db.query(self._static.dataset_type.select()) as sql_result: 

183 sql_rows = sql_result.mappings().fetchall() 

184 for row in sql_rows: 

185 name = row[c.name] 

186 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) 

187 calibTableName = row[c.calibration_association_table] 

188 datasetType = DatasetType( 

189 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) 

190 ) 

191 tags = self._db.getExistingTable( 

192 row[c.tag_association_table], 

193 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

194 ) 

195 if tags is None:    [195 ↛ 196] line 195 didn't jump to line 196, because the condition on line 195 was never true

196 raise MissingDatabaseTableError( 

197 f"Table {row[c.tag_association_table]} is missing from database schema." 

198 ) 

199 if calibTableName is not None: 

200 calibs = self._db.getExistingTable( 

201 row[c.calibration_association_table], 

202 makeCalibTableSpec( 

203 datasetType, 

204 type(self._collections), 

205 self._db.getTimespanRepresentation(), 

206 self.getIdColumnType(), 

207 ), 

208 ) 

209 if calibs is None:    [209 ↛ 210] line 209 didn't jump to line 210, because the condition on line 209 was never true

210 raise MissingDatabaseTableError( 

211 f"Table {row[c.calibration_association_table]} is missing from database schema." 

212 ) 

213 else: 

214 calibs = None 

215 storage = self._recordStorageType( 

216 db=self._db, 

217 datasetType=datasetType, 

218 static=self._static, 

219 summaries=self._summaries, 

220 tags=tags, 

221 calibs=calibs, 

222 dataset_type_id=row["id"], 

223 collections=self._collections, 

224 ) 

225 byName[datasetType.name] = storage 

226 byId[storage._dataset_type_id] = storage 

227 dataset_types[row["id"]] = datasetType 

228 self._byName = byName 

229 self._byId = byId 

230 self._summaries.refresh(dataset_types) 
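        # After a refresh, self._byName maps dataset type name -> storage and
        # self._byId maps the surrogate dataset_type id -> the same storage
        # objects, so find() and getDatasetRef() can normally resolve dataset
        # types without querying the dataset_type table again.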

231 

232 def remove(self, name: str) -> None: 

233 # Docstring inherited from DatasetRecordStorageManager. 

234 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

235 if componentName is not None: 

236 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

237 

238 # Delete the row 

239 try: 

240 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

241 except sqlalchemy.exc.IntegrityError as e: 

242 raise OrphanedRecordError( 

243 f"Dataset type {name} can not be removed." 

244 " It is associated with datasets that must be removed first." 

245 ) from e 

246 

247 # Now refresh everything -- removal is rare enough that this does 

248 # not need to be fast. 

249 self.refresh() 

250 

251 def find(self, name: str) -> DatasetRecordStorage | None: 

252 # Docstring inherited from DatasetRecordStorageManager. 

253 return self._byName.get(name) 

254 

255 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]: 

256 # Docstring inherited from DatasetRecordStorageManager. 

257 if datasetType.isComponent():    [257 ↛ 258] line 257 didn't jump to line 258, because the condition on line 257 was never true

258 raise ValueError( 

259 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

260 ) 

261 storage = self._byName.get(datasetType.name) 

262 if storage is None: 

263 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) 

264 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

265 calibTableName = ( 

266 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

267 ) 

268 # The order is important here: we want to create the tables first and

269 # only register them if that operation succeeds. We cannot wrap this

270 # in a transaction because the database class assumes that DDL is not

271 # transaction-safe in general.

272 tags = self._db.ensureTableExists( 

273 tagTableName, 

274 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

275 ) 

276 if calibTableName is not None: 

277 calibs = self._db.ensureTableExists( 

278 calibTableName, 

279 makeCalibTableSpec( 

280 datasetType, 

281 type(self._collections), 

282 self._db.getTimespanRepresentation(), 

283 self.getIdColumnType(), 

284 ), 

285 ) 

286 else: 

287 calibs = None 

288 row, inserted = self._db.sync( 

289 self._static.dataset_type, 

290 keys={"name": datasetType.name}, 

291 compared={ 

292 "dimensions_key": dimensionsKey, 

293 # Force the storage class to be loaded to ensure it 

294 # exists and there is no typo in the name. 

295 "storage_class": datasetType.storageClass.name, 

296 }, 

297 extra={ 

298 "tag_association_table": tagTableName, 

299 "calibration_association_table": calibTableName, 

300 }, 

301 returning=["id", "tag_association_table"], 

302 ) 

303 assert row is not None 

304 storage = self._recordStorageType( 

305 db=self._db, 

306 datasetType=datasetType, 

307 static=self._static, 

308 summaries=self._summaries, 

309 tags=tags, 

310 calibs=calibs, 

311 dataset_type_id=row["id"], 

312 collections=self._collections, 

313 ) 

314 self._byName[datasetType.name] = storage 

315 self._byId[storage._dataset_type_id] = storage 

316 else: 

317 if datasetType != storage.datasetType: 

318 raise ConflictingDefinitionError( 

319 f"Given dataset type {datasetType} is inconsistent " 

320 f"with database definition {storage.datasetType}." 

321 ) 

322 inserted = False 

323 return storage, bool(inserted) 
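        # `inserted` is True only when this call actually created a new
        # dataset_type row; registering a definition identical to an existing
        # one returns False, and a definition that disagrees with the stored
        # one is rejected.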

324 

325 def resolve_wildcard( 

326 self, 

327 expression: Any, 

328 components: bool | None = None, 

329 missing: list[str] | None = None, 

330 explicit_only: bool = False, 

331 components_deprecated: bool = True, 

332 ) -> dict[DatasetType, list[str | None]]: 

333 wildcard = DatasetTypeWildcard.from_expression(expression) 

334 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set) 
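        # Maps each resolved parent dataset type to the set of component names
        # requested for it; None stands for the parent (composite) dataset type
        # itself, e.g. {parent_type: {None, "wcs"}} (names illustrative).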

335 # This message can be transformed into an error on DM-36303 after v26, 

336 # and the components and components_deprecated arguments can be merged 

337 # into one on DM-36457 after v27. 

338 deprecation_message = ( 

339 "Querying for component datasets via Registry query methods is deprecated in favor of using " 

340 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported " 

341 "after v26, and the components argument will be removed after v27." 

342 ) 

343 for name, dataset_type in wildcard.values.items(): 

344 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

345 if component_name is not None and components_deprecated: 

346 warnings.warn(deprecation_message, FutureWarning) 

347 if (found_storage := self.find(parent_name)) is not None: 

348 found_parent = found_storage.datasetType 

349 if component_name is not None: 

350 found = found_parent.makeComponentDatasetType(component_name) 

351 else: 

352 found = found_parent 

353 if dataset_type is not None: 

354 if dataset_type.is_compatible_with(found):    [354 ↛ 362] line 354 didn't jump to line 362, because the condition on line 354 was never false

355 # Prefer the given dataset type to enable storage class 

356 # conversions. 

357 if component_name is not None: 

358 found_parent = dataset_type.makeCompositeDatasetType() 

359 else: 

360 found_parent = dataset_type 

361 else: 

362 raise DatasetTypeError( 

363 f"Dataset type definition in query expression {dataset_type} is " 

364 f"not compatible with the registered type {found}." 

365 ) 

366 result[found_parent].add(component_name) 

367 elif missing is not None: 

368 missing.append(name) 

369 already_warned = False 

370 if wildcard.patterns is Ellipsis: 

371 if explicit_only: 

372 raise TypeError( 

373 "Universal wildcard '...' is not permitted for dataset types in this context." 

374 ) 

375 for storage in self._byName.values(): 

376 result[storage.datasetType].add(None) 

377 if components: 

378 try: 

379 result[storage.datasetType].update( 

380 storage.datasetType.storageClass.allComponents().keys() 

381 ) 

382 if ( 

383 storage.datasetType.storageClass.allComponents() 

384 and not already_warned 

385 and components_deprecated 

386 ): 

387 warnings.warn(deprecation_message, FutureWarning) 

388 already_warned = True 

389 except KeyError as err: 

390 _LOG.warning( 

391 f"Could not load storage class {err} for {storage.datasetType.name}; " 

392 "if it has components they will not be included in query results.", 

393 ) 

394 elif wildcard.patterns: 

395 if explicit_only: 

396 # After v26 this should raise DatasetTypeExpressionError, to 

397 # be implemented on DM-36303. 

398 warnings.warn( 

399 "Passing wildcard patterns here is deprecated and will be prohibited after v26.", 

400 FutureWarning, 

401 ) 

402 for storage in self._byName.values(): 

403 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns): 

404 result[storage.datasetType].add(None) 

405 if components is not False: 

406 for storage in self._byName.values(): 

407 if components is None and storage.datasetType in result: 

408 continue 

409 try: 

410 components_for_parent = storage.datasetType.storageClass.allComponents().keys() 

411 except KeyError as err: 

412 _LOG.warning( 

413 f"Could not load storage class {err} for {storage.datasetType.name}; " 

414 "if it has components they will not be included in query results." 

415 ) 

416 continue 

417 for component_name in components_for_parent: 

418 if any( 

419 p.fullmatch( 

420 DatasetType.nameWithComponent(storage.datasetType.name, component_name) 

421 ) 

422 for p in wildcard.patterns 

423 ): 

424 result[storage.datasetType].add(component_name) 

425 if not already_warned and components_deprecated: 

426 warnings.warn(deprecation_message, FutureWarning) 

427 already_warned = True 

428 return {k: list(v) for k, v in result.items()} 

429 

430 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

431 # Docstring inherited from DatasetRecordStorageManager. 

432 sql = ( 

433 sqlalchemy.sql.select( 

434 self._static.dataset.columns.dataset_type_id, 

435 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

436 ) 

437 .select_from(self._static.dataset) 

438 .where(self._static.dataset.columns.id == id) 

439 ) 

440 with self._db.query(sql) as sql_result: 

441 row = sql_result.mappings().fetchone() 

442 if row is None: 

443 return None 

444 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

445 if recordsForType is None:    [445 ↛ 446] line 445 didn't jump to line 446, because the condition on line 445 was never true

446 self.refresh() 

447 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

448 assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 

449 return DatasetRef( 

450 recordsForType.datasetType, 

451 dataId=recordsForType.getDataId(id=id), 

452 id=id, 

453 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

454 ) 

455 

456 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

457 # Docstring inherited from DatasetRecordStorageManager. 

458 return self._summaries.get(collection) 

459 

460 _version: VersionTuple 

461 """Schema version for this class.""" 

462 

463 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

464 """Type of the storage class returned by this manager.""" 

465 

466 _autoincrement: bool 

467 """If True then PK column of the dataset table is auto-increment.""" 

468 

469 _idColumnType: type 

470 """Type of dataset column used to store dataset ID.""" 

471 

472 

473class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

474 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

475 UUID for dataset primary key. 

476 """ 

477 

478 _version: VersionTuple = _VERSION_UUID 

479 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

480 _autoincrement: bool = False 

481 _idColumnType: type = ddl.GUID 

482 

483 @classmethod 

484 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

485 # Docstring inherited from DatasetRecordStorageManager. 

486 return True