Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 95%

204 statements  

coverage.py v7.2.7, created at 2023-06-07 02:10 -0700

1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",) 

4 

5import logging 

6import warnings 

7from collections import defaultdict 

8from typing import TYPE_CHECKING, Any 

9 

10import sqlalchemy 

11 

12from ....core import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType, DimensionUniverse, ddl 

13from ..._collection_summary import CollectionSummary 

14from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError 

15from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple 

16from ...wildcards import DatasetTypeWildcard 

17from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID 

18from .summaries import CollectionSummaryManager 

19from .tables import ( 

20 addDatasetForeignKey, 

21 makeCalibTableName, 

22 makeCalibTableSpec, 

23 makeStaticTableSpecs, 

24 makeTagTableName, 

25 makeTagTableSpec, 

26) 

27 

28if TYPE_CHECKING: 

29 from ...interfaces import ( 

30 CollectionManager, 

31 CollectionRecord, 

32 Database, 

33 DimensionRecordStorageManager, 

34 StaticTablesContext, 

35 ) 

36 from .tables import StaticDatasetTablesTuple 

37 

38 

39# This has to be updated on every schema change 

40_VERSION_UUID = VersionTuple(1, 0, 0) 

41# Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead 

42# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of 

43# the client migration period. 

44_VERSION_UUID_NS = VersionTuple(2, 0, 0) 

45 

46_LOG = logging.getLogger(__name__) 

47 

48 

49class MissingDatabaseTableError(RuntimeError): 

50 """Exception raised when a table is not found in a database.""" 

51 

52 

53class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

54 """A manager class for datasets that uses one dataset-collection table for 

55 each group of dataset types that share the same dimensions. 

56 

57 In addition to the table organization, this class makes a number of 

58 other design choices that would have been cumbersome (to say the least) to 

59 try to pack into its name: 

60 

61 - It uses a private surrogate integer autoincrement field to identify 

62 dataset types, instead of using the name as the primary and foreign key 

63 directly. 

64 

65 - It aggressively loads all DatasetTypes into memory instead of fetching 

66 them from the database only when needed or attempting more clever forms 

67 of caching. 

68 

69 Alternative implementations that make different choices for these while 

70 keeping the same general table organization might be reasonable as well. 

71 

72 This class provides a complete implementation of the manager logic, but it is 

73 parametrized by a few class attributes that must be defined by 

74 subclasses. 

75 

76 Parameters 

77 ---------- 

78 db : `Database` 

79 Interface to the underlying database engine and namespace. 

80 collections : `CollectionManager` 

81 Manager object for the collections in this `Registry`. 

82 dimensions : `DimensionRecordStorageManager` 

83 Manager object for the dimensions in this `Registry`. 

84 static : `StaticDatasetTablesTuple` 

85 Named tuple of `sqlalchemy.schema.Table` instances for all static 

86 tables used by this class. 

87 summaries : `CollectionSummaryManager` 

88 Structure containing tables that summarize the contents of collections. 

89 """ 

90 

91 def __init__( 

92 self, 

93 *, 

94 db: Database, 

95 collections: CollectionManager, 

96 dimensions: DimensionRecordStorageManager, 

97 static: StaticDatasetTablesTuple, 

98 summaries: CollectionSummaryManager, 

99 registry_schema_version: VersionTuple | None = None, 

100 ): 

101 super().__init__(registry_schema_version=registry_schema_version) 

102 self._db = db 

103 self._collections = collections 

104 self._dimensions = dimensions 

105 self._static = static 

106 self._summaries = summaries 

107 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

108 self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

109 

110 @classmethod 

111 def initialize( 

112 cls, 

113 db: Database, 

114 context: StaticTablesContext, 

115 *, 

116 collections: CollectionManager, 

117 dimensions: DimensionRecordStorageManager, 

118 registry_schema_version: VersionTuple | None = None, 

119 ) -> DatasetRecordStorageManager: 

120 # Docstring inherited from DatasetRecordStorageManager. 

121 specs = cls.makeStaticTableSpecs( 

122 type(collections), universe=dimensions.universe, schema_version=registry_schema_version 

123 ) 

124 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

125 summaries = CollectionSummaryManager.initialize( 

126 db, 

127 context, 

128 collections=collections, 

129 dimensions=dimensions, 

130 ) 

131 return cls( 

132 db=db, 

133 collections=collections, 

134 dimensions=dimensions, 

135 static=static, 

136 summaries=summaries, 

137 registry_schema_version=registry_schema_version, 

138 ) 

139 

140 @classmethod 

141 def currentVersions(cls) -> list[VersionTuple]: 

142 # Docstring inherited from VersionedExtension. 

143 return cls._versions 

144 

145 @classmethod 

146 def makeStaticTableSpecs( 

147 cls, 

148 collections: type[CollectionManager], 

149 universe: DimensionUniverse, 

150 schema_version: VersionTuple | None, 

151 ) -> StaticDatasetTablesTuple: 

152 """Construct all static tables used by the classes in this package. 

153 

154 Static tables are those that are present in all Registries and do not 

155 depend on what DatasetTypes have been registered. 

156 

157 Parameters 

158 ---------- 

159 collections : `type` [`CollectionManager`] 

160 Manager class for the collections in this `Registry`. 

161 universe : `DimensionUniverse` 

162 Universe graph containing all dimensions known to this `Registry`. 

163 schema_version : `VersionTuple` or `None` 

164 Version of the schema that should be created; if `None`, the 

165 default schema version is used. 

166 

167 Returns 

168 ------- 

169 specs : `StaticDatasetTablesTuple` 

170 A named tuple containing `ddl.TableSpec` instances. 

171 """ 

172 schema_version = cls.clsNewSchemaVersion(schema_version) 

173 assert schema_version is not None, "New schema version cannot be None" 

174 return makeStaticTableSpecs( 

175 collections, 

176 universe=universe, 

177 dtype=cls.getIdColumnType(), 

178 autoincrement=cls._autoincrement, 

179 schema_version=schema_version, 

180 ) 

181 

182 @classmethod 

183 def getIdColumnType(cls) -> type: 

184 # Docstring inherited from base class. 

185 return cls._idColumnType 

186 

187 @classmethod 

188 def addDatasetForeignKey( 

189 cls, 

190 tableSpec: ddl.TableSpec, 

191 *, 

192 name: str = "dataset", 

193 constraint: bool = True, 

194 onDelete: str | None = None, 

195 **kwargs: Any, 

196 ) -> ddl.FieldSpec: 

197 # Docstring inherited from DatasetRecordStorageManager. 

198 return addDatasetForeignKey( 

199 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

200 ) 
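
# A minimal sketch, using hypothetical names, of how another manager might use
# addDatasetForeignKey() to reference datasets from its own table spec: the
# helper appends a dataset-id field (typically "dataset_id") of this manager's
# ID column type and, when constraint=True, a matching foreign-key constraint.
#
#     spec = ddl.TableSpec(fields=[])
#     fk_field = ByDimensionsDatasetRecordStorageManagerUUID.addDatasetForeignKey(
#         spec, onDelete="CASCADE"
#     )
#     assert fk_field.dtype is ddl.GUID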

201 

202 def refresh(self) -> None: 

203 # Docstring inherited from DatasetRecordStorageManager. 

204 byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

205 byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

206 dataset_types: dict[int, DatasetType] = {} 

207 c = self._static.dataset_type.columns 

208 with self._db.query(self._static.dataset_type.select()) as sql_result: 

209 sql_rows = sql_result.mappings().fetchall() 

210 for row in sql_rows: 

211 name = row[c.name] 

212 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) 

213 calibTableName = row[c.calibration_association_table] 

214 datasetType = DatasetType( 

215 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) 

216 ) 

217 tags = self._db.getExistingTable( 

218 row[c.tag_association_table], 

219 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

220 ) 

221 if tags is None:    [221 ↛ 222] line 221 didn't jump to line 222, because the condition on line 221 was never true

222 raise MissingDatabaseTableError( 

223 f"Table {row[c.tag_association_table]} is missing from database schema." 

224 ) 

225 if calibTableName is not None: 

226 calibs = self._db.getExistingTable( 

227 row[c.calibration_association_table], 

228 makeCalibTableSpec( 

229 datasetType, 

230 type(self._collections), 

231 self._db.getTimespanRepresentation(), 

232 self.getIdColumnType(), 

233 ), 

234 ) 

235 if calibs is None:    [235 ↛ 236] line 235 didn't jump to line 236, because the condition on line 235 was never true

236 raise MissingDatabaseTableError( 

237 f"Table {row[c.calibration_association_table]} is missing from database schema." 

238 ) 

239 else: 

240 calibs = None 

241 storage = self._recordStorageType( 

242 db=self._db, 

243 datasetType=datasetType, 

244 static=self._static, 

245 summaries=self._summaries, 

246 tags=tags, 

247 calibs=calibs, 

248 dataset_type_id=row["id"], 

249 collections=self._collections, 

250 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, 

251 ) 

252 byName[datasetType.name] = storage 

253 byId[storage._dataset_type_id] = storage 

254 dataset_types[row["id"]] = datasetType 

255 self._byName = byName 

256 self._byId = byId 

257 self._summaries.refresh(dataset_types) 
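
# In brief, the refresh() flow above: read every row of the static dataset_type
# table, rebuild a record-storage object for each one (re-attaching its existing
# tag/calib tables and failing loudly if either table has gone missing), and
# only then swap in the new by-name and by-id lookup dicts and refresh the
# collection summaries.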

258 

259 def remove(self, name: str) -> None: 

260 # Docstring inherited from DatasetRecordStorageManager. 

261 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

262 if componentName is not None: 

263 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

264 

265 # Delete the row 

266 try: 

267 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

268 except sqlalchemy.exc.IntegrityError as e: 

269 raise OrphanedRecordError( 

270 f"Dataset type {name} can not be removed." 

271 " It is associated with datasets that must be removed first." 

272 ) from e 

273 

274 # Now refresh everything -- removal is rare enough that this does 

275 # not need to be fast. 

276 self.refresh() 

277 

278 def find(self, name: str) -> DatasetRecordStorage | None: 

279 # Docstring inherited from DatasetRecordStorageManager. 

280 return self._byName.get(name) 

281 

282 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]: 

283 # Docstring inherited from DatasetRecordStorageManager. 

284 if datasetType.isComponent():    [284 ↛ 285] line 284 didn't jump to line 285, because the condition on line 284 was never true

285 raise ValueError( 

286 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

287 ) 

288 storage = self._byName.get(datasetType.name) 

289 if storage is None: 

290 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) 

291 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

292 calibTableName = ( 

293 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

294 ) 

295 # The order is important here: we want to create the tables first and 

296 # only register them if that operation succeeds. We cannot wrap 

297 # this in a transaction because the database class assumes that 

298 # DDL is not transaction-safe in general. 

299 tags = self._db.ensureTableExists( 

300 tagTableName, 

301 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

302 ) 

303 if calibTableName is not None: 

304 calibs = self._db.ensureTableExists( 

305 calibTableName, 

306 makeCalibTableSpec( 

307 datasetType, 

308 type(self._collections), 

309 self._db.getTimespanRepresentation(), 

310 self.getIdColumnType(), 

311 ), 

312 ) 

313 else: 

314 calibs = None 

315 row, inserted = self._db.sync( 

316 self._static.dataset_type, 

317 keys={"name": datasetType.name}, 

318 compared={ 

319 "dimensions_key": dimensionsKey, 

320 # Force the storage class to be loaded to ensure it 

321 # exists and there is no typo in the name. 

322 "storage_class": datasetType.storageClass.name, 

323 }, 

324 extra={ 

325 "tag_association_table": tagTableName, 

326 "calibration_association_table": calibTableName, 

327 }, 

328 returning=["id", "tag_association_table"], 

329 ) 

330 assert row is not None 

331 storage = self._recordStorageType( 

332 db=self._db, 

333 datasetType=datasetType, 

334 static=self._static, 

335 summaries=self._summaries, 

336 tags=tags, 

337 calibs=calibs, 

338 dataset_type_id=row["id"], 

339 collections=self._collections, 

340 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, 

341 ) 

342 self._byName[datasetType.name] = storage 

343 self._byId[storage._dataset_type_id] = storage 

344 else: 

345 if datasetType != storage.datasetType: 

346 raise ConflictingDefinitionError( 

347 f"Given dataset type {datasetType} is inconsistent " 

348 f"with database definition {storage.datasetType}." 

349 ) 

350 inserted = False 

351 return storage, bool(inserted) 
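
# A hedged usage sketch of the register() contract above (the dataset type,
# dimensions, and storage-class names are made up for illustration): the first
# call creates the dynamic tag/calib tables and reports insertion, a repeated
# identical call returns the cached storage without inserting, and a
# conflicting redefinition raises ConflictingDefinitionError.
#
#     dataset_type = DatasetType("raw", dimensions, "Exposure")
#     storage, inserted = manager.register(dataset_type)   # inserted is True
#     storage, inserted = manager.register(dataset_type)   # inserted is False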

352 

353 def resolve_wildcard( 

354 self, 

355 expression: Any, 

356 components: bool | None = None, 

357 missing: list[str] | None = None, 

358 explicit_only: bool = False, 

359 components_deprecated: bool = True, 

360 ) -> dict[DatasetType, list[str | None]]: 

361 wildcard = DatasetTypeWildcard.from_expression(expression) 

362 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set) 

363 # This message can be transformed into an error on DM-36303 after v26, 

364 # and the components and components_deprecated arguments can be merged 

365 # into one on DM-36457 after v27. 

366 deprecation_message = ( 

367 "Querying for component datasets via Registry query methods is deprecated in favor of using " 

368 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported " 

369 "after v26, and the components argument will be removed after v27." 

370 ) 

371 for name, dataset_type in wildcard.values.items(): 

372 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

373 if component_name is not None and components_deprecated: 

374 warnings.warn(deprecation_message, FutureWarning) 

375 if (found_storage := self.find(parent_name)) is not None: 

376 found_parent = found_storage.datasetType 

377 if component_name is not None: 

378 found = found_parent.makeComponentDatasetType(component_name) 

379 else: 

380 found = found_parent 

381 if dataset_type is not None: 

382 if dataset_type.is_compatible_with(found):    [382 ↛ 390] line 382 didn't jump to line 390, because the condition on line 382 was never false

383 # Prefer the given dataset type to enable storage class 

384 # conversions. 

385 if component_name is not None: 

386 found_parent = dataset_type.makeCompositeDatasetType() 

387 else: 

388 found_parent = dataset_type 

389 else: 

390 raise DatasetTypeError( 

391 f"Dataset type definition in query expression {dataset_type} is " 

392 f"not compatible with the registered type {found}." 

393 ) 

394 result[found_parent].add(component_name) 

395 elif missing is not None: 

396 missing.append(name) 

397 already_warned = False 

398 if wildcard.patterns is ...: 

399 if explicit_only: 

400 raise TypeError( 

401 "Universal wildcard '...' is not permitted for dataset types in this context." 

402 ) 

403 for storage in self._byName.values(): 

404 result[storage.datasetType].add(None) 

405 if components: 

406 try: 

407 result[storage.datasetType].update( 

408 storage.datasetType.storageClass.allComponents().keys() 

409 ) 

410 if ( 

411 storage.datasetType.storageClass.allComponents() 

412 and not already_warned 

413 and components_deprecated 

414 ): 

415 warnings.warn(deprecation_message, FutureWarning) 

416 already_warned = True 

417 except KeyError as err: 

418 _LOG.warning( 

419 f"Could not load storage class {err} for {storage.datasetType.name}; " 

420 "if it has components they will not be included in query results.", 

421 ) 

422 elif wildcard.patterns: 

423 if explicit_only: 

424 # After v26 this should raise DatasetTypeExpressionError, to 

425 # be implemented on DM-36303. 

426 warnings.warn( 

427 "Passing wildcard patterns here is deprecated and will be prohibited after v26.", 

428 FutureWarning, 

429 ) 

430 for storage in self._byName.values(): 

431 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns): 

432 result[storage.datasetType].add(None) 

433 if components is not False: 

434 for storage in self._byName.values(): 

435 if components is None and storage.datasetType in result: 

436 continue 

437 try: 

438 components_for_parent = storage.datasetType.storageClass.allComponents().keys() 

439 except KeyError as err: 

440 _LOG.warning( 

441 f"Could not load storage class {err} for {storage.datasetType.name}; " 

442 "if it has components they will not be included in query results." 

443 ) 

444 continue 

445 for component_name in components_for_parent: 

446 if any( 

447 p.fullmatch( 

448 DatasetType.nameWithComponent(storage.datasetType.name, component_name) 

449 ) 

450 for p in wildcard.patterns 

451 ): 

452 result[storage.datasetType].add(component_name) 

453 if not already_warned and components_deprecated: 

454 warnings.warn(deprecation_message, FutureWarning) 

455 already_warned = True 

456 return {k: list(v) for k, v in result.items()} 
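
# Illustrative resolve_wildcard() results (the "calexp" dataset type and its
# "psf" component are assumptions, not types registered by this module);
# `None` in a value list stands for the parent dataset type itself:
#
#     resolve_wildcard("calexp")       # -> {calexp: [None]}
#     resolve_wildcard("calexp.psf")   # -> {calexp: ["psf"]}, with a FutureWarning
#     resolve_wildcard(..., components=True)
#                                      # -> every registered type mapped to
#                                      #    [None, <its component names>...]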

457 

458 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

459 # Docstring inherited from DatasetRecordStorageManager. 

460 sql = ( 

461 sqlalchemy.sql.select( 

462 self._static.dataset.columns.dataset_type_id, 

463 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

464 ) 

465 .select_from(self._static.dataset) 

466 .where(self._static.dataset.columns.id == id) 

467 ) 

468 with self._db.query(sql) as sql_result: 

469 row = sql_result.mappings().fetchone() 

470 if row is None: 

471 return None 

472 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

473 if recordsForType is None:    [473 ↛ 474] line 473 didn't jump to line 474, because the condition on line 473 was never true

474 self.refresh() 

475 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

476 assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 

477 return DatasetRef( 

478 recordsForType.datasetType, 

479 dataId=recordsForType.getDataId(id=id), 

480 id=id, 

481 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

482 ) 

483 

484 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

485 # Docstring inherited from DatasetRecordStorageManager. 

486 return self._summaries.get(collection) 

487 

488 _versions: list[VersionTuple] 

489 """Schema version for this class.""" 

490 

491 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

492 """Type of the storage class returned by this manager.""" 

493 

494 _autoincrement: bool 

495 """If True then PK column of the dataset table is auto-increment.""" 

496 

497 _idColumnType: type 

498 """Type of dataset column used to store dataset ID.""" 

499 

500 

501class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

502 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

503 UUID for dataset primary key. 

504 """ 

505 

506 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS] 

507 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

508 _autoincrement: bool = False 

509 _idColumnType: type = ddl.GUID 

510 

511 @classmethod 

512 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

513 # Docstring inherited from DatasetRecordStorageManager. 

514 return True 

515 

516 @classmethod 

517 def _newDefaultSchemaVersion(cls) -> VersionTuple: 

518 # Docstring inherited from VersionedExtension. 

519 

520 # By default return 1.0.0 so that older clients can still access new 

521 # registries created with a default config. 

522 return _VERSION_UUID 

523 

524 def ingest_date_dtype(self) -> type: 

525 """Return type of the ``ingest_date`` column.""" 

526 schema_version = self.newSchemaVersion() 

527 if schema_version is not None and schema_version.major > 1: 

528 return ddl.AstropyTimeNsecTai 

529 else: 

530 return sqlalchemy.TIMESTAMP
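
# A minimal end-to-end sketch of using this manager. It assumes a configured
# `Database` plus collection and dimension managers, and that the database's
# declareStaticTables() context supplies the StaticTablesContext; the variable
# names here are assumptions for illustration, not verbatim from this module.
#
#     with db.declareStaticTables(create=True) as context:
#         manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#             db, context, collections=collections, dimensions=dimensions
#         )
#     manager.refresh()                          # load already-registered dataset types
#     storage, inserted = manager.register(dataset_type)
#     ref = manager.getDatasetRef(some_uuid)     # None if the UUID is unknown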