Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 95%

205 statements  

coverage.py v7.2.5, created at 2023-05-05 03:17 -0700

1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",) 

4 

5import logging 

6import warnings 

7from collections import defaultdict 

8from typing import TYPE_CHECKING, Any 

9 

10import sqlalchemy 

11from lsst.utils.ellipsis import Ellipsis 

12 

13from ....core import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType, DimensionUniverse, ddl 

14from ..._collection_summary import CollectionSummary 

15from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError 

16from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple 

17from ...wildcards import DatasetTypeWildcard 

18from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID 

19from .summaries import CollectionSummaryManager 

20from .tables import ( 

21 addDatasetForeignKey, 

22 makeCalibTableName, 

23 makeCalibTableSpec, 

24 makeStaticTableSpecs, 

25 makeTagTableName, 

26 makeTagTableSpec, 

27) 

28 

29if TYPE_CHECKING: 

30 from ...interfaces import ( 

31 CollectionManager, 

32 CollectionRecord, 

33 Database, 

34 DimensionRecordStorageManager, 

35 StaticTablesContext, 

36 ) 

37 from .tables import StaticDatasetTablesTuple 

38 

39 

40# This version has to be updated on every schema change.

41_VERSION_UUID = VersionTuple(1, 0, 0) 

42# Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead 

43# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of 

44# the client migration period.

45_VERSION_UUID_NS = VersionTuple(2, 0, 0) 

46 

47_LOG = logging.getLogger(__name__) 

48 

49 

50class MissingDatabaseTableError(RuntimeError): 

51 """Exception raised when a table is not found in a database.""" 

52 

53 

54class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

55 """A manager class for datasets that uses one dataset-collection table for 

56 each group of dataset types that share the same dimensions. 

57 

58 In addition to the table organization, this class makes a number of 

59 other design choices that would have been cumbersome (to say the least) to 

60 try to pack into its name: 

61 

62 - It uses a private surrogate integer autoincrement field to identify 

63 dataset types, instead of using the name as the primary and foreign key 

64 directly. 

65 

66 - It aggressively loads all DatasetTypes into memory instead of fetching 

67 them from the database only when needed or attempting more clever forms 

68 of caching. 

69 

70 Alternative implementations that make different choices for these while 

71 keeping the same general table organization might be reasonable as well. 

72 

73 This class provides a complete implementation of the manager logic, but it

74 is parametrized by a few class attributes that have to be defined by

75 subclasses.

76 

77 Parameters 

78 ---------- 

79 db : `Database` 

80 Interface to the underlying database engine and namespace. 

81 collections : `CollectionManager` 

82 Manager object for the collections in this `Registry`. 

83 dimensions : `DimensionRecordStorageManager` 

84 Manager object for the dimensions in this `Registry`. 

85 static : `StaticDatasetTablesTuple` 

86 Named tuple of `sqlalchemy.schema.Table` instances for all static 

87 tables used by this class. 

88 summaries : `CollectionSummaryManager` 

89 Structure containing tables that summarize the contents of collections. 

90 """ 

91 

92 def __init__( 

93 self, 

94 *, 

95 db: Database, 

96 collections: CollectionManager, 

97 dimensions: DimensionRecordStorageManager, 

98 static: StaticDatasetTablesTuple, 

99 summaries: CollectionSummaryManager, 

100 registry_schema_version: VersionTuple | None = None, 

101 ): 

102 super().__init__(registry_schema_version=registry_schema_version) 

103 self._db = db 

104 self._collections = collections 

105 self._dimensions = dimensions 

106 self._static = static 

107 self._summaries = summaries 

108 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

109 self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

110 

111 @classmethod 

112 def initialize( 

113 cls, 

114 db: Database, 

115 context: StaticTablesContext, 

116 *, 

117 collections: CollectionManager, 

118 dimensions: DimensionRecordStorageManager, 

119 registry_schema_version: VersionTuple | None = None, 

120 ) -> DatasetRecordStorageManager: 

121 # Docstring inherited from DatasetRecordStorageManager. 

122 specs = cls.makeStaticTableSpecs( 

123 type(collections), universe=dimensions.universe, schema_version=registry_schema_version 

124 ) 

125 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

126 summaries = CollectionSummaryManager.initialize( 

127 db, 

128 context, 

129 collections=collections, 

130 dimensions=dimensions, 

131 ) 

132 return cls( 

133 db=db, 

134 collections=collections, 

135 dimensions=dimensions, 

136 static=static, 

137 summaries=summaries, 

138 registry_schema_version=registry_schema_version, 

139 ) 

140 

141 @classmethod 

142 def currentVersions(cls) -> list[VersionTuple]: 

143 # Docstring inherited from VersionedExtension. 

144 return cls._versions 

145 

146 @classmethod 

147 def makeStaticTableSpecs( 

148 cls, 

149 collections: type[CollectionManager], 

150 universe: DimensionUniverse, 

151 schema_version: VersionTuple | None, 

152 ) -> StaticDatasetTablesTuple: 

153 """Construct all static tables used by the classes in this package. 

154 

155 Static tables are those that are present in all Registries and do not 

156 depend on what DatasetTypes have been registered. 

157 

158 Parameters 

159 ---------- 

160 collections : `type` [`CollectionManager`]

161 Type of the manager object for the collections in this `Registry`.

162 universe : `DimensionUniverse` 

163 Universe graph containing all dimensions known to this `Registry`. 

164 schema_version : `VersionTuple` or `None` 

165 Version of the schema that should be created; if `None`, the

166 default schema version is used.

167 

168 Returns 

169 ------- 

170 specs : `StaticDatasetTablesTuple` 

171 A named tuple containing `ddl.TableSpec` instances. 

172 """ 

173 schema_version = cls.clsNewSchemaVersion(schema_version) 

174 assert schema_version is not None, "New schema version cannot be None" 

175 return makeStaticTableSpecs( 

176 collections, 

177 universe=universe, 

178 dtype=cls.getIdColumnType(), 

179 autoincrement=cls._autoincrement, 

180 schema_version=schema_version, 

181 ) 
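    # A hypothetical sketch of calling the class method above; ``universe`` and the
    # collection manager class (``MyCollectionManager``) are assumed placeholders:
    #
    #     specs = ByDimensionsDatasetRecordStorageManagerUUID.makeStaticTableSpecs(
    #         MyCollectionManager, universe=universe, schema_version=None
    #     )
    #     # ``specs`` is a StaticDatasetTablesTuple of ddl.TableSpec objects; other
    #     # methods in this module refer to members such as specs.dataset and
    #     # specs.dataset_type.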

182 

183 @classmethod 

184 def getIdColumnType(cls) -> type: 

185 # Docstring inherited from base class. 

186 return cls._idColumnType 

187 

188 @classmethod 

189 def addDatasetForeignKey( 

190 cls, 

191 tableSpec: ddl.TableSpec, 

192 *, 

193 name: str = "dataset", 

194 constraint: bool = True, 

195 onDelete: str | None = None, 

196 **kwargs: Any, 

197 ) -> ddl.FieldSpec: 

198 # Docstring inherited from DatasetRecordStorageManager. 

199 return addDatasetForeignKey( 

200 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

201 ) 
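    # A minimal, hypothetical sketch of the foreign-key hook above; the table spec
    # is an illustrative placeholder, not something defined in this module:
    #
    #     spec = ddl.TableSpec(fields=[])
    #     id_field = ByDimensionsDatasetRecordStorageManagerUUID.addDatasetForeignKey(
    #         spec, name="dataset", onDelete="CASCADE"
    #     )
    #     # ``id_field`` is the ddl.FieldSpec for the new dataset-id column, and
    #     # ``spec`` should now carry that column plus a foreign-key constraint
    #     # pointing at the static dataset table.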

202 

203 def refresh(self) -> None: 

204 # Docstring inherited from DatasetRecordStorageManager. 

205 byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

206 byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

207 dataset_types: dict[int, DatasetType] = {} 

208 c = self._static.dataset_type.columns 

209 with self._db.query(self._static.dataset_type.select()) as sql_result: 

210 sql_rows = sql_result.mappings().fetchall() 

211 for row in sql_rows: 

212 name = row[c.name] 

213 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) 

214 calibTableName = row[c.calibration_association_table] 

215 datasetType = DatasetType( 

216 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) 

217 ) 

218 tags = self._db.getExistingTable( 

219 row[c.tag_association_table], 

220 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

221 ) 

222 if tags is None:  [222 ↛ 223: line 222 didn't jump to line 223, because the condition on line 222 was never true]

223 raise MissingDatabaseTableError( 

224 f"Table {row[c.tag_association_table]} is missing from database schema." 

225 ) 

226 if calibTableName is not None: 

227 calibs = self._db.getExistingTable( 

228 row[c.calibration_association_table], 

229 makeCalibTableSpec( 

230 datasetType, 

231 type(self._collections), 

232 self._db.getTimespanRepresentation(), 

233 self.getIdColumnType(), 

234 ), 

235 ) 

236 if calibs is None:  [236 ↛ 237: line 236 didn't jump to line 237, because the condition on line 236 was never true]

237 raise MissingDatabaseTableError( 

238 f"Table {row[c.calibration_association_table]} is missing from database schema." 

239 ) 

240 else: 

241 calibs = None 

242 storage = self._recordStorageType( 

243 db=self._db, 

244 datasetType=datasetType, 

245 static=self._static, 

246 summaries=self._summaries, 

247 tags=tags, 

248 calibs=calibs, 

249 dataset_type_id=row["id"], 

250 collections=self._collections, 

251 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, 

252 ) 

253 byName[datasetType.name] = storage 

254 byId[storage._dataset_type_id] = storage 

255 dataset_types[row["id"]] = datasetType 

256 self._byName = byName 

257 self._byId = byId 

258 self._summaries.refresh(dataset_types) 

259 

260 def remove(self, name: str) -> None: 

261 # Docstring inherited from DatasetRecordStorageManager. 

262 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

263 if componentName is not None: 

264 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

265 

266 # Delete the row 

267 try: 

268 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

269 except sqlalchemy.exc.IntegrityError as e: 

270 raise OrphanedRecordError( 

271 f"Dataset type {name} can not be removed." 

272 " It is associated with datasets that must be removed first." 

273 ) from e 

274 

275 # Now refresh everything -- removal is rare enough that this does 

276 # not need to be fast. 

277 self.refresh() 
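    # A hypothetical sketch of the failure mode documented above; ``manager`` and
    # the dataset type name are assumed for illustration:
    #
    #     try:
    #         manager.remove("example_type")
    #     except OrphanedRecordError:
    #         # Datasets of this type still exist and must be removed first.
    #         ...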

278 

279 def find(self, name: str) -> DatasetRecordStorage | None: 

280 # Docstring inherited from DatasetRecordStorageManager. 

281 return self._byName.get(name) 

282 

283 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]: 

284 # Docstring inherited from DatasetRecordStorageManager. 

285 if datasetType.isComponent():  [285 ↛ 286: line 285 didn't jump to line 286, because the condition on line 285 was never true]

286 raise ValueError( 

287 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

288 ) 

289 storage = self._byName.get(datasetType.name) 

290 if storage is None: 

291 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) 

292 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

293 calibTableName = ( 

294 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

295 ) 

296 # The order is important here: we want to create the tables first

297 # and only register them if that operation succeeds. We cannot

298 # wrap this in a transaction because the database class assumes

299 # that DDL is not transaction-safe in general.

300 tags = self._db.ensureTableExists( 

301 tagTableName, 

302 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

303 ) 

304 if calibTableName is not None: 

305 calibs = self._db.ensureTableExists( 

306 calibTableName, 

307 makeCalibTableSpec( 

308 datasetType, 

309 type(self._collections), 

310 self._db.getTimespanRepresentation(), 

311 self.getIdColumnType(), 

312 ), 

313 ) 

314 else: 

315 calibs = None 

316 row, inserted = self._db.sync( 

317 self._static.dataset_type, 

318 keys={"name": datasetType.name}, 

319 compared={ 

320 "dimensions_key": dimensionsKey, 

321 # Force the storage class to be loaded to ensure it 

322 # exists and there is no typo in the name. 

323 "storage_class": datasetType.storageClass.name, 

324 }, 

325 extra={ 

326 "tag_association_table": tagTableName, 

327 "calibration_association_table": calibTableName, 

328 }, 

329 returning=["id", "tag_association_table"], 

330 ) 

331 assert row is not None 

332 storage = self._recordStorageType( 

333 db=self._db, 

334 datasetType=datasetType, 

335 static=self._static, 

336 summaries=self._summaries, 

337 tags=tags, 

338 calibs=calibs, 

339 dataset_type_id=row["id"], 

340 collections=self._collections, 

341 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, 

342 ) 

343 self._byName[datasetType.name] = storage 

344 self._byId[storage._dataset_type_id] = storage 

345 else: 

346 if datasetType != storage.datasetType: 

347 raise ConflictingDefinitionError( 

348 f"Given dataset type {datasetType} is inconsistent " 

349 f"with database definition {storage.datasetType}." 

350 ) 

351 inserted = False 

352 return storage, bool(inserted) 
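    # A hedged sketch of the branches above; ``manager`` and the dataset types are
    # assumed for illustration:
    #
    #     storage, inserted = manager.register(dataset_type)    # first call: tables created, inserted is True
    #     storage2, inserted2 = manager.register(dataset_type)  # repeat call: same storage, inserted2 is False
    #     manager.register(conflicting_type)                    # same name, different definition:
    #                                                           # raises ConflictingDefinitionError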

353 

354 def resolve_wildcard( 

355 self, 

356 expression: Any, 

357 components: bool | None = None, 

358 missing: list[str] | None = None, 

359 explicit_only: bool = False, 

360 components_deprecated: bool = True, 

361 ) -> dict[DatasetType, list[str | None]]: 

362 wildcard = DatasetTypeWildcard.from_expression(expression) 

363 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set) 

364 # This message can be transformed into an error on DM-36303 after v26, 

365 # and the components and components_deprecated arguments can be merged 

366 # into one on DM-36457 after v27. 

367 deprecation_message = ( 

368 "Querying for component datasets via Registry query methods is deprecated in favor of using " 

369 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported " 

370 "after v26, and the components argument will be removed after v27." 

371 ) 

372 for name, dataset_type in wildcard.values.items(): 

373 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

374 if component_name is not None and components_deprecated: 

375 warnings.warn(deprecation_message, FutureWarning) 

376 if (found_storage := self.find(parent_name)) is not None: 

377 found_parent = found_storage.datasetType 

378 if component_name is not None: 

379 found = found_parent.makeComponentDatasetType(component_name) 

380 else: 

381 found = found_parent 

382 if dataset_type is not None: 

383 if dataset_type.is_compatible_with(found):  [383 ↛ 391: line 383 didn't jump to line 391, because the condition on line 383 was never false]

384 # Prefer the given dataset type to enable storage class 

385 # conversions. 

386 if component_name is not None: 

387 found_parent = dataset_type.makeCompositeDatasetType() 

388 else: 

389 found_parent = dataset_type 

390 else: 

391 raise DatasetTypeError( 

392 f"Dataset type definition in query expression {dataset_type} is " 

393 f"not compatible with the registered type {found}." 

394 ) 

395 result[found_parent].add(component_name) 

396 elif missing is not None: 

397 missing.append(name) 

398 already_warned = False 

399 if wildcard.patterns is Ellipsis: 

400 if explicit_only: 

401 raise TypeError( 

402 "Universal wildcard '...' is not permitted for dataset types in this context." 

403 ) 

404 for storage in self._byName.values(): 

405 result[storage.datasetType].add(None) 

406 if components: 

407 try: 

408 result[storage.datasetType].update( 

409 storage.datasetType.storageClass.allComponents().keys() 

410 ) 

411 if ( 

412 storage.datasetType.storageClass.allComponents() 

413 and not already_warned 

414 and components_deprecated 

415 ): 

416 warnings.warn(deprecation_message, FutureWarning) 

417 already_warned = True 

418 except KeyError as err: 

419 _LOG.warning( 

420 f"Could not load storage class {err} for {storage.datasetType.name}; " 

421 "if it has components they will not be included in query results.", 

422 ) 

423 elif wildcard.patterns: 

424 if explicit_only: 

425 # After v26 this should raise DatasetTypeExpressionError, to 

426 # be implemented on DM-36303. 

427 warnings.warn( 

428 "Passing wildcard patterns here is deprecated and will be prohibited after v26.", 

429 FutureWarning, 

430 ) 

431 for storage in self._byName.values(): 

432 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns): 

433 result[storage.datasetType].add(None) 

434 if components is not False: 

435 for storage in self._byName.values(): 

436 if components is None and storage.datasetType in result: 

437 continue 

438 try: 

439 components_for_parent = storage.datasetType.storageClass.allComponents().keys() 

440 except KeyError as err: 

441 _LOG.warning( 

442 f"Could not load storage class {err} for {storage.datasetType.name}; " 

443 "if it has components they will not be included in query results." 

444 ) 

445 continue 

446 for component_name in components_for_parent: 

447 if any( 

448 p.fullmatch( 

449 DatasetType.nameWithComponent(storage.datasetType.name, component_name) 

450 ) 

451 for p in wildcard.patterns 

452 ): 

453 result[storage.datasetType].add(component_name) 

454 if not already_warned and components_deprecated: 

455 warnings.warn(deprecation_message, FutureWarning) 

456 already_warned = True 

457 return {k: list(v) for k, v in result.items()} 
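    # A hypothetical sketch of how the cases above resolve; ``manager`` is assumed
    # and the dataset type names are illustrative:
    #
    #     manager.resolve_wildcard("example_type")        # {parent DatasetType: [None]}
    #     manager.resolve_wildcard("example_type.psf")    # {parent DatasetType: ["psf"]} plus a FutureWarning
    #     manager.resolve_wildcard(..., components=False)     # every registered parent type, no components
    #     manager.resolve_wildcard(..., explicit_only=True)   # raises TypeError: '...' not permitted here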

458 

459 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

460 # Docstring inherited from DatasetRecordStorageManager. 

461 sql = ( 

462 sqlalchemy.sql.select( 

463 self._static.dataset.columns.dataset_type_id, 

464 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

465 ) 

466 .select_from(self._static.dataset) 

467 .where(self._static.dataset.columns.id == id) 

468 ) 

469 with self._db.query(sql) as sql_result: 

470 row = sql_result.mappings().fetchone() 

471 if row is None: 

472 return None 

473 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

474 if recordsForType is None:  [474 ↛ 475: line 474 didn't jump to line 475, because the condition on line 474 was never true]

475 self.refresh() 

476 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

477 assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 

478 return DatasetRef( 

479 recordsForType.datasetType, 

480 dataId=recordsForType.getDataId(id=id), 

481 id=id, 

482 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

483 ) 

484 

485 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

486 # Docstring inherited from DatasetRecordStorageManager. 

487 return self._summaries.get(collection) 

488 

489 _versions: list[VersionTuple] 

490 """Schema version for this class.""" 

491 

492 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

493 """Type of the storage class returned by this manager.""" 

494 

495 _autoincrement: bool 

496 """If True then PK column of the dataset table is auto-increment.""" 

497 

498 _idColumnType: type 

499 """Type of dataset column used to store dataset ID.""" 

500 

501 

502class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

503 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

504 UUID for dataset primary key. 

505 """ 

506 

507 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS] 

508 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

509 _autoincrement: bool = False 

510 _idColumnType: type = ddl.GUID 

511 

512 @classmethod 

513 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

514 # Docstring inherited from DatasetRecordStorageManager. 

515 return True 

516 

517 @classmethod 

518 def _newDefaultSchemaVersion(cls) -> VersionTuple: 

519 # Docstring inherited from VersionedExtension. 

520 

521 # By default return 1.0.0 so that older clients can still access new 

522 # registries created with a default config. 

523 return _VERSION_UUID 

524 

525 def ingest_date_dtype(self) -> type: 

526 """Return type of the ``ingest_date`` column.""" 

527 schema_version = self.newSchemaVersion() 

528 if schema_version is not None and schema_version.major > 1: 

529 return ddl.AstropyTimeNsecTai 

530 else: 

531 return sqlalchemy.TIMESTAMP
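
# A minimal sketch of the version gate in ingest_date_dtype() above, assuming
# ``manager`` is an instance whose newSchemaVersion() reports the schema in use:
#
#     if manager.newSchemaVersion() == _VERSION_UUID_NS:            # 2.0.0 schema
#         assert manager.ingest_date_dtype() is ddl.AstropyTimeNsecTai
#     else:                                                         # 1.0.0 schema or unset
#         assert manager.ingest_date_dtype() is sqlalchemy.TIMESTAMP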