Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 90%

228 statements  

coverage.py v7.4.0, created at 2024-01-16 10:43 +0000

1from __future__ import annotations 

2 

3from .... import ddl 

4 

5__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",) 

6 

7import dataclasses 

8import logging 

9from collections.abc import Iterable, Mapping 

10from typing import TYPE_CHECKING, Any 

11 

12import sqlalchemy 

13 

14from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType 

15from ....dimensions import DimensionUniverse 

16from ..._collection_summary import CollectionSummary 

17from ..._exceptions import ( 

18 ConflictingDefinitionError, 

19 DatasetTypeError, 

20 DatasetTypeExpressionError, 

21 OrphanedRecordError, 

22) 

23from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple 

24from ...wildcards import DatasetTypeWildcard 

25from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID 

26from .summaries import CollectionSummaryManager 

27from .tables import ( 

28 addDatasetForeignKey, 

29 makeCalibTableName, 

30 makeCalibTableSpec, 

31 makeStaticTableSpecs, 

32 makeTagTableName, 

33 makeTagTableSpec, 

34) 

35 

36if TYPE_CHECKING: 

37 from ..._caching_context import CachingContext 

38 from ...interfaces import ( 

39 CollectionManager, 

40 CollectionRecord, 

41 Database, 

42 DimensionRecordStorageManager, 

43 StaticTablesContext, 

44 ) 

45 from .tables import StaticDatasetTablesTuple 

46 

47 

48# This has to be updated on every schema change 

49_VERSION_UUID = VersionTuple(1, 0, 0) 

50# Starting with 2.0.0 the `ingest_date` column stores nanoseconds instead

51# of a TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of

52# the client migration period.

53_VERSION_UUID_NS = VersionTuple(2, 0, 0) 

54 

55_LOG = logging.getLogger(__name__) 

56 

57 

58class MissingDatabaseTableError(RuntimeError): 

59 """Exception raised when a table is not found in a database.""" 

60 

61 

62@dataclasses.dataclass 

63class _DatasetTypeRecord: 

64 """Contents of a single dataset type record.""" 

65 

66 dataset_type: DatasetType 

67 dataset_type_id: int 

68 tag_table_name: str 

69 calib_table_name: str | None 

70 

71 

72class _SpecTableFactory: 

73 """Factory for `sqlalchemy.schema.Table` instances that builds table 

74 instances using provided `ddl.TableSpec` definition and verifies that 

75 table exists in the database. 

76 """ 

77 

78 def __init__(self, db: Database, name: str, spec: ddl.TableSpec): 

79 self._db = db 

80 self._name = name 

81 self._spec = spec 

82 

83 def __call__(self) -> sqlalchemy.schema.Table: 

84 table = self._db.getExistingTable(self._name, self._spec) 

85 if table is None:  (85 ↛ 86: the condition on line 85 was never true)

86 raise MissingDatabaseTableError(f"Table {self._name} is missing from database schema.") 

87 return table 

88 

89 

90class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

91 """A manager class for datasets that uses one dataset-collection table for 

92 each group of dataset types that share the same dimensions. 

93 

94 In addition to the table organization, this class makes a number of 

95 other design choices that would have been cumbersome (to say the least) to 

96 try to pack into its name: 

97 

98 - It uses a private surrogate integer autoincrement field to identify 

99 dataset types, instead of using the name as the primary and foreign key 

100 directly. 

101 

102 - It aggressively loads all DatasetTypes into memory instead of fetching 

103 them from the database only when needed or attempting more clever forms 

104 of caching. 

105 

106 Alternative implementations that make different choices for these while 

107 keeping the same general table organization might be reasonable as well. 

108 

109 This class provides a complete implementation of the manager logic, but it

110 is parametrized by a few class attributes that must be defined by

111 subclasses.

112 

113 Parameters 

114 ---------- 

115 db : `Database` 

116 Interface to the underlying database engine and namespace. 

117 collections : `CollectionManager` 

118 Manager object for the collections in this `Registry`. 

119 dimensions : `DimensionRecordStorageManager` 

120 Manager object for the dimensions in this `Registry`. 

121 static : `StaticDatasetTablesTuple` 

122 Named tuple of `sqlalchemy.schema.Table` instances for all static 

123 tables used by this class. 

124 summaries : `CollectionSummaryManager` 

125 Structure containing tables that summarize the contents of collections. 

126 caching_context : `CachingContext` 

127 Object controlling caching of information returned by managers. 

128 registry_schema_version : `VersionTuple` or `None`, optional 

129 Version of registry schema. 

130 """ 

131 

132 def __init__( 

133 self, 

134 *, 

135 db: Database, 

136 collections: CollectionManager, 

137 dimensions: DimensionRecordStorageManager, 

138 static: StaticDatasetTablesTuple, 

139 summaries: CollectionSummaryManager, 

140 caching_context: CachingContext, 

141 registry_schema_version: VersionTuple | None = None, 

142 ): 

143 super().__init__(registry_schema_version=registry_schema_version) 

144 self._db = db 

145 self._collections = collections 

146 self._dimensions = dimensions 

147 self._static = static 

148 self._summaries = summaries 

149 self._caching_context = caching_context 

150 

151 @classmethod 

152 def initialize( 

153 cls, 

154 db: Database, 

155 context: StaticTablesContext, 

156 *, 

157 collections: CollectionManager, 

158 dimensions: DimensionRecordStorageManager, 

159 caching_context: CachingContext, 

160 registry_schema_version: VersionTuple | None = None, 

161 ) -> DatasetRecordStorageManager: 

162 # Docstring inherited from DatasetRecordStorageManager. 

163 specs = cls.makeStaticTableSpecs( 

164 type(collections), universe=dimensions.universe, schema_version=registry_schema_version 

165 ) 
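
# Register the static table specifications with the schema-construction
# context and get back the corresponding table objects.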

166 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

167 summaries = CollectionSummaryManager.initialize( 

168 db, 

169 context, 

170 collections=collections, 

171 dimensions=dimensions, 

172 dataset_type_table=static.dataset_type, 

173 caching_context=caching_context, 

174 ) 

175 return cls( 

176 db=db, 

177 collections=collections, 

178 dimensions=dimensions, 

179 static=static, 

180 summaries=summaries, 

181 caching_context=caching_context, 

182 registry_schema_version=registry_schema_version, 

183 ) 

184 

185 @classmethod 

186 def currentVersions(cls) -> list[VersionTuple]: 

187 # Docstring inherited from VersionedExtension. 

188 return cls._versions 

189 

190 @classmethod 

191 def makeStaticTableSpecs( 

192 cls, 

193 collections: type[CollectionManager], 

194 universe: DimensionUniverse, 

195 schema_version: VersionTuple | None, 

196 ) -> StaticDatasetTablesTuple: 

197 """Construct all static tables used by the classes in this package. 

198 

199 Static tables are those that are present in all Registries and do not 

200 depend on what DatasetTypes have been registered. 

201 

202 Parameters 

203 ---------- 

204 collections : `type` [ `CollectionManager` ]

205 Manager class for the collections in this `Registry`.

206 universe : `DimensionUniverse` 

207 Universe graph containing all dimensions known to this `Registry`. 

208 schema_version : `VersionTuple` or `None`

209 Version of the schema that should be created; if `None`, the

210 default schema version is used.

211 

212 Returns 

213 ------- 

214 specs : `StaticDatasetTablesTuple` 

215 A named tuple containing `ddl.TableSpec` instances. 

216 """ 

217 schema_version = cls.clsNewSchemaVersion(schema_version) 

218 assert schema_version is not None, "New schema version cannot be None" 

219 return makeStaticTableSpecs( 

220 collections, 

221 universe=universe, 

222 dtype=cls.getIdColumnType(), 

223 autoincrement=cls._autoincrement, 

224 schema_version=schema_version, 

225 ) 

226 

227 @classmethod 

228 def getIdColumnType(cls) -> type: 

229 # Docstring inherited from base class. 

230 return cls._idColumnType 

231 

232 @classmethod 

233 def addDatasetForeignKey( 

234 cls, 

235 tableSpec: ddl.TableSpec, 

236 *, 

237 name: str = "dataset", 

238 constraint: bool = True, 

239 onDelete: str | None = None, 

240 **kwargs: Any, 

241 ) -> ddl.FieldSpec: 

242 # Docstring inherited from DatasetRecordStorageManager. 

243 return addDatasetForeignKey( 

244 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

245 ) 

246 

247 def refresh(self) -> None: 

248 # Docstring inherited from DatasetRecordStorageManager. 

249 if self._caching_context.dataset_types is not None:  (249 ↛ exit: the condition on line 249 was never false)

250 self._caching_context.dataset_types.clear() 

251 

252 def _make_storage(self, record: _DatasetTypeRecord) -> ByDimensionsDatasetRecordStorage: 

253 """Create storage instance for a dataset type record.""" 

254 tags_spec = makeTagTableSpec(record.dataset_type, type(self._collections), self.getIdColumnType()) 

255 tags_table_factory = _SpecTableFactory(self._db, record.tag_table_name, tags_spec) 

256 calibs_table_factory = None 
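
# Calibration dataset types have an additional association table holding
# validity-range (timespan) information; for other types it stays None.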

257 if record.calib_table_name is not None: 

258 calibs_spec = makeCalibTableSpec( 

259 record.dataset_type, 

260 type(self._collections), 

261 self._db.getTimespanRepresentation(), 

262 self.getIdColumnType(), 

263 ) 

264 calibs_table_factory = _SpecTableFactory(self._db, record.calib_table_name, calibs_spec) 

265 storage = self._recordStorageType( 

266 db=self._db, 

267 datasetType=record.dataset_type, 

268 static=self._static, 

269 summaries=self._summaries, 

270 tags_table_factory=tags_table_factory, 

271 calibs_table_factory=calibs_table_factory, 

272 dataset_type_id=record.dataset_type_id, 

273 collections=self._collections, 

274 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, 

275 ) 

276 return storage 

277 

278 def remove(self, name: str) -> None: 

279 # Docstring inherited from DatasetRecordStorageManager. 

280 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

281 if componentName is not None:  (281 ↛ 282: the condition on line 281 was never true)

282 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

283 

284 # Delete the row 

285 try: 

286 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

287 except sqlalchemy.exc.IntegrityError as e: 

288 raise OrphanedRecordError( 

289 f"Dataset type {name} can not be removed." 

290 " It is associated with datasets that must be removed first." 

291 ) from e 

292 

293 # Now refresh everything -- removal is rare enough that this does 

294 # not need to be fast. 

295 self.refresh() 

296 

297 def find(self, name: str) -> DatasetRecordStorage | None: 

298 # Docstring inherited from DatasetRecordStorageManager. 

299 if self._caching_context.dataset_types is not None:  (299 ↛ 313: the condition on line 299 was never false)

300 _, storage = self._caching_context.dataset_types.get(name) 

301 if storage is not None: 

302 return storage 

303 else: 

304 # On the first cache miss populate the cache with complete list 

305 # of dataset types (if it was not done yet). 

306 if not self._caching_context.dataset_types.full: 

307 self._fetch_dataset_types() 

308 # Try again 

309 _, storage = self._caching_context.dataset_types.get(name) 

310 if self._caching_context.dataset_types.full:  (310 ↛ 313: the condition on line 310 was never false)

311 # If not in cache then dataset type is not defined. 

312 return storage 
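
# Caching is disabled or the cache could not be made complete; fall back
# to querying the database for this single dataset type.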

313 record = self._fetch_dataset_type_record(name) 

314 if record is not None: 

315 storage = self._make_storage(record) 

316 if self._caching_context.dataset_types is not None: 

317 self._caching_context.dataset_types.add(storage.datasetType, storage) 

318 return storage 

319 else: 

320 return None 

321 

322 def register(self, datasetType: DatasetType) -> bool: 

323 # Docstring inherited from DatasetRecordStorageManager. 

324 if datasetType.isComponent():  (324 ↛ 325: the condition on line 324 was never true)

325 raise ValueError( 

326 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

327 ) 

328 record = self._fetch_dataset_type_record(datasetType.name) 

329 if record is None: 

330 dimensionsKey = self._dimensions.save_dimension_group(datasetType.dimensions.as_group()) 

331 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

332 self._db.ensureTableExists( 

333 tagTableName, 

334 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

335 ) 

336 calibTableName = ( 

337 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

338 ) 

339 if calibTableName is not None: 

340 self._db.ensureTableExists( 

341 calibTableName, 

342 makeCalibTableSpec( 

343 datasetType, 

344 type(self._collections), 

345 self._db.getTimespanRepresentation(), 

346 self.getIdColumnType(), 

347 ), 

348 ) 
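
# Insert the dataset type row if it is missing; if it already exists,
# sync() compares the stored dimensions and storage class with the given ones.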

349 row, inserted = self._db.sync( 

350 self._static.dataset_type, 

351 keys={"name": datasetType.name}, 

352 compared={ 

353 "dimensions_key": dimensionsKey, 

354 # Force the storage class to be loaded to ensure it 

355 # exists and there is no typo in the name. 

356 "storage_class": datasetType.storageClass.name, 

357 }, 

358 extra={ 

359 "tag_association_table": tagTableName, 

360 "calibration_association_table": calibTableName, 

361 }, 

362 returning=["id", "tag_association_table"], 

363 ) 

364 # Make sure that cache is updated 

365 if self._caching_context.dataset_types is not None and row is not None:  (365 ↛ 382: the condition on line 365 was never false)

366 record = _DatasetTypeRecord( 

367 dataset_type=datasetType, 

368 dataset_type_id=row["id"], 

369 tag_table_name=tagTableName, 

370 calib_table_name=calibTableName, 

371 ) 

372 storage = self._make_storage(record) 

373 self._caching_context.dataset_types.add(datasetType, storage) 

374 else: 

375 if datasetType != record.dataset_type: 

376 raise ConflictingDefinitionError( 

377 f"Given dataset type {datasetType} is inconsistent " 

378 f"with database definition {record.dataset_type}." 

379 ) 

380 inserted = False 

381 

382 return bool(inserted) 

383 

384 def resolve_wildcard( 

385 self, 

386 expression: Any, 

387 missing: list[str] | None = None, 

388 explicit_only: bool = False, 

389 ) -> list[DatasetType]: 

390 wildcard = DatasetTypeWildcard.from_expression(expression) 

391 result: list[DatasetType] = [] 

392 for name, dataset_type in wildcard.values.items(): 

393 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

394 if component_name is not None: 

395 raise DatasetTypeError( 

396 "Component dataset types are not supported in Registry methods; use DatasetRef or " 

397 "DatasetType methods to obtain components from parents instead." 

398 ) 

399 if (found_storage := self.find(parent_name)) is not None: 

400 resolved_dataset_type = found_storage.datasetType 

401 if dataset_type is not None: 

402 if dataset_type.is_compatible_with(resolved_dataset_type):  (402 ↛ 407: the condition on line 402 was never false)

403 # Prefer the given dataset type to enable storage class 

404 # conversions. 

405 resolved_dataset_type = dataset_type 

406 else: 

407 raise DatasetTypeError( 

408 f"Dataset type definition in query expression {dataset_type} is " 

409 f"not compatible with the registered type {resolved_dataset_type}." 

410 ) 

411 result.append(resolved_dataset_type) 

412 elif missing is not None: 

413 missing.append(name) 

414 if wildcard.patterns is ...: 

415 if explicit_only: 

416 raise TypeError( 

417 "Universal wildcard '...' is not permitted for dataset types in this context." 

418 ) 

419 for datasetType in self._fetch_dataset_types(): 

420 result.append(datasetType) 

421 elif wildcard.patterns: 

422 if explicit_only: 

423 raise DatasetTypeExpressionError( 

424 "Dataset type wildcard expressions are not supported in this context." 

425 ) 

426 dataset_types = self._fetch_dataset_types() 

427 for datasetType in dataset_types: 

428 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

429 result.append(datasetType) 

430 

431 return result 

432 

433 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

434 # Docstring inherited from DatasetRecordStorageManager. 
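
# Look up the dataset row and its dataset type definition in a single
# query joining the static dataset and dataset_type tables.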

435 sql = ( 

436 sqlalchemy.sql.select( 

437 self._static.dataset.columns.dataset_type_id, 

438 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

439 *self._static.dataset_type.columns, 

440 ) 

441 .select_from(self._static.dataset) 

442 .join(self._static.dataset_type) 

443 .where(self._static.dataset.columns.id == id) 

444 ) 

445 with self._db.query(sql) as sql_result: 

446 row = sql_result.mappings().fetchone() 

447 if row is None: 

448 return None 

449 record = self._record_from_row(row) 

450 storage: DatasetRecordStorage | None = None 

451 if self._caching_context.dataset_types is not None:  (451 ↛ 453: the condition on line 451 was never false)

452 _, storage = self._caching_context.dataset_types.get(record.dataset_type.name) 

453 if storage is None:  (453 ↛ 454: the condition on line 453 was never true)

454 storage = self._make_storage(record) 

455 if self._caching_context.dataset_types is not None: 

456 self._caching_context.dataset_types.add(storage.datasetType, storage) 

457 assert isinstance(storage, ByDimensionsDatasetRecordStorage), "Not expected storage class" 

458 return DatasetRef( 

459 storage.datasetType, 

460 dataId=storage.getDataId(id=id), 

461 id=id, 

462 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

463 ) 

464 

465 def _fetch_dataset_type_record(self, name: str) -> _DatasetTypeRecord | None: 

466 """Retrieve all dataset types defined in database. 

467 

468 Yields 

469 ------ 

470 dataset_types : `_DatasetTypeRecord` 

471 Information from a single database record. 

472 """ 

473 c = self._static.dataset_type.columns 

474 stmt = self._static.dataset_type.select().where(c.name == name) 

475 with self._db.query(stmt) as sql_result: 

476 row = sql_result.mappings().one_or_none() 

477 if row is None: 

478 return None 

479 else: 

480 return self._record_from_row(row) 

481 

482 def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord: 

483 name = row["name"] 

484 dimensions = self._dimensions.load_dimension_group(row["dimensions_key"]) 

485 calibTableName = row["calibration_association_table"] 

486 datasetType = DatasetType( 

487 name, dimensions, row["storage_class"], isCalibration=(calibTableName is not None) 

488 ) 

489 return _DatasetTypeRecord( 

490 dataset_type=datasetType, 

491 dataset_type_id=row["id"], 

492 tag_table_name=row["tag_association_table"], 

493 calib_table_name=calibTableName, 

494 ) 

495 

496 def _dataset_type_from_row(self, row: Mapping) -> DatasetType: 

497 return self._record_from_row(row).dataset_type 

498 

499 def _fetch_dataset_types(self) -> list[DatasetType]: 

500 """Fetch list of all defined dataset types.""" 

501 if self._caching_context.dataset_types is not None:  (501 ↛ 504: the condition on line 501 was never false)

502 if self._caching_context.dataset_types.full: 

503 return [dataset_type for dataset_type, _ in self._caching_context.dataset_types.items()] 
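
# Cache is absent or incomplete; read every dataset type row from the database.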

504 with self._db.query(self._static.dataset_type.select()) as sql_result: 

505 sql_rows = sql_result.mappings().fetchall() 

506 records = [self._record_from_row(row) for row in sql_rows] 

507 # Cache everything and specify that cache is complete. 

508 if self._caching_context.dataset_types is not None:  (508 ↛ 511: the condition on line 508 was never false)

509 cache_data = [(record.dataset_type, self._make_storage(record)) for record in records] 

510 self._caching_context.dataset_types.set(cache_data, full=True) 

511 return [record.dataset_type for record in records] 

512 

513 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

514 # Docstring inherited from DatasetRecordStorageManager. 

515 summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_from_row) 

516 return summaries[collection.key] 

517 

518 def fetch_summaries( 

519 self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None 

520 ) -> Mapping[Any, CollectionSummary]: 

521 # Docstring inherited from DatasetRecordStorageManager. 

522 dataset_type_names: Iterable[str] | None = None 

523 if dataset_types is not None:  (523 ↛ 525: the condition on line 523 was never false)

524 dataset_type_names = set(dataset_type.name for dataset_type in dataset_types) 

525 return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row) 

526 

527 _versions: list[VersionTuple] 

528 """Schema version for this class.""" 

529 

530 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

531 """Type of the storage class returned by this manager.""" 

532 

533 _autoincrement: bool 

534 """If True then PK column of the dataset table is auto-increment.""" 

535 

536 _idColumnType: type 

537 """Type of dataset column used to store dataset ID.""" 

538 

539 

540class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

541 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

542 UUID for dataset primary key. 

543 """ 

544 

545 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS] 

546 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

547 _autoincrement: bool = False 

548 _idColumnType: type = ddl.GUID 

549 

550 @classmethod 

551 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

552 # Docstring inherited from DatasetRecordStorageManager. 

553 return True 

554 

555 @classmethod 

556 def _newDefaultSchemaVersion(cls) -> VersionTuple: 

557 # Docstring inherited from VersionedExtension. 

558 

559 # By default return 1.0.0 so that older clients can still access new 

560 # registries created with a default config. 

561 return _VERSION_UUID 

562 

563 def ingest_date_dtype(self) -> type: 

564 """Return type of the ``ingest_date`` column.""" 

565 schema_version = self.newSchemaVersion() 

566 if schema_version is not None and schema_version.major > 1: 

567 return ddl.AstropyTimeNsecTai 

568 else: 

569 return sqlalchemy.TIMESTAMP
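
A minimal usage sketch (an editorial illustration, not part of the module): it assumes that `db`, `context`, `collections`, `dimensions`, `caching_context`, and `dataset_type` objects are supplied by the enclosing `Registry` setup, and it uses only the `initialize`, `register`, and `find` methods defined above.

# Sketch only -- the objects referenced below are assumed to exist already.
manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
    db,
    context,
    collections=collections,
    dimensions=dimensions,
    caching_context=caching_context,
)
inserted = manager.register(dataset_type)  # True only if the dataset type was newly defined
storage = manager.find(dataset_type.name)  # per-dataset-type storage, or None if not registered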