Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%

230 statements  

coverage.py v7.4.0, created at 2024-01-25 10:48 +0000

1from __future__ import annotations 

2 

3from .... import ddl 

4 

5__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",) 

6 

7import dataclasses 

8import logging 

9from collections.abc import Iterable, Mapping 

10from typing import TYPE_CHECKING, Any 

11 

12import sqlalchemy 

13 

14from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType 

15from ....dimensions import DimensionUniverse 

16from ..._collection_summary import CollectionSummary 

17from ..._exceptions import ( 

18 ConflictingDefinitionError, 

19 DatasetTypeError, 

20 DatasetTypeExpressionError, 

21 OrphanedRecordError, 

22) 

23from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple 

24from ...wildcards import DatasetTypeWildcard 

25from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID 

26from .summaries import CollectionSummaryManager 

27from .tables import ( 

28 addDatasetForeignKey, 

29 makeCalibTableName, 

30 makeCalibTableSpec, 

31 makeStaticTableSpecs, 

32 makeTagTableName, 

33 makeTagTableSpec, 

34) 

35 

36if TYPE_CHECKING: 

37 from ..._caching_context import CachingContext 

38 from ...interfaces import ( 

39 CollectionManager, 

40 CollectionRecord, 

41 Database, 

42 DimensionRecordStorageManager, 

43 StaticTablesContext, 

44 ) 

45 from .tables import StaticDatasetTablesTuple 

46 

47 

48# This has to be updated on every schema change 

49_VERSION_UUID = VersionTuple(1, 0, 0) 

50# Starting with 2.0.0 the `ingest_date` column type uses nanoseconds 

51# instead of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the 

52# duration of the client migration period. 

53_VERSION_UUID_NS = VersionTuple(2, 0, 0) 

54 

55_LOG = logging.getLogger(__name__) 

56 

57 

58class MissingDatabaseTableError(RuntimeError): 

59 """Exception raised when a table is not found in a database.""" 

60 

61 

62@dataclasses.dataclass 

63class _DatasetTypeRecord: 

64 """Contents of a single dataset type record.""" 

65 

66 dataset_type: DatasetType 

67 dataset_type_id: int 

68 tag_table_name: str 

69 calib_table_name: str | None 

70 

71 

72class _SpecTableFactory: 

73 """Factory for `sqlalchemy.schema.Table` instances that builds table 

74 instances using provided `ddl.TableSpec` definition and verifies that 

75 table exists in the database. 

76 """ 

77 

78 def __init__(self, db: Database, name: str, spec: ddl.TableSpec): 

79 self._db = db 

80 self._name = name 

81 self._spec = spec 

82 

83 def __call__(self) -> sqlalchemy.schema.Table: 

84 table = self._db.getExistingTable(self._name, self._spec) 

85 if table is None: 85 ↛ 86: line 85 didn't jump to line 86, because the condition on line 85 was never true

86 raise MissingDatabaseTableError(f"Table {self._name} is missing from database schema.") 

87 return table 

88 

89 

90class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

91 """A manager class for datasets that uses one dataset-collection table for 

92 each group of dataset types that share the same dimensions. 

93 

94 In addition to the table organization, this class makes a number of 

95 other design choices that would have been cumbersome (to say the least) to 

96 try to pack into its name: 

97 

98 - It uses a private surrogate integer autoincrement field to identify 

99 dataset types, instead of using the name as the primary and foreign key 

100 directly. 

101 

102 - It aggressively loads all DatasetTypes into memory instead of fetching 

103 them from the database only when needed or attempting more clever forms 

104 of caching. 

105 

106 Alternative implementations that make different choices for these while 

107 keeping the same general table organization might be reasonable as well. 

108 

109 This class provides a complete implementation of the manager logic, but 

110 it is parametrized by a few class attributes that have to be defined by 

111 subclasses. 

112 

113 Parameters 

114 ---------- 

115 db : `Database` 

116 Interface to the underlying database engine and namespace. 

117 collections : `CollectionManager` 

118 Manager object for the collections in this `Registry`. 

119 dimensions : `DimensionRecordStorageManager` 

120 Manager object for the dimensions in this `Registry`. 

121 static : `StaticDatasetTablesTuple` 

122 Named tuple of `sqlalchemy.schema.Table` instances for all static 

123 tables used by this class. 

124 summaries : `CollectionSummaryManager` 

125 Structure containing tables that summarize the contents of collections. 

126 caching_context : `CachingContext` 

127 Object controlling caching of information returned by managers. 

128 registry_schema_version : `VersionTuple` or `None`, optional 

129 Version of the registry schema. 

130 """ 

131 

132 def __init__( 

133 self, 

134 *, 

135 db: Database, 

136 collections: CollectionManager, 

137 dimensions: DimensionRecordStorageManager, 

138 static: StaticDatasetTablesTuple, 

139 summaries: CollectionSummaryManager, 

140 caching_context: CachingContext, 

141 registry_schema_version: VersionTuple | None = None, 

142 ): 

143 super().__init__(registry_schema_version=registry_schema_version) 

144 self._db = db 

145 self._collections = collections 

146 self._dimensions = dimensions 

147 self._static = static 

148 self._summaries = summaries 

149 self._caching_context = caching_context 

150 

151 @classmethod 

152 def initialize( 

153 cls, 

154 db: Database, 

155 context: StaticTablesContext, 

156 *, 

157 collections: CollectionManager, 

158 dimensions: DimensionRecordStorageManager, 

159 caching_context: CachingContext, 

160 registry_schema_version: VersionTuple | None = None, 

161 ) -> DatasetRecordStorageManager: 

162 # Docstring inherited from DatasetRecordStorageManager. 

163 specs = cls.makeStaticTableSpecs( 

164 type(collections), universe=dimensions.universe, schema_version=registry_schema_version 

165 ) 

166 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

167 summaries = CollectionSummaryManager.initialize( 

168 db, 

169 context, 

170 collections=collections, 

171 dimensions=dimensions, 

172 dataset_type_table=static.dataset_type, 

173 caching_context=caching_context, 

174 ) 

175 return cls( 

176 db=db, 

177 collections=collections, 

178 dimensions=dimensions, 

179 static=static, 

180 summaries=summaries, 

181 caching_context=caching_context, 

182 registry_schema_version=registry_schema_version, 

183 ) 

184 

185 @classmethod 

186 def currentVersions(cls) -> list[VersionTuple]: 

187 # Docstring inherited from VersionedExtension. 

188 return cls._versions 

189 

190 @classmethod 

191 def makeStaticTableSpecs( 

192 cls, 

193 collections: type[CollectionManager], 

194 universe: DimensionUniverse, 

195 schema_version: VersionTuple | None, 

196 ) -> StaticDatasetTablesTuple: 

197 """Construct all static tables used by the classes in this package. 

198 

199 Static tables are those that are present in all Registries and do not 

200 depend on what DatasetTypes have been registered. 

201 

202 Parameters 

203 ---------- 

204 collections : `type` [`CollectionManager`] 

205 Manager class used for the collections in this `Registry`. 

206 universe : `DimensionUniverse` 

207 Universe graph containing all dimensions known to this `Registry`. 

208 schema_version : `VersionTuple` or `None` 

209 Version of the schema that should be created; if `None`, the 

210 default schema version is used. 

211 

212 Returns 

213 ------- 

214 specs : `StaticDatasetTablesTuple` 

215 A named tuple containing `ddl.TableSpec` instances. 

216 """ 

217 schema_version = cls.clsNewSchemaVersion(schema_version) 

218 assert schema_version is not None, "New schema version cannot be None" 

219 return makeStaticTableSpecs( 

220 collections, 

221 universe=universe, 

222 dtype=cls.getIdColumnType(), 

223 autoincrement=cls._autoincrement, 

224 schema_version=schema_version, 

225 ) 

226 

227 @classmethod 

228 def getIdColumnType(cls) -> type: 

229 # Docstring inherited from base class. 

230 return cls._idColumnType 

231 

232 @classmethod 

233 def addDatasetForeignKey( 

234 cls, 

235 tableSpec: ddl.TableSpec, 

236 *, 

237 name: str = "dataset", 

238 constraint: bool = True, 

239 onDelete: str | None = None, 

240 **kwargs: Any, 

241 ) -> ddl.FieldSpec: 

242 # Docstring inherited from DatasetRecordStorageManager. 

243 return addDatasetForeignKey( 

244 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

245 ) 

246 

247 def refresh(self) -> None: 

248 # Docstring inherited from DatasetRecordStorageManager. 

249 if self._caching_context.dataset_types is not None: 249 ↛ exit: line 249 didn't return from function 'refresh', because the condition on line 249 was never false

250 self._caching_context.dataset_types.clear() 

251 

252 def _make_storage(self, record: _DatasetTypeRecord) -> ByDimensionsDatasetRecordStorage: 

253 """Create storage instance for a dataset type record.""" 

254 tags_spec = makeTagTableSpec(record.dataset_type, type(self._collections), self.getIdColumnType()) 

255 tags_table_factory = _SpecTableFactory(self._db, record.tag_table_name, tags_spec) 

256 calibs_table_factory = None 

257 if record.calib_table_name is not None: 

258 calibs_spec = makeCalibTableSpec( 

259 record.dataset_type, 

260 type(self._collections), 

261 self._db.getTimespanRepresentation(), 

262 self.getIdColumnType(), 

263 ) 

264 calibs_table_factory = _SpecTableFactory(self._db, record.calib_table_name, calibs_spec) 

265 storage = self._recordStorageType( 

266 db=self._db, 

267 datasetType=record.dataset_type, 

268 static=self._static, 

269 summaries=self._summaries, 

270 tags_table_factory=tags_table_factory, 

271 calibs_table_factory=calibs_table_factory, 

272 dataset_type_id=record.dataset_type_id, 

273 collections=self._collections, 

274 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, 

275 ) 

276 return storage 

277 

278 def remove(self, name: str) -> None: 

279 # Docstring inherited from DatasetRecordStorageManager. 

280 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

281 if componentName is not None: 281 ↛ 282: line 281 didn't jump to line 282, because the condition on line 281 was never true

282 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

283 

284 # Delete the row 

285 try: 

286 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

287 except sqlalchemy.exc.IntegrityError as e: 

288 raise OrphanedRecordError( 

289 f"Dataset type {name} can not be removed." 

290 " It is associated with datasets that must be removed first." 

291 ) from e 

292 

293 # Now refresh everything -- removal is rare enough that this does 

294 # not need to be fast. 

295 self.refresh() 

296 

297 def find(self, name: str) -> DatasetRecordStorage | None: 

298 # Docstring inherited from DatasetRecordStorageManager. 

299 if self._caching_context.dataset_types is not None: 299 ↛ 312: line 299 didn't jump to line 312, because the condition on line 299 was never false

300 _, storage = self._caching_context.dataset_types.get(name) 

301 if storage is not None: 

302 return storage 

303 else: 

304 # On the first cache miss populate the cache with the complete 

305 # list of dataset types (if it was not done yet). 

306 if not self._caching_context.dataset_types.full: 

307 self._fetch_dataset_types() 

308 # Try again 

309 _, storage = self._caching_context.dataset_types.get(name) 

310 if storage is not None: 

311 return storage 

312 record = self._fetch_dataset_type_record(name) 

313 if record is not None: 313 ↛ 314: line 313 didn't jump to line 314, because the condition on line 313 was never true

314 storage = self._make_storage(record) 

315 if self._caching_context.dataset_types is not None: 

316 self._caching_context.dataset_types.add(storage.datasetType, storage) 

317 return storage 

318 else: 

319 return None 

320 

321 def register(self, datasetType: DatasetType) -> bool: 

322 # Docstring inherited from DatasetRecordStorageManager. 

323 if datasetType.isComponent(): 323 ↛ 324: line 323 didn't jump to line 324, because the condition on line 323 was never true

324 raise ValueError( 

325 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

326 ) 

327 record = self._fetch_dataset_type_record(datasetType.name) 

328 if record is None: 

329 dimensionsKey = self._dimensions.save_dimension_group(datasetType.dimensions.as_group()) 

330 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

331 self._db.ensureTableExists( 

332 tagTableName, 

333 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

334 ) 

335 calibTableName = ( 

336 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

337 ) 

338 if calibTableName is not None: 

339 self._db.ensureTableExists( 

340 calibTableName, 

341 makeCalibTableSpec( 

342 datasetType, 

343 type(self._collections), 

344 self._db.getTimespanRepresentation(), 

345 self.getIdColumnType(), 

346 ), 

347 ) 

348 row, inserted = self._db.sync( 

349 self._static.dataset_type, 

350 keys={"name": datasetType.name}, 

351 compared={ 

352 "dimensions_key": dimensionsKey, 

353 # Force the storage class to be loaded to ensure it 

354 # exists and there is no typo in the name. 

355 "storage_class": datasetType.storageClass.name, 

356 }, 

357 extra={ 

358 "tag_association_table": tagTableName, 

359 "calibration_association_table": calibTableName, 

360 }, 

361 returning=["id", "tag_association_table"], 

362 ) 

363 # Make sure that cache is updated 

364 if self._caching_context.dataset_types is not None and row is not None: 364 ↛ 381: line 364 didn't jump to line 381, because the condition on line 364 was never false

365 record = _DatasetTypeRecord( 

366 dataset_type=datasetType, 

367 dataset_type_id=row["id"], 

368 tag_table_name=tagTableName, 

369 calib_table_name=calibTableName, 

370 ) 

371 storage = self._make_storage(record) 

372 self._caching_context.dataset_types.add(datasetType, storage) 

373 else: 

374 if datasetType != record.dataset_type: 

375 raise ConflictingDefinitionError( 

376 f"Given dataset type {datasetType} is inconsistent " 

377 f"with database definition {record.dataset_type}." 

378 ) 

379 inserted = False 

380 

381 return bool(inserted) 

382 

383 def resolve_wildcard( 

384 self, 

385 expression: Any, 

386 missing: list[str] | None = None, 

387 explicit_only: bool = False, 

388 ) -> list[DatasetType]: 

389 wildcard = DatasetTypeWildcard.from_expression(expression) 

390 result: list[DatasetType] = [] 

391 for name, dataset_type in wildcard.values.items(): 

392 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

393 if component_name is not None: 

394 raise DatasetTypeError( 

395 "Component dataset types are not supported in Registry methods; use DatasetRef or " 

396 "DatasetType methods to obtain components from parents instead." 

397 ) 

398 if (found_storage := self.find(parent_name)) is not None: 

399 resolved_dataset_type = found_storage.datasetType 

400 if dataset_type is not None: 

401 if dataset_type.is_compatible_with(resolved_dataset_type): 401 ↛ 406: line 401 didn't jump to line 406, because the condition on line 401 was never false

402 # Prefer the given dataset type to enable storage class 

403 # conversions. 

404 resolved_dataset_type = dataset_type 

405 else: 

406 raise DatasetTypeError( 

407 f"Dataset type definition in query expression {dataset_type} is " 

408 f"not compatible with the registered type {resolved_dataset_type}." 

409 ) 

410 result.append(resolved_dataset_type) 

411 elif missing is not None: 

412 missing.append(name) 

413 if wildcard.patterns is ...: 

414 if explicit_only: 

415 raise TypeError( 

416 "Universal wildcard '...' is not permitted for dataset types in this context." 

417 ) 

418 for datasetType in self._fetch_dataset_types(): 

419 result.append(datasetType) 

420 elif wildcard.patterns: 

421 if explicit_only: 

422 raise DatasetTypeExpressionError( 

423 "Dataset type wildcard expressions are not supported in this context." 

424 ) 

425 dataset_types = self._fetch_dataset_types() 

426 for datasetType in dataset_types: 

427 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

428 result.append(datasetType) 

429 

430 return result 

431 

432 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

433 # Docstring inherited from DatasetRecordStorageManager. 

434 sql = ( 

435 sqlalchemy.sql.select( 

436 self._static.dataset.columns.dataset_type_id, 

437 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

438 *self._static.dataset_type.columns, 

439 ) 

440 .select_from(self._static.dataset) 

441 .join(self._static.dataset_type) 

442 .where(self._static.dataset.columns.id == id) 

443 ) 

444 with self._db.query(sql) as sql_result: 

445 row = sql_result.mappings().fetchone() 

446 if row is None: 

447 return None 

448 record = self._record_from_row(row) 

449 storage: DatasetRecordStorage | None = None 

450 if self._caching_context.dataset_types is not None: 450 ↛ 452: line 450 didn't jump to line 452, because the condition on line 450 was never false

451 _, storage = self._caching_context.dataset_types.get(record.dataset_type.name) 

452 if storage is None: 

453 storage = self._make_storage(record) 

454 if self._caching_context.dataset_types is not None: 454 ↛ 456: line 454 didn't jump to line 456, because the condition on line 454 was never false

455 self._caching_context.dataset_types.add(storage.datasetType, storage) 

456 assert isinstance(storage, ByDimensionsDatasetRecordStorage), "Not expected storage class" 

457 return DatasetRef( 

458 storage.datasetType, 

459 dataId=storage.getDataId(id=id), 

460 id=id, 

461 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

462 ) 

463 

464 def _fetch_dataset_type_record(self, name: str) -> _DatasetTypeRecord | None: 

465 """Retrieve all dataset types defined in database. 

466 

467 Yields 

468 ------ 

469 dataset_types : `_DatasetTypeRecord` 

470 Information from a single database record. 

471 """ 

472 c = self._static.dataset_type.columns 

473 stmt = self._static.dataset_type.select().where(c.name == name) 

474 with self._db.query(stmt) as sql_result: 

475 row = sql_result.mappings().one_or_none() 

476 if row is None: 

477 return None 

478 else: 

479 return self._record_from_row(row) 

480 

481 def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord: 

482 name = row["name"] 

483 dimensions = self._dimensions.load_dimension_group(row["dimensions_key"]) 

484 calibTableName = row["calibration_association_table"] 

485 datasetType = DatasetType( 

486 name, dimensions, row["storage_class"], isCalibration=(calibTableName is not None) 

487 ) 

488 return _DatasetTypeRecord( 

489 dataset_type=datasetType, 

490 dataset_type_id=row["id"], 

491 tag_table_name=row["tag_association_table"], 

492 calib_table_name=calibTableName, 

493 ) 

494 

495 def _dataset_type_from_row(self, row: Mapping) -> DatasetType: 

496 return self._record_from_row(row).dataset_type 

497 

498 def _fetch_dataset_types(self) -> list[DatasetType]: 

499 """Fetch list of all defined dataset types.""" 

500 if self._caching_context.dataset_types is not None: 500 ↛ 503: line 500 didn't jump to line 503, because the condition on line 500 was never false

501 if self._caching_context.dataset_types.full: 

502 return [dataset_type for dataset_type, _ in self._caching_context.dataset_types.items()] 

503 with self._db.query(self._static.dataset_type.select()) as sql_result: 

504 sql_rows = sql_result.mappings().fetchall() 

505 records = [self._record_from_row(row) for row in sql_rows] 

506 # Cache everything and specify that cache is complete. 

507 if self._caching_context.dataset_types is not None: 507 ↛ 510: line 507 didn't jump to line 510, because the condition on line 507 was never false

508 cache_data = [(record.dataset_type, self._make_storage(record)) for record in records] 

509 self._caching_context.dataset_types.set(cache_data, full=True) 

510 return [record.dataset_type for record in records] 

511 

512 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

513 # Docstring inherited from DatasetRecordStorageManager. 

514 summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_from_row) 

515 return summaries[collection.key] 

516 

517 def fetch_summaries( 

518 self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None 

519 ) -> Mapping[Any, CollectionSummary]: 

520 # Docstring inherited from DatasetRecordStorageManager. 

521 dataset_type_names: Iterable[str] | None = None 

522 if dataset_types is not None: 522 ↛ 524: line 522 didn't jump to line 524, because the condition on line 522 was never false

523 dataset_type_names = set(dataset_type.name for dataset_type in dataset_types) 

524 return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row) 

525 

526 _versions: list[VersionTuple] 

527 """Schema version for this class.""" 

528 

529 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

530 """Type of the storage class returned by this manager.""" 

531 

532 _autoincrement: bool 

533 """If True then PK column of the dataset table is auto-increment.""" 

534 

535 _idColumnType: type 

536 """Type of dataset column used to store dataset ID.""" 

537 

538 

539class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

540 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

541 UUID for dataset primary key. 

542 """ 

543 

544 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS] 

545 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

546 _autoincrement: bool = False 

547 _idColumnType: type = ddl.GUID 

548 

549 def clone( 

550 self, 

551 *, 

552 db: Database, 

553 collections: CollectionManager, 

554 dimensions: DimensionRecordStorageManager, 

555 caching_context: CachingContext, 

556 ) -> ByDimensionsDatasetRecordStorageManagerUUID: 

557 return ByDimensionsDatasetRecordStorageManagerUUID( 

558 db=db, 

559 collections=collections, 

560 dimensions=dimensions, 

561 static=self._static, 

562 summaries=self._summaries.clone(db=db, collections=collections, caching_context=caching_context), 

563 caching_context=caching_context, 

564 registry_schema_version=self._registry_schema_version, 

565 ) 

566 

567 @classmethod 

568 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

569 # Docstring inherited from DatasetRecordStorageManager. 

570 return True 

571 

572 @classmethod 

573 def _newDefaultSchemaVersion(cls) -> VersionTuple: 

574 # Docstring inherited from VersionedExtension. 

575 

576 # By default return 1.0.0 so that older clients can still access new 

577 # registries created with a default config. 

578 return _VERSION_UUID 

579 

580 def ingest_date_dtype(self) -> type: 

581 """Return type of the ``ingest_date`` column.""" 

582 schema_version = self.newSchemaVersion() 

583 if schema_version is not None and schema_version.major > 1: 

584 return ddl.AstropyTimeNsecTai 

585 else: 

586 return sqlalchemy.TIMESTAMP