Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%

231 statements  

coverage.py v7.4.4, created at 2024-04-19 03:43 -0700

1 from __future__ import annotations

2

3 from .... import ddl

4

5 __all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)

6

7 import dataclasses

8 import logging

9 from collections.abc import Iterable, Mapping

10 from typing import TYPE_CHECKING, Any

11

12 import sqlalchemy

13

14 from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType

15 from ...._exceptions_legacy import DatasetTypeError

16 from ....dimensions import DimensionUniverse

17 from ..._collection_summary import CollectionSummary

18 from ..._exceptions import ConflictingDefinitionError, DatasetTypeExpressionError, OrphanedRecordError

19 from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple

20 from ...wildcards import DatasetTypeWildcard

21 from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID

22 from .summaries import CollectionSummaryManager

23 from .tables import (

24 addDatasetForeignKey, 

25 makeCalibTableName, 

26 makeCalibTableSpec, 

27 makeStaticTableSpecs, 

28 makeTagTableName, 

29 makeTagTableSpec, 

30 )

31 

32 if TYPE_CHECKING:

33 from ..._caching_context import CachingContext 

34 from ...interfaces import ( 

35 CollectionManager, 

36 CollectionRecord, 

37 Database, 

38 DimensionRecordStorageManager, 

39 StaticTablesContext, 

40 ) 

41 from .tables import StaticDatasetTablesTuple 

42 

43 

44 # This has to be updated on every schema change

45 _VERSION_UUID = VersionTuple(1, 0, 0)

46 # Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead

47 # of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of

48 # the client migration period.

49 _VERSION_UUID_NS = VersionTuple(2, 0, 0)

50

51 _LOG = logging.getLogger(__name__)

52 

53 

54 class MissingDatabaseTableError(RuntimeError):

55 """Exception raised when a table is not found in a database.""" 

56 

57 

58 @dataclasses.dataclass

59 class _DatasetTypeRecord:

60 """Contents of a single dataset type record.""" 

61 

62 dataset_type: DatasetType 

63 dataset_type_id: int 

64 tag_table_name: str 

65 calib_table_name: str | None 

66 

67 

68 class _SpecTableFactory:

69 """Factory for `sqlalchemy.schema.Table` instances that builds table 

70 instances using provided `ddl.TableSpec` definition and verifies that 

71 table exists in the database. 

72 """ 

73 

74 def __init__(self, db: Database, name: str, spec: ddl.TableSpec): 

75 self._db = db 

76 self._name = name 

77 self._spec = spec 

78 

79 def __call__(self) -> sqlalchemy.schema.Table: 

80 table = self._db.getExistingTable(self._name, self._spec) 

81 if table is None:  [81 ↛ 82: line 81 didn't jump to line 82, because the condition on line 81 was never true]

82 raise MissingDatabaseTableError(f"Table {self._name} is missing from database schema.") 

83 return table 

84 

85 

86 class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):

87 """A manager class for datasets that uses one dataset-collection table for 

88 each group of dataset types that share the same dimensions. 

89 

90 In addition to the table organization, this class makes a number of 

91 other design choices that would have been cumbersome (to say the least) to 

92 try to pack into its name: 

93 

94 - It uses a private surrogate integer autoincrement field to identify 

95 dataset types, instead of using the name as the primary and foreign key 

96 directly. 

97 

98 - It aggressively loads all DatasetTypes into memory instead of fetching 

99 them from the database only when needed or attempting more clever forms 

100 of caching. 

101 

102 Alternative implementations that make different choices for these while 

103 keeping the same general table organization might be reasonable as well. 

104 

105 This class provides a complete implementation of the manager logic, but it

106 is parametrized by a few class attributes that have to be defined by

107 subclasses.

108 

109 Parameters 

110 ---------- 

111 db : `Database` 

112 Interface to the underlying database engine and namespace. 

113 collections : `CollectionManager` 

114 Manager object for the collections in this `Registry`. 

115 dimensions : `DimensionRecordStorageManager` 

116 Manager object for the dimensions in this `Registry`. 

117 static : `StaticDatasetTablesTuple` 

118 Named tuple of `sqlalchemy.schema.Table` instances for all static 

119 tables used by this class. 

120 summaries : `CollectionSummaryManager` 

121 Structure containing tables that summarize the contents of collections. 

122 caching_context : `CachingContext` 

123 Object controlling caching of information returned by managers. 

124 registry_schema_version : `VersionTuple` or `None`, optional 

125 Version of registry schema. 

126 """ 

127 

128 def __init__( 

129 self, 

130 *, 

131 db: Database, 

132 collections: CollectionManager, 

133 dimensions: DimensionRecordStorageManager, 

134 static: StaticDatasetTablesTuple, 

135 summaries: CollectionSummaryManager, 

136 caching_context: CachingContext, 

137 registry_schema_version: VersionTuple | None = None, 

138 ): 

139 super().__init__(registry_schema_version=registry_schema_version) 

140 self._db = db 

141 self._collections = collections 

142 self._dimensions = dimensions 

143 self._static = static 

144 self._summaries = summaries 

145 self._caching_context = caching_context 

146 

147 @classmethod 

148 def initialize( 

149 cls, 

150 db: Database, 

151 context: StaticTablesContext, 

152 *, 

153 collections: CollectionManager, 

154 dimensions: DimensionRecordStorageManager, 

155 caching_context: CachingContext, 

156 registry_schema_version: VersionTuple | None = None, 

157 ) -> DatasetRecordStorageManager: 

158 # Docstring inherited from DatasetRecordStorageManager. 

159 specs = cls.makeStaticTableSpecs( 

160 type(collections), universe=dimensions.universe, schema_version=registry_schema_version 

161 ) 

162 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

163 summaries = CollectionSummaryManager.initialize( 

164 db, 

165 context, 

166 collections=collections, 

167 dimensions=dimensions, 

168 dataset_type_table=static.dataset_type, 

169 caching_context=caching_context, 

170 ) 

171 return cls( 

172 db=db, 

173 collections=collections, 

174 dimensions=dimensions, 

175 static=static, 

176 summaries=summaries, 

177 caching_context=caching_context, 

178 registry_schema_version=registry_schema_version, 

179 ) 

180 

181 @classmethod 

182 def currentVersions(cls) -> list[VersionTuple]: 

183 # Docstring inherited from VersionedExtension. 

184 return cls._versions 

185 

186 @classmethod 

187 def makeStaticTableSpecs( 

188 cls, 

189 collections: type[CollectionManager], 

190 universe: DimensionUniverse, 

191 schema_version: VersionTuple | None, 

192 ) -> StaticDatasetTablesTuple: 

193 """Construct all static tables used by the classes in this package. 

194 

195 Static tables are those that are present in all Registries and do not 

196 depend on what DatasetTypes have been registered. 

197 

198 Parameters 

199 ---------- 

200 collections : `type` [ `CollectionManager` ]

201 Manager class (not an instance) for the collections in this `Registry`.

202 universe : `DimensionUniverse` 

203 Universe graph containing all dimensions known to this `Registry`. 

204 schema_version : `VersionTuple` or `None` 

205 Version of the schema that should be created; if `None`, the

206 default schema version is used.

207 

208 Returns 

209 ------- 

210 specs : `StaticDatasetTablesTuple` 

211 A named tuple containing `ddl.TableSpec` instances. 

212 """ 

213 schema_version = cls.clsNewSchemaVersion(schema_version) 

214 assert schema_version is not None, "New schema version cannot be None" 

215 return makeStaticTableSpecs( 

216 collections, 

217 universe=universe, 

218 dtype=cls.getIdColumnType(), 

219 autoincrement=cls._autoincrement, 

220 schema_version=schema_version, 

221 ) 

222 

223 @classmethod 

224 def getIdColumnType(cls) -> type: 

225 # Docstring inherited from base class. 

226 return cls._idColumnType 

227 

228 @classmethod 

229 def addDatasetForeignKey( 

230 cls, 

231 tableSpec: ddl.TableSpec, 

232 *, 

233 name: str = "dataset", 

234 constraint: bool = True, 

235 onDelete: str | None = None, 

236 **kwargs: Any, 

237 ) -> ddl.FieldSpec: 

238 # Docstring inherited from DatasetRecordStorageManager. 

239 return addDatasetForeignKey( 

240 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

241 ) 

242 

243 def refresh(self) -> None: 

244 # Docstring inherited from DatasetRecordStorageManager. 

245 if self._caching_context.dataset_types is not None:  [245 ↛ exit: line 245 didn't return from function 'refresh', because the condition on line 245 was never false]

246 self._caching_context.dataset_types.clear() 

247 

248 def _make_storage(self, record: _DatasetTypeRecord) -> ByDimensionsDatasetRecordStorage: 

249 """Create storage instance for a dataset type record.""" 

250 tags_spec = makeTagTableSpec(record.dataset_type, type(self._collections), self.getIdColumnType()) 

251 tags_table_factory = _SpecTableFactory(self._db, record.tag_table_name, tags_spec) 

252 calibs_table_factory = None 

253 if record.calib_table_name is not None: 

254 calibs_spec = makeCalibTableSpec( 

255 record.dataset_type, 

256 type(self._collections), 

257 self._db.getTimespanRepresentation(), 

258 self.getIdColumnType(), 

259 ) 

260 calibs_table_factory = _SpecTableFactory(self._db, record.calib_table_name, calibs_spec) 

261 storage = self._recordStorageType( 

262 db=self._db, 

263 datasetType=record.dataset_type, 

264 static=self._static, 

265 summaries=self._summaries, 

266 tags_table_factory=tags_table_factory, 

267 calibs_table_factory=calibs_table_factory, 

268 dataset_type_id=record.dataset_type_id, 

269 collections=self._collections, 

270 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, 

271 ) 

272 return storage 

273 

274 def remove(self, name: str) -> None: 

275 # Docstring inherited from DatasetRecordStorageManager. 

276 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

277 if componentName is not None:  [277 ↛ 278: line 277 didn't jump to line 278, because the condition on line 277 was never true]

278 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

279 

280 # Delete the row 

281 try: 

282 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

283 except sqlalchemy.exc.IntegrityError as e: 

284 raise OrphanedRecordError( 

285 f"Dataset type {name} can not be removed." 

286 " It is associated with datasets that must be removed first." 

287 ) from e 

288 

289 # Now refresh everything -- removal is rare enough that this does 

290 # not need to be fast. 

291 self.refresh() 

292 

293 def find(self, name: str) -> DatasetRecordStorage | None: 

294 # Docstring inherited from DatasetRecordStorageManager. 

295 if self._caching_context.dataset_types is not None:  [295 ↛ 308: line 295 didn't jump to line 308, because the condition on line 295 was never false]

296 _, storage = self._caching_context.dataset_types.get(name) 

297 if storage is not None: 

298 return storage 

299 else: 

300 # On the first cache miss populate the cache with the complete list

301 # of dataset types (if that has not been done yet).

302 if not self._caching_context.dataset_types.full: 

303 self._fetch_dataset_types() 

304 # Try again 

305 _, storage = self._caching_context.dataset_types.get(name) 

306 if storage is not None: 

307 return storage 

308 record = self._fetch_dataset_type_record(name) 

309 if record is not None:  [309 ↛ 310: line 309 didn't jump to line 310, because the condition on line 309 was never true]

310 storage = self._make_storage(record) 

311 if self._caching_context.dataset_types is not None: 

312 self._caching_context.dataset_types.add(storage.datasetType, storage) 

313 return storage 

314 else: 

315 return None 

316 

317 def register(self, datasetType: DatasetType) -> bool: 

318 # Docstring inherited from DatasetRecordStorageManager. 

319 if datasetType.isComponent():  [319 ↛ 320: line 319 didn't jump to line 320, because the condition on line 319 was never true]

320 raise ValueError( 

321 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

322 ) 

323 record = self._fetch_dataset_type_record(datasetType.name) 

324 if record is None: 

325 dimensionsKey = self._dimensions.save_dimension_group(datasetType.dimensions.as_group()) 

326 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

327 self._db.ensureTableExists( 

328 tagTableName, 

329 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

330 ) 

331 calibTableName = ( 

332 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

333 ) 

334 if calibTableName is not None: 

335 self._db.ensureTableExists( 

336 calibTableName, 

337 makeCalibTableSpec( 

338 datasetType, 

339 type(self._collections), 

340 self._db.getTimespanRepresentation(), 

341 self.getIdColumnType(), 

342 ), 

343 ) 

344 row, inserted = self._db.sync( 

345 self._static.dataset_type, 

346 keys={"name": datasetType.name}, 

347 compared={ 

348 "dimensions_key": dimensionsKey, 

349 # Force the storage class to be loaded to ensure it 

350 # exists and there is no typo in the name. 

351 "storage_class": datasetType.storageClass.name, 

352 }, 

353 extra={ 

354 "tag_association_table": tagTableName, 

355 "calibration_association_table": calibTableName, 

356 }, 

357 returning=["id", "tag_association_table"], 

358 ) 

359 # Make sure that cache is updated 

360 if self._caching_context.dataset_types is not None and row is not None:  [360 ↛ 377: line 360 didn't jump to line 377, because the condition on line 360 was never false]

361 record = _DatasetTypeRecord( 

362 dataset_type=datasetType, 

363 dataset_type_id=row["id"], 

364 tag_table_name=tagTableName, 

365 calib_table_name=calibTableName, 

366 ) 

367 storage = self._make_storage(record) 

368 self._caching_context.dataset_types.add(datasetType, storage) 

369 else: 

370 if datasetType != record.dataset_type: 

371 raise ConflictingDefinitionError( 

372 f"Given dataset type {datasetType} is inconsistent " 

373 f"with database definition {record.dataset_type}." 

374 ) 

375 inserted = False 

376 

377 return bool(inserted) 

378 

379 def resolve_wildcard( 

380 self, 

381 expression: Any, 

382 missing: list[str] | None = None, 

383 explicit_only: bool = False, 

384 ) -> list[DatasetType]: 

385 wildcard = DatasetTypeWildcard.from_expression(expression) 

386 result: list[DatasetType] = [] 

387 for name, dataset_type in wildcard.values.items(): 

388 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

389 if component_name is not None: 

390 raise DatasetTypeError( 

391 "Component dataset types are not supported in Registry methods; use DatasetRef or " 

392 "DatasetType methods to obtain components from parents instead." 

393 ) 

394 if (found_storage := self.find(parent_name)) is not None: 

395 resolved_dataset_type = found_storage.datasetType 

396 if dataset_type is not None: 

397 if dataset_type.is_compatible_with(resolved_dataset_type):  [397 ↛ 402: line 397 didn't jump to line 402, because the condition on line 397 was never false]

398 # Prefer the given dataset type to enable storage class 

399 # conversions. 

400 resolved_dataset_type = dataset_type 

401 else: 

402 raise DatasetTypeError( 

403 f"Dataset type definition in query expression {dataset_type} is " 

404 f"not compatible with the registered type {resolved_dataset_type}." 

405 ) 

406 result.append(resolved_dataset_type) 

407 elif missing is not None: 

408 missing.append(name) 

409 if wildcard.patterns is ...: 

410 if explicit_only: 

411 raise TypeError( 

412 "Universal wildcard '...' is not permitted for dataset types in this context." 

413 ) 

414 for datasetType in self._fetch_dataset_types(): 

415 result.append(datasetType) 

416 elif wildcard.patterns: 

417 if explicit_only: 

418 raise DatasetTypeExpressionError( 

419 "Dataset type wildcard expressions are not supported in this context." 

420 ) 

421 dataset_types = self._fetch_dataset_types() 

422 for datasetType in dataset_types: 

423 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

424 result.append(datasetType) 

425 

426 return result 

427 

428 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

429 # Docstring inherited from DatasetRecordStorageManager. 

430 sql = ( 

431 sqlalchemy.sql.select( 

432 self._static.dataset.columns.dataset_type_id, 

433 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

434 *self._static.dataset_type.columns, 

435 ) 

436 .select_from(self._static.dataset) 

437 .join(self._static.dataset_type) 

438 .where(self._static.dataset.columns.id == id) 

439 ) 

440 with self._db.query(sql) as sql_result: 

441 row = sql_result.mappings().fetchone() 

442 if row is None: 

443 return None 

444 record = self._record_from_row(row) 

445 storage: DatasetRecordStorage | None = None 

446 if self._caching_context.dataset_types is not None:  [446 ↛ 448: line 446 didn't jump to line 448, because the condition on line 446 was never false]

447 _, storage = self._caching_context.dataset_types.get(record.dataset_type.name) 

448 if storage is None: 

449 storage = self._make_storage(record) 

450 if self._caching_context.dataset_types is not None:  [450 ↛ 452: line 450 didn't jump to line 452, because the condition on line 450 was never false]

451 self._caching_context.dataset_types.add(storage.datasetType, storage) 

452 assert isinstance(storage, ByDimensionsDatasetRecordStorage), "Not expected storage class" 

453 return DatasetRef( 

454 storage.datasetType, 

455 dataId=storage.getDataId(id=id), 

456 id=id, 

457 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

458 ) 

459 

460 def _fetch_dataset_type_record(self, name: str) -> _DatasetTypeRecord | None: 

461 """Retrieve all dataset types defined in database. 

462 

463 Yields 

464 ------ 

465 dataset_types : `_DatasetTypeRecord` 

466 Information from a single database record. 

467 """ 

468 c = self._static.dataset_type.columns 

469 stmt = self._static.dataset_type.select().where(c.name == name) 

470 with self._db.query(stmt) as sql_result: 

471 row = sql_result.mappings().one_or_none() 

472 if row is None: 

473 return None 

474 else: 

475 return self._record_from_row(row) 

476 

477 def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord: 

478 name = row["name"] 

479 dimensions = self._dimensions.load_dimension_group(row["dimensions_key"]) 

480 calibTableName = row["calibration_association_table"] 

481 datasetType = DatasetType( 

482 name, dimensions, row["storage_class"], isCalibration=(calibTableName is not None) 

483 ) 

484 return _DatasetTypeRecord( 

485 dataset_type=datasetType, 

486 dataset_type_id=row["id"], 

487 tag_table_name=row["tag_association_table"], 

488 calib_table_name=calibTableName, 

489 ) 

490 

491 def _dataset_type_from_row(self, row: Mapping) -> DatasetType: 

492 return self._record_from_row(row).dataset_type 

493 

494 def _fetch_dataset_types(self) -> list[DatasetType]: 

495 """Fetch list of all defined dataset types.""" 

496 if self._caching_context.dataset_types is not None:  [496 ↛ 499: line 496 didn't jump to line 499, because the condition on line 496 was never false]

497 if self._caching_context.dataset_types.full: 

498 return [dataset_type for dataset_type, _ in self._caching_context.dataset_types.items()] 

499 with self._db.query(self._static.dataset_type.select()) as sql_result: 

500 sql_rows = sql_result.mappings().fetchall() 

501 records = [self._record_from_row(row) for row in sql_rows] 

502 # Cache everything and specify that cache is complete. 

503 if self._caching_context.dataset_types is not None:  [503 ↛ 506: line 503 didn't jump to line 506, because the condition on line 503 was never false]

504 cache_data = [(record.dataset_type, self._make_storage(record)) for record in records] 

505 self._caching_context.dataset_types.set(cache_data, full=True) 

506 return [record.dataset_type for record in records] 

507 

508 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

509 # Docstring inherited from DatasetRecordStorageManager. 

510 summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_from_row) 

511 return summaries[collection.key] 

512 

513 def fetch_summaries( 

514 self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None 

515 ) -> Mapping[Any, CollectionSummary]: 

516 # Docstring inherited from DatasetRecordStorageManager. 

517 dataset_type_names: Iterable[str] | None = None 

518 if dataset_types is not None:  [518 ↛ 520: line 518 didn't jump to line 520, because the condition on line 518 was never false]

519 dataset_type_names = set(dataset_type.name for dataset_type in dataset_types) 

520 return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row) 

521 

522 _versions: list[VersionTuple] 

523 """Schema version for this class.""" 

524 

525 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

526 """Type of the storage class returned by this manager.""" 

527 

528 _autoincrement: bool 

529 """If True then PK column of the dataset table is auto-increment.""" 

530 

531 _idColumnType: type 

532 """Type of dataset column used to store dataset ID.""" 

533 

534 

535 class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):

536 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

537 UUID for dataset primary key. 

538 """ 

539 

540 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS] 

541 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

542 _autoincrement: bool = False 

543 _idColumnType: type = ddl.GUID 

544 

545 def clone( 

546 self, 

547 *, 

548 db: Database, 

549 collections: CollectionManager, 

550 dimensions: DimensionRecordStorageManager, 

551 caching_context: CachingContext, 

552 ) -> ByDimensionsDatasetRecordStorageManagerUUID: 

553 return ByDimensionsDatasetRecordStorageManagerUUID( 

554 db=db, 

555 collections=collections, 

556 dimensions=dimensions, 

557 static=self._static, 

558 summaries=self._summaries.clone(db=db, collections=collections, caching_context=caching_context), 

559 caching_context=caching_context, 

560 registry_schema_version=self._registry_schema_version, 

561 ) 

562 

563 @classmethod 

564 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

565 # Docstring inherited from DatasetRecordStorageManager. 

566 return True 

567 

568 @classmethod 

569 def _newDefaultSchemaVersion(cls) -> VersionTuple: 

570 # Docstring inherited from VersionedExtension. 

571 

572 # By default return 1.0.0 so that older clients can still access new 

573 # registries created with a default config. 

574 return _VERSION_UUID 

575 

576 def ingest_date_dtype(self) -> type: 

577 """Return type of the ``ingest_date`` column.""" 

578 schema_version = self.newSchemaVersion() 

579 if schema_version is not None and schema_version.major > 1: 

580 return ddl.AstropyTimeNsecTai 

581 else: 

582 return sqlalchemy.TIMESTAMP
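
The listing above shows that ingest_date_dtype() selects the ingest_date column type from the registry schema version: major version 1 keeps the legacy sqlalchemy.TIMESTAMP column, while major version 2 and later store nanosecond-precision TAI times (ddl.AstropyTimeNsecTai). The following is a minimal standalone sketch of that version gating only; the VersionTuple stand-in and the helper function are illustrative and are not part of the module shown above.

from __future__ import annotations

from typing import NamedTuple


class VersionTuple(NamedTuple):
    # Minimal stand-in for the registry VersionTuple (major, minor, patch).
    major: int
    minor: int
    patch: int


def ingest_date_uses_nanoseconds(schema_version: VersionTuple | None) -> bool:
    # Mirrors the check in ingest_date_dtype(): schema 2.x and later store
    # nanosecond TAI times; schema 1.x (and an unset version) keep TIMESTAMP.
    return schema_version is not None and schema_version.major > 1


assert not ingest_date_uses_nanoseconds(VersionTuple(1, 0, 0))  # legacy TIMESTAMP
assert ingest_date_uses_nanoseconds(VersionTuple(2, 0, 0))  # AstropyTimeNsecTai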