Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 95%

207 statements  

coverage.py v7.3.2, created at 2023-10-27 09:43 +0000

  1  from __future__ import annotations
  2
  3  from .... import ddl
  4
  5  __all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)
  6
  7  import logging
  8  import warnings
  9  from collections import defaultdict
 10  from typing import TYPE_CHECKING, Any
 11
 12  import sqlalchemy
 13  from lsst.utils.introspection import find_outside_stacklevel
 14
 15  from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType
 16  from ....dimensions import DimensionUniverse
 17  from ..._collection_summary import CollectionSummary
 18  from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
 19  from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
 20  from ...wildcards import DatasetTypeWildcard
 21  from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
 22  from .summaries import CollectionSummaryManager
 23  from .tables import (
 24      addDatasetForeignKey,
 25      makeCalibTableName,
 26      makeCalibTableSpec,
 27      makeStaticTableSpecs,
 28      makeTagTableName,
 29      makeTagTableSpec,
 30  )
 31
 32  if TYPE_CHECKING:
 33      from ...interfaces import (
 34          CollectionManager,
 35          CollectionRecord,
 36          Database,
 37          DimensionRecordStorageManager,
 38          StaticTablesContext,
 39      )
 40      from .tables import StaticDatasetTablesTuple
 41
 42
 43  # This has to be updated on every schema change
 44  _VERSION_UUID = VersionTuple(1, 0, 0)
 45  # Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead
 46  # of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of
 47  # the client migration period.
 48  _VERSION_UUID_NS = VersionTuple(2, 0, 0)
 49
 50  _LOG = logging.getLogger(__name__)
 51
 52
 53  class MissingDatabaseTableError(RuntimeError):
 54      """Exception raised when a table is not found in a database."""
 55
 56
 57  class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
 58      """A manager class for datasets that uses one dataset-collection table for
 59      each group of dataset types that share the same dimensions.
 60
 61      In addition to the table organization, this class makes a number of
 62      other design choices that would have been cumbersome (to say the least) to
 63      try to pack into its name:
 64
 65      - It uses a private surrogate integer autoincrement field to identify
 66        dataset types, instead of using the name as the primary and foreign key
 67        directly.
 68
 69      - It aggressively loads all DatasetTypes into memory instead of fetching
 70        them from the database only when needed or attempting more clever forms
 71        of caching.
 72
 73      Alternative implementations that make different choices for these while
 74      keeping the same general table organization might be reasonable as well.
 75
 76      This class provides a complete implementation of the manager logic, but it
 77      is parametrized by a few class attributes that have to be defined by
 78      subclasses.
 79
 80      Parameters
 81      ----------
 82      db : `Database`
 83          Interface to the underlying database engine and namespace.
 84      collections : `CollectionManager`
 85          Manager object for the collections in this `Registry`.
 86      dimensions : `DimensionRecordStorageManager`
 87          Manager object for the dimensions in this `Registry`.
 88      static : `StaticDatasetTablesTuple`
 89          Named tuple of `sqlalchemy.schema.Table` instances for all static
 90          tables used by this class.
 91      summaries : `CollectionSummaryManager`
 92          Structure containing tables that summarize the contents of collections.
 93      """
 94
 95      def __init__(
 96          self,
 97          *,
 98          db: Database,
 99          collections: CollectionManager,
100          dimensions: DimensionRecordStorageManager,
101          static: StaticDatasetTablesTuple,
102          summaries: CollectionSummaryManager,
103          registry_schema_version: VersionTuple | None = None,
104      ):
105          super().__init__(registry_schema_version=registry_schema_version)
106          self._db = db
107          self._collections = collections
108          self._dimensions = dimensions
109          self._static = static
110          self._summaries = summaries
111          self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
112          self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
113
114      @classmethod
115      def initialize(
116          cls,
117          db: Database,
118          context: StaticTablesContext,
119          *,
120          collections: CollectionManager,
121          dimensions: DimensionRecordStorageManager,
122          registry_schema_version: VersionTuple | None = None,
123      ) -> DatasetRecordStorageManager:
124          # Docstring inherited from DatasetRecordStorageManager.
125          specs = cls.makeStaticTableSpecs(
126              type(collections), universe=dimensions.universe, schema_version=registry_schema_version
127          )
128          static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
129          summaries = CollectionSummaryManager.initialize(
130              db,
131              context,
132              collections=collections,
133              dimensions=dimensions,
134          )
135          return cls(
136              db=db,
137              collections=collections,
138              dimensions=dimensions,
139              static=static,
140              summaries=summaries,
141              registry_schema_version=registry_schema_version,
142          )
143
144      @classmethod
145      def currentVersions(cls) -> list[VersionTuple]:
146          # Docstring inherited from VersionedExtension.
147          return cls._versions
148
149      @classmethod
150      def makeStaticTableSpecs(
151          cls,
152          collections: type[CollectionManager],
153          universe: DimensionUniverse,
154          schema_version: VersionTuple | None,
155      ) -> StaticDatasetTablesTuple:
156          """Construct all static tables used by the classes in this package.
157
158          Static tables are those that are present in all Registries and do not
159          depend on what DatasetTypes have been registered.
160
161          Parameters
162          ----------
163          collections : `CollectionManager`
164              Manager object for the collections in this `Registry`.
165          universe : `DimensionUniverse`
166              Universe graph containing all dimensions known to this `Registry`.
167          schema_version : `VersionTuple` or `None`
168              Version of the schema that should be created; if `None`, the
169              default schema version is used.
170
171          Returns
172          -------
173          specs : `StaticDatasetTablesTuple`
174              A named tuple containing `ddl.TableSpec` instances.
175          """
176          schema_version = cls.clsNewSchemaVersion(schema_version)
177          assert schema_version is not None, "New schema version cannot be None"
178          return makeStaticTableSpecs(
179              collections,
180              universe=universe,
181              dtype=cls.getIdColumnType(),
182              autoincrement=cls._autoincrement,
183              schema_version=schema_version,
184          )
185
186      @classmethod
187      def getIdColumnType(cls) -> type:
188          # Docstring inherited from base class.
189          return cls._idColumnType
190
191      @classmethod
192      def addDatasetForeignKey(
193          cls,
194          tableSpec: ddl.TableSpec,
195          *,
196          name: str = "dataset",
197          constraint: bool = True,
198          onDelete: str | None = None,
199          **kwargs: Any,
200      ) -> ddl.FieldSpec:
201          # Docstring inherited from DatasetRecordStorageManager.
202          return addDatasetForeignKey(
203              tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
204          )
205
206      def refresh(self) -> None:
207          # Docstring inherited from DatasetRecordStorageManager.
208          byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
209          byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
210          dataset_types: dict[int, DatasetType] = {}
211          c = self._static.dataset_type.columns
212          with self._db.query(self._static.dataset_type.select()) as sql_result:
213              sql_rows = sql_result.mappings().fetchall()
214          for row in sql_rows:
215              name = row[c.name]
216              dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
217              calibTableName = row[c.calibration_association_table]
218              datasetType = DatasetType(
219                  name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
220              )
221              tags = self._db.getExistingTable(
222                  row[c.tag_association_table],
223                  makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
224              )
225              if tags is None:    225 ↛ 226
226                  raise MissingDatabaseTableError(
227                      f"Table {row[c.tag_association_table]} is missing from database schema."
228                  )
229              if calibTableName is not None:
230                  calibs = self._db.getExistingTable(
231                      row[c.calibration_association_table],
232                      makeCalibTableSpec(
233                          datasetType,
234                          type(self._collections),
235                          self._db.getTimespanRepresentation(),
236                          self.getIdColumnType(),
237                      ),
238                  )
239                  if calibs is None:    239 ↛ 240
240                      raise MissingDatabaseTableError(
241                          f"Table {row[c.calibration_association_table]} is missing from database schema."
242                      )
243              else:
244                  calibs = None
245              storage = self._recordStorageType(
246                  db=self._db,
247                  datasetType=datasetType,
248                  static=self._static,
249                  summaries=self._summaries,
250                  tags=tags,
251                  calibs=calibs,
252                  dataset_type_id=row["id"],
253                  collections=self._collections,
254                  use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
255              )
256              byName[datasetType.name] = storage
257              byId[storage._dataset_type_id] = storage
258              dataset_types[row["id"]] = datasetType
259          self._byName = byName
260          self._byId = byId
261          self._summaries.refresh(dataset_types)
262
263      def remove(self, name: str) -> None:
264          # Docstring inherited from DatasetRecordStorageManager.
265          compositeName, componentName = DatasetType.splitDatasetTypeName(name)
266          if componentName is not None:
267              raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")
268
269          # Delete the row
270          try:
271              self._db.delete(self._static.dataset_type, ["name"], {"name": name})
272          except sqlalchemy.exc.IntegrityError as e:
273              raise OrphanedRecordError(
274                  f"Dataset type {name} can not be removed."
275                  " It is associated with datasets that must be removed first."
276              ) from e
277
278          # Now refresh everything -- removal is rare enough that this does
279          # not need to be fast.
280          self.refresh()
281
282      def find(self, name: str) -> DatasetRecordStorage | None:
283          # Docstring inherited from DatasetRecordStorageManager.
284          return self._byName.get(name)
285
286      def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
287          # Docstring inherited from DatasetRecordStorageManager.
288          if datasetType.isComponent():    288 ↛ 289
289              raise ValueError(
290                  f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
291              )
292          storage = self._byName.get(datasetType.name)
293          if storage is None:
294              dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
295              tagTableName = makeTagTableName(datasetType, dimensionsKey)
296              calibTableName = (
297                  makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
298              )
299              # The order is important here: we want to create tables first and
300              # only register them if this operation is successful. We cannot
301              # wrap it into a transaction because the database class assumes that
302              # DDL is not transaction safe in general.
303              tags = self._db.ensureTableExists(
304                  tagTableName,
305                  makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
306              )
307              if calibTableName is not None:
308                  calibs = self._db.ensureTableExists(
309                      calibTableName,
310                      makeCalibTableSpec(
311                          datasetType,
312                          type(self._collections),
313                          self._db.getTimespanRepresentation(),
314                          self.getIdColumnType(),
315                      ),
316                  )
317              else:
318                  calibs = None
319              row, inserted = self._db.sync(
320                  self._static.dataset_type,
321                  keys={"name": datasetType.name},
322                  compared={
323                      "dimensions_key": dimensionsKey,
324                      # Force the storage class to be loaded to ensure it
325                      # exists and there is no typo in the name.
326                      "storage_class": datasetType.storageClass.name,
327                  },
328                  extra={
329                      "tag_association_table": tagTableName,
330                      "calibration_association_table": calibTableName,
331                  },
332                  returning=["id", "tag_association_table"],
333              )
334              assert row is not None
335              storage = self._recordStorageType(
336                  db=self._db,
337                  datasetType=datasetType,
338                  static=self._static,
339                  summaries=self._summaries,
340                  tags=tags,
341                  calibs=calibs,
342                  dataset_type_id=row["id"],
343                  collections=self._collections,
344                  use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
345              )
346              self._byName[datasetType.name] = storage
347              self._byId[storage._dataset_type_id] = storage
348          else:
349              if datasetType != storage.datasetType:
350                  raise ConflictingDefinitionError(
351                      f"Given dataset type {datasetType} is inconsistent "
352                      f"with database definition {storage.datasetType}."
353                  )
354              inserted = False
355          return storage, bool(inserted)
356
357      def resolve_wildcard(
358          self,
359          expression: Any,
360          components: bool | None = False,
361          missing: list[str] | None = None,
362          explicit_only: bool = False,
363          components_deprecated: bool = True,
364      ) -> dict[DatasetType, list[str | None]]:
365          wildcard = DatasetTypeWildcard.from_expression(expression)
366          result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
367          # This message can be transformed into an error on DM-36303 after v26,
368          # and the components and components_deprecated arguments can be merged
369          # into one on DM-36457 after v27.
370          deprecation_message = (
371              "Querying for component datasets via Registry query methods is deprecated in favor of using "
372              "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
373              "after v26, and the components argument will be removed after v27."
374          )
375          for name, dataset_type in wildcard.values.items():
376              parent_name, component_name = DatasetType.splitDatasetTypeName(name)
377              if component_name is not None and components_deprecated:
378                  warnings.warn(
379                      deprecation_message, FutureWarning, stacklevel=find_outside_stacklevel("lsst.daf.butler")
380                  )
381              if (found_storage := self.find(parent_name)) is not None:
382                  found_parent = found_storage.datasetType
383                  if component_name is not None:
384                      found = found_parent.makeComponentDatasetType(component_name)
385                  else:
386                      found = found_parent
387                  if dataset_type is not None:
388                      if dataset_type.is_compatible_with(found):    388 ↛ 396
389                          # Prefer the given dataset type to enable storage class
390                          # conversions.
391                          if component_name is not None:
392                              found_parent = dataset_type.makeCompositeDatasetType()
393                          else:
394                              found_parent = dataset_type
395                      else:
396                          raise DatasetTypeError(
397                              f"Dataset type definition in query expression {dataset_type} is "
398                              f"not compatible with the registered type {found}."
399                          )
400                  result[found_parent].add(component_name)
401              elif missing is not None:
402                  missing.append(name)
403          already_warned = False
404          if wildcard.patterns is ...:
405              if explicit_only:
406                  raise TypeError(
407                      "Universal wildcard '...' is not permitted for dataset types in this context."
408                  )
409              for storage in self._byName.values():
410                  result[storage.datasetType].add(None)
411                  if components:
412                      try:
413                          result[storage.datasetType].update(
414                              storage.datasetType.storageClass.allComponents().keys()
415                          )
416                          if (
417                              storage.datasetType.storageClass.allComponents()
418                              and not already_warned
419                              and components_deprecated
420                          ):
421                              warnings.warn(
422                                  deprecation_message,
423                                  FutureWarning,
424                                  stacklevel=find_outside_stacklevel("lsst.daf.butler"),
425                              )
426                              already_warned = True
427                      except KeyError as err:
428                          _LOG.warning(
429                              f"Could not load storage class {err} for {storage.datasetType.name}; "
430                              "if it has components they will not be included in query results.",
431                          )
432          elif wildcard.patterns:
433              if explicit_only:
434                  # After v26 this should raise DatasetTypeExpressionError, to
435                  # be implemented on DM-36303.
436                  warnings.warn(
437                      "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
438                      FutureWarning,
439                      stacklevel=find_outside_stacklevel("lsst.daf.butler"),
440                  )
441              for storage in self._byName.values():
442                  if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
443                      result[storage.datasetType].add(None)
444              if components is not False:
445                  for storage in self._byName.values():
446                      if components is None and storage.datasetType in result:    446 ↛ 447
447                          continue
448                      try:
449                          components_for_parent = storage.datasetType.storageClass.allComponents().keys()
450                      except KeyError as err:
451                          _LOG.warning(
452                              f"Could not load storage class {err} for {storage.datasetType.name}; "
453                              "if it has components they will not be included in query results."
454                          )
455                          continue
456                      for component_name in components_for_parent:
457                          if any(
458                              p.fullmatch(
459                                  DatasetType.nameWithComponent(storage.datasetType.name, component_name)
460                              )
461                              for p in wildcard.patterns
462                          ):
463                              result[storage.datasetType].add(component_name)
464                              if not already_warned and components_deprecated:
465                                  warnings.warn(
466                                      deprecation_message,
467                                      FutureWarning,
468                                      stacklevel=find_outside_stacklevel("lsst.daf.butler"),
469                                  )
470                                  already_warned = True
471          return {k: list(v) for k, v in result.items()}
472
473      def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
474          # Docstring inherited from DatasetRecordStorageManager.
475          sql = (
476              sqlalchemy.sql.select(
477                  self._static.dataset.columns.dataset_type_id,
478                  self._static.dataset.columns[self._collections.getRunForeignKeyName()],
479              )
480              .select_from(self._static.dataset)
481              .where(self._static.dataset.columns.id == id)
482          )
483          with self._db.query(sql) as sql_result:
484              row = sql_result.mappings().fetchone()
485          if row is None:
486              return None
487          recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
488          if recordsForType is None:    488 ↛ 489
489              self.refresh()
490              recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
491              assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
492          return DatasetRef(
493              recordsForType.datasetType,
494              dataId=recordsForType.getDataId(id=id),
495              id=id,
496              run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
497          )
498
499      def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
500          # Docstring inherited from DatasetRecordStorageManager.
501          return self._summaries.get(collection)
502
503      _versions: list[VersionTuple]
504      """Schema versions supported by this class."""
505
506      _recordStorageType: type[ByDimensionsDatasetRecordStorage]
507      """Type of the storage class returned by this manager."""
508
509      _autoincrement: bool
510      """If `True` then the PK column of the dataset table is auto-increment."""
511
512      _idColumnType: type
513      """Type of dataset column used to store dataset ID."""
514
515
516  class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
517      """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses
518      UUID for dataset primary key.
519      """
520
521      _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS]
522      _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
523      _autoincrement: bool = False
524      _idColumnType: type = ddl.GUID
525
526      @classmethod
527      def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
528          # Docstring inherited from DatasetRecordStorageManager.
529          return True
530
531      @classmethod
532      def _newDefaultSchemaVersion(cls) -> VersionTuple:
533          # Docstring inherited from VersionedExtension.
534
535          # By default return 1.0.0 so that older clients can still access new
536          # registries created with a default config.
537          return _VERSION_UUID
538
539      def ingest_date_dtype(self) -> type:
540          """Return type of the ``ingest_date`` column."""
541          schema_version = self.newSchemaVersion()
542          if schema_version is not None and schema_version.major > 1:
543              return ddl.AstropyTimeNsecTai
544          else:
545              return sqlalchemy.TIMESTAMP