Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 95%

205 statements  

coverage.py v7.3.2, created at 2023-10-25 15:13 +0000

1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",) 

4 

5import logging 

6import warnings 

7from collections import defaultdict 

8from typing import TYPE_CHECKING, Any 

9 

10import sqlalchemy 

11from lsst.utils.introspection import find_outside_stacklevel 

12 

13from ....core import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType, DimensionUniverse, ddl 

14from ..._collection_summary import CollectionSummary 

15from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError 

16from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple 

17from ...wildcards import DatasetTypeWildcard 

18from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID 

19from .summaries import CollectionSummaryManager 

20from .tables import ( 

21 addDatasetForeignKey, 

22 makeCalibTableName, 

23 makeCalibTableSpec, 

24 makeStaticTableSpecs, 

25 makeTagTableName, 

26 makeTagTableSpec, 

27) 

28 

29if TYPE_CHECKING: 

30 from ...interfaces import ( 

31 CollectionManager, 

32 CollectionRecord, 

33 Database, 

34 DimensionRecordStorageManager, 

35 StaticTablesContext, 

36 ) 

37 from .tables import StaticDatasetTablesTuple 

38 

39 

40# This has to be updated on every schema change 

41_VERSION_UUID = VersionTuple(1, 0, 0) 

42# Starting with 2.0.0, the `ingest_date` column type uses nanoseconds instead 

43# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of 

44# the client migration period. 

45_VERSION_UUID_NS = VersionTuple(2, 0, 0) 
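# Added note (not in the original source): the effective version is applied below —
# _newDefaultSchemaVersion() still returns 1.0.0 for newly created registries, and
# ingest_date_dtype() switches the column type to nanoseconds only for a 2.x schema.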

46 

47_LOG = logging.getLogger(__name__) 

48 

49 

50class MissingDatabaseTableError(RuntimeError): 

51 """Exception raised when a table is not found in a database.""" 

52 

53 

54class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): 

55 """A manager class for datasets that uses one dataset-collection table for 

56 each group of dataset types that share the same dimensions. 

57 

58 In addition to the table organization, this class makes a number of 

59 other design choices that would have been cumbersome (to say the least) to 

60 try to pack into its name: 

61 

62 - It uses a private surrogate integer autoincrement field to identify 

63 dataset types, instead of using the name as the primary and foreign key 

64 directly. 

65 

66 - It aggressively loads all DatasetTypes into memory instead of fetching 

67 them from the database only when needed or attempting more clever forms 

68 of caching. 

69 

70 Alternative implementations that make different choices for these while 

71 keeping the same general table organization might be reasonable as well. 

72 

73 This class provides a complete implementation of the manager logic, but it 

74 is parametrized by a few class attributes that have to be defined by 

75 subclasses. 

76 

77 Parameters 

78 ---------- 

79 db : `Database` 

80 Interface to the underlying database engine and namespace. 

81 collections : `CollectionManager` 

82 Manager object for the collections in this `Registry`. 

83 dimensions : `DimensionRecordStorageManager` 

84 Manager object for the dimensions in this `Registry`. 

85 static : `StaticDatasetTablesTuple` 

86 Named tuple of `sqlalchemy.schema.Table` instances for all static 

87 tables used by this class. 

88 summaries : `CollectionSummaryManager` 

89 Structure containing tables that summarize the contents of collections. 

90 """ 

91 

92 def __init__( 

93 self, 

94 *, 

95 db: Database, 

96 collections: CollectionManager, 

97 dimensions: DimensionRecordStorageManager, 

98 static: StaticDatasetTablesTuple, 

99 summaries: CollectionSummaryManager, 

100 registry_schema_version: VersionTuple | None = None, 

101 ): 

102 super().__init__(registry_schema_version=registry_schema_version) 

103 self._db = db 

104 self._collections = collections 

105 self._dimensions = dimensions 

106 self._static = static 

107 self._summaries = summaries 
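# In-memory caches of per-dataset-type storage objects, keyed by dataset type
# name and by the surrogate integer id; (re)populated by refresh() and extended
# by register().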

108 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

109 self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

110 

111 @classmethod 

112 def initialize( 

113 cls, 

114 db: Database, 

115 context: StaticTablesContext, 

116 *, 

117 collections: CollectionManager, 

118 dimensions: DimensionRecordStorageManager, 

119 registry_schema_version: VersionTuple | None = None, 

120 ) -> DatasetRecordStorageManager: 

121 # Docstring inherited from DatasetRecordStorageManager. 

122 specs = cls.makeStaticTableSpecs( 

123 type(collections), universe=dimensions.universe, schema_version=registry_schema_version 

124 ) 

125 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore 

126 summaries = CollectionSummaryManager.initialize( 

127 db, 

128 context, 

129 collections=collections, 

130 dimensions=dimensions, 

131 ) 

132 return cls( 

133 db=db, 

134 collections=collections, 

135 dimensions=dimensions, 

136 static=static, 

137 summaries=summaries, 

138 registry_schema_version=registry_schema_version, 

139 ) 

140 

141 @classmethod 

142 def currentVersions(cls) -> list[VersionTuple]: 

143 # Docstring inherited from VersionedExtension. 

144 return cls._versions 

145 

146 @classmethod 

147 def makeStaticTableSpecs( 

148 cls, 

149 collections: type[CollectionManager], 

150 universe: DimensionUniverse, 

151 schema_version: VersionTuple | None, 

152 ) -> StaticDatasetTablesTuple: 

153 """Construct all static tables used by the classes in this package. 

154 

155 Static tables are those that are present in all Registries and do not 

156 depend on what DatasetTypes have been registered. 

157 

158 Parameters 

159 ---------- 

160 collections : `type` [ `CollectionManager` ] 

161 Manager class used for the collections in this `Registry`. 

162 universe : `DimensionUniverse` 

163 Universe graph containing all dimensions known to this `Registry`. 

164 schema_version : `VersionTuple` or `None` 

165 Version of the schema that should be created; if `None`, the 

166 default schema version is used. 

167 

168 Returns 

169 ------- 

170 specs : `StaticDatasetTablesTuple` 

171 A named tuple containing `ddl.TableSpec` instances. 

172 """ 

173 schema_version = cls.clsNewSchemaVersion(schema_version) 

174 assert schema_version is not None, "New schema version cannot be None" 

175 return makeStaticTableSpecs( 

176 collections, 

177 universe=universe, 

178 dtype=cls.getIdColumnType(), 

179 autoincrement=cls._autoincrement, 

180 schema_version=schema_version, 

181 ) 

182 

183 @classmethod 

184 def getIdColumnType(cls) -> type: 

185 # Docstring inherited from base class. 

186 return cls._idColumnType 

187 

188 @classmethod 

189 def addDatasetForeignKey( 

190 cls, 

191 tableSpec: ddl.TableSpec, 

192 *, 

193 name: str = "dataset", 

194 constraint: bool = True, 

195 onDelete: str | None = None, 

196 **kwargs: Any, 

197 ) -> ddl.FieldSpec: 

198 # Docstring inherited from DatasetRecordStorageManager. 

199 return addDatasetForeignKey( 

200 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs 

201 ) 

202 

203 def refresh(self) -> None: 

204 # Docstring inherited from DatasetRecordStorageManager. 
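# Rebuild the in-memory name/id caches from the static dataset_type table,
# re-attaching the dynamic tag and calibration tables recorded for each row.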

205 byName: dict[str, ByDimensionsDatasetRecordStorage] = {} 

206 byId: dict[int, ByDimensionsDatasetRecordStorage] = {} 

207 dataset_types: dict[int, DatasetType] = {} 

208 c = self._static.dataset_type.columns 

209 with self._db.query(self._static.dataset_type.select()) as sql_result: 

210 sql_rows = sql_result.mappings().fetchall() 

211 for row in sql_rows: 

212 name = row[c.name] 

213 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) 

214 calibTableName = row[c.calibration_association_table] 

215 datasetType = DatasetType( 

216 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) 

217 ) 

218 tags = self._db.getExistingTable( 

219 row[c.tag_association_table], 

220 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

221 ) 

222 if tags is None:  # coverage: 222 ↛ 223 (condition was never true) 

223 raise MissingDatabaseTableError( 

224 f"Table {row[c.tag_association_table]} is missing from database schema." 

225 ) 

226 if calibTableName is not None: 

227 calibs = self._db.getExistingTable( 

228 row[c.calibration_association_table], 

229 makeCalibTableSpec( 

230 datasetType, 

231 type(self._collections), 

232 self._db.getTimespanRepresentation(), 

233 self.getIdColumnType(), 

234 ), 

235 ) 

236 if calibs is None:  # coverage: 236 ↛ 237 (condition was never true) 

237 raise MissingDatabaseTableError( 

238 f"Table {row[c.calibration_association_table]} is missing from database schema." 

239 ) 

240 else: 

241 calibs = None 

242 storage = self._recordStorageType( 

243 db=self._db, 

244 datasetType=datasetType, 

245 static=self._static, 

246 summaries=self._summaries, 

247 tags=tags, 

248 calibs=calibs, 

249 dataset_type_id=row["id"], 

250 collections=self._collections, 

251 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, 

252 ) 

253 byName[datasetType.name] = storage 

254 byId[storage._dataset_type_id] = storage 

255 dataset_types[row["id"]] = datasetType 

256 self._byName = byName 

257 self._byId = byId 

258 self._summaries.refresh(dataset_types) 

259 

260 def remove(self, name: str) -> None: 

261 # Docstring inherited from DatasetRecordStorageManager. 

262 compositeName, componentName = DatasetType.splitDatasetTypeName(name) 

263 if componentName is not None: 

264 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})") 

265 

266 # Delete the row 

267 try: 

268 self._db.delete(self._static.dataset_type, ["name"], {"name": name}) 

269 except sqlalchemy.exc.IntegrityError as e: 

270 raise OrphanedRecordError( 

271 f"Dataset type {name} can not be removed." 

272 " It is associated with datasets that must be removed first." 

273 ) from e 

274 

275 # Now refresh everything -- removal is rare enough that this does 

276 # not need to be fast. 

277 self.refresh() 

278 

279 def find(self, name: str) -> DatasetRecordStorage | None: 

280 # Docstring inherited from DatasetRecordStorageManager. 

281 return self._byName.get(name) 

282 

283 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]: 

284 # Docstring inherited from DatasetRecordStorageManager. 

285 if datasetType.isComponent():  # coverage: 285 ↛ 286 (condition was never true) 

286 raise ValueError( 

287 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" 

288 ) 

289 storage = self._byName.get(datasetType.name) 

290 if storage is None: 

291 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) 

292 tagTableName = makeTagTableName(datasetType, dimensionsKey) 

293 calibTableName = ( 

294 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None 

295 ) 

296 # The order is important here: we want to create the tables first and 

297 # only register them if that operation is successful. We cannot wrap 

298 # this in a transaction because the database class assumes that DDL 

299 # is not transaction-safe in general. 

300 tags = self._db.ensureTableExists( 

301 tagTableName, 

302 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), 

303 ) 

304 if calibTableName is not None: 

305 calibs = self._db.ensureTableExists( 

306 calibTableName, 

307 makeCalibTableSpec( 

308 datasetType, 

309 type(self._collections), 

310 self._db.getTimespanRepresentation(), 

311 self.getIdColumnType(), 

312 ), 

313 ) 

314 else: 

315 calibs = None 
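# sync() ensures a dataset_type row with this name exists: it inserts one if
# missing and otherwise checks that the `compared` columns are consistent with
# the stored definition; `inserted` records whether a new row was created.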

316 row, inserted = self._db.sync( 

317 self._static.dataset_type, 

318 keys={"name": datasetType.name}, 

319 compared={ 

320 "dimensions_key": dimensionsKey, 

321 # Force the storage class to be loaded to ensure it 

322 # exists and there is no typo in the name. 

323 "storage_class": datasetType.storageClass.name, 

324 }, 

325 extra={ 

326 "tag_association_table": tagTableName, 

327 "calibration_association_table": calibTableName, 

328 }, 

329 returning=["id", "tag_association_table"], 

330 ) 

331 assert row is not None 

332 storage = self._recordStorageType( 

333 db=self._db, 

334 datasetType=datasetType, 

335 static=self._static, 

336 summaries=self._summaries, 

337 tags=tags, 

338 calibs=calibs, 

339 dataset_type_id=row["id"], 

340 collections=self._collections, 

341 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, 

342 ) 

343 self._byName[datasetType.name] = storage 

344 self._byId[storage._dataset_type_id] = storage 

345 else: 

346 if datasetType != storage.datasetType: 

347 raise ConflictingDefinitionError( 

348 f"Given dataset type {datasetType} is inconsistent " 

349 f"with database definition {storage.datasetType}." 

350 ) 

351 inserted = False 

352 return storage, bool(inserted) 

353 

354 def resolve_wildcard( 

355 self, 

356 expression: Any, 

357 components: bool | None = False, 

358 missing: list[str] | None = None, 

359 explicit_only: bool = False, 

360 components_deprecated: bool = True, 

361 ) -> dict[DatasetType, list[str | None]]: 

362 wildcard = DatasetTypeWildcard.from_expression(expression) 

363 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set) 
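# `result` maps each resolved parent DatasetType to the set of component names
# requested for it, with `None` standing for the parent dataset type itself.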

364 # This message can be transformed into an error on DM-36303 after v26, 

365 # and the components and components_deprecated arguments can be merged 

366 # into one on DM-36457 after v27. 

367 deprecation_message = ( 

368 "Querying for component datasets via Registry query methods is deprecated in favor of using " 

369 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported " 

370 "after v26, and the components argument will be removed after v27." 

371 ) 

372 for name, dataset_type in wildcard.values.items(): 

373 parent_name, component_name = DatasetType.splitDatasetTypeName(name) 

374 if component_name is not None and components_deprecated: 

375 warnings.warn( 

376 deprecation_message, FutureWarning, stacklevel=find_outside_stacklevel("lsst.daf.butler") 

377 ) 

378 if (found_storage := self.find(parent_name)) is not None: 

379 found_parent = found_storage.datasetType 

380 if component_name is not None: 

381 found = found_parent.makeComponentDatasetType(component_name) 

382 else: 

383 found = found_parent 

384 if dataset_type is not None: 

385 if dataset_type.is_compatible_with(found):  # coverage: 385 ↛ 393 (condition was never false) 

386 # Prefer the given dataset type to enable storage class 

387 # conversions. 

388 if component_name is not None: 

389 found_parent = dataset_type.makeCompositeDatasetType() 

390 else: 

391 found_parent = dataset_type 

392 else: 

393 raise DatasetTypeError( 

394 f"Dataset type definition in query expression {dataset_type} is " 

395 f"not compatible with the registered type {found}." 

396 ) 

397 result[found_parent].add(component_name) 

398 elif missing is not None: 

399 missing.append(name) 

400 already_warned = False 

401 if wildcard.patterns is ...: 

402 if explicit_only: 

403 raise TypeError( 

404 "Universal wildcard '...' is not permitted for dataset types in this context." 

405 ) 

406 for storage in self._byName.values(): 

407 result[storage.datasetType].add(None) 

408 if components: 

409 try: 

410 result[storage.datasetType].update( 

411 storage.datasetType.storageClass.allComponents().keys() 

412 ) 

413 if ( 

414 storage.datasetType.storageClass.allComponents() 

415 and not already_warned 

416 and components_deprecated 

417 ): 

418 warnings.warn( 

419 deprecation_message, 

420 FutureWarning, 

421 stacklevel=find_outside_stacklevel("lsst.daf.butler"), 

422 ) 

423 already_warned = True 

424 except KeyError as err: 

425 _LOG.warning( 

426 f"Could not load storage class {err} for {storage.datasetType.name}; " 

427 "if it has components they will not be included in query results.", 

428 ) 

429 elif wildcard.patterns: 

430 if explicit_only: 

431 # After v26 this should raise DatasetTypeExpressionError, to 

432 # be implemented on DM-36303. 

433 warnings.warn( 

434 "Passing wildcard patterns here is deprecated and will be prohibited after v26.", 

435 FutureWarning, 

436 stacklevel=find_outside_stacklevel("lsst.daf.butler"), 

437 ) 

438 for storage in self._byName.values(): 

439 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns): 

440 result[storage.datasetType].add(None) 

441 if components is not False: 

442 for storage in self._byName.values(): 

443 if components is None and storage.datasetType in result:  # coverage: 443 ↛ 444 (condition was never true) 

444 continue 

445 try: 

446 components_for_parent = storage.datasetType.storageClass.allComponents().keys() 

447 except KeyError as err: 

448 _LOG.warning( 

449 f"Could not load storage class {err} for {storage.datasetType.name}; " 

450 "if it has components they will not be included in query results." 

451 ) 

452 continue 

453 for component_name in components_for_parent: 

454 if any( 

455 p.fullmatch( 

456 DatasetType.nameWithComponent(storage.datasetType.name, component_name) 

457 ) 

458 for p in wildcard.patterns 

459 ): 

460 result[storage.datasetType].add(component_name) 

461 if not already_warned and components_deprecated: 

462 warnings.warn( 

463 deprecation_message, 

464 FutureWarning, 

465 stacklevel=find_outside_stacklevel("lsst.daf.butler"), 

466 ) 

467 already_warned = True 

468 return {k: list(v) for k, v in result.items()} 

469 

470 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: 

471 # Docstring inherited from DatasetRecordStorageManager. 
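# Look up the dataset's type id and run-collection key, then use the cached
# per-type storage to expand the data ID and build a resolved DatasetRef.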

472 sql = ( 

473 sqlalchemy.sql.select( 

474 self._static.dataset.columns.dataset_type_id, 

475 self._static.dataset.columns[self._collections.getRunForeignKeyName()], 

476 ) 

477 .select_from(self._static.dataset) 

478 .where(self._static.dataset.columns.id == id) 

479 ) 

480 with self._db.query(sql) as sql_result: 

481 row = sql_result.mappings().fetchone() 

482 if row is None: 

483 return None 

484 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

485 if recordsForType is None:  # coverage: 485 ↛ 486 (condition was never true) 

486 self.refresh() 

487 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) 

488 assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 

489 return DatasetRef( 

490 recordsForType.datasetType, 

491 dataId=recordsForType.getDataId(id=id), 

492 id=id, 

493 run=self._collections[row[self._collections.getRunForeignKeyName()]].name, 

494 ) 

495 

496 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

497 # Docstring inherited from DatasetRecordStorageManager. 

498 return self._summaries.get(collection) 

499 

500 _versions: list[VersionTuple] 

501 """Schema version for this class.""" 

502 

503 _recordStorageType: type[ByDimensionsDatasetRecordStorage] 

504 """Type of the storage class returned by this manager.""" 

505 

506 _autoincrement: bool 

507 """If True then PK column of the dataset table is auto-increment.""" 

508 

509 _idColumnType: type 

510 """Type of dataset column used to store dataset ID.""" 

511 

512 

513class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): 

514 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses 

515 UUID for dataset primary key. 

516 """ 

517 

518 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS] 

519 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID 

520 _autoincrement: bool = False 

521 _idColumnType: type = ddl.GUID 

522 

523 @classmethod 

524 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: 

525 # Docstring inherited from DatasetRecordStorageManager. 

526 return True 

527 

528 @classmethod 

529 def _newDefaultSchemaVersion(cls) -> VersionTuple: 

530 # Docstring inherited from VersionedExtension. 

531 

532 # By default return 1.0.0 so that older clients can still access new 

533 # registries created with a default config. 

534 return _VERSION_UUID 

535 

536 def ingest_date_dtype(self) -> type: 

537 """Return type of the ``ingest_date`` column.""" 

538 schema_version = self.newSchemaVersion() 

539 if schema_version is not None and schema_version.major > 1: 

540 return ddl.AstropyTimeNsecTai 

541 else: 

542 return sqlalchemy.TIMESTAMP
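# A minimal usage sketch (not part of the module): how a Registry would typically
# drive this manager. The names `db`, `context`, `collections`, `dimensions`,
# `dataset_type`, and `some_collection_record` are placeholders for objects the
# Registry constructs elsewhere, as described in the class docstrings above.
#
#     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#         db, context, collections=collections, dimensions=dimensions
#     )
#     storage, inserted = manager.register(dataset_type)  # creates tag/calib tables if needed
#     same_storage = manager.find(dataset_type.name)      # in-memory lookup, no database query
#     summary = manager.getCollectionSummary(some_collection_record)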