Coverage for python/lsst/daf/butler/registry/dimensions/static.py: 93%

157 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-05 11:05 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29import itertools 

30from collections import defaultdict 

31from collections.abc import Mapping, Set 

32from typing import TYPE_CHECKING, cast 

33 

34import sqlalchemy 

35from lsst.daf.relation import Relation 

36 

37from ... import ddl 

38from ..._column_tags import DimensionKeyColumnTag 

39from ..._named import NamedKeyDict 

40from ...dimensions import ( 

41 DatabaseDimensionElement, 

42 DatabaseTopologicalFamily, 

43 DimensionElement, 

44 DimensionGroup, 

45 DimensionUniverse, 

46 GovernorDimension, 

47) 

48from .._exceptions import MissingSpatialOverlapError 

49from ..interfaces import ( 

50 Database, 

51 DatabaseDimensionOverlapStorage, 

52 DatabaseDimensionRecordStorage, 

53 DimensionRecordStorage, 

54 DimensionRecordStorageManager, 

55 GovernorDimensionRecordStorage, 

56 StaticTablesContext, 

57 VersionTuple, 

58) 

59 

60if TYPE_CHECKING: 

61 from .. import queries 

62 

63 

# This has to be updated on every schema change
_VERSION = VersionTuple(6, 0, 2)

66 

67 

68class StaticDimensionRecordStorageManager(DimensionRecordStorageManager): 

69 """An implementation of `DimensionRecordStorageManager` for single-layer 

70 `Registry` and the base layers of multi-layer `Registry`. 

71 

72 This manager creates `DimensionRecordStorage` instances for all elements 

73 in the `DimensionUniverse` in its own `initialize` method, as part of 

74 static table creation, so it never needs to manage any dynamic registry 

75 tables. 

76 

77 Parameters 

78 ---------- 

79 db : `Database` 

80 Interface to the underlying database engine and namespace. 

81 records : `NamedKeyDict` 

82 Mapping from `DimensionElement` to `DimensionRecordStorage` for that 

83 element. 

84 overlaps : `list` [ `DatabaseDimensionOverlapStorage` ] 

85 Objects that manage materialized overlaps between database-backed 

86 dimensions. 

87 dimension_group_storage : `_DimensionGroupStorage` 

88 Object that manages saved `DimensionGroup` definitions. 

89 universe : `DimensionUniverse` 

90 All known dimensions. 

91 """ 

92 

93 def __init__( 

94 self, 

95 db: Database, 

96 *, 

97 records: NamedKeyDict[DimensionElement, DimensionRecordStorage], 

98 overlaps: dict[ 

99 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage 

100 ], 

101 dimension_group_storage: _DimensionGroupStorage, 

102 universe: DimensionUniverse, 

103 registry_schema_version: VersionTuple | None = None, 

104 ): 

105 super().__init__(universe=universe, registry_schema_version=registry_schema_version) 

106 self._db = db 

107 self._records = records 

108 self._overlaps = overlaps 

109 self._dimension_group_storage = dimension_group_storage 

110 

111 @classmethod 

112 def initialize( 

113 cls, 

114 db: Database, 

115 context: StaticTablesContext, 

116 *, 

117 universe: DimensionUniverse, 

118 registry_schema_version: VersionTuple | None = None, 

119 ) -> DimensionRecordStorageManager: 

120 # Docstring inherited from DimensionRecordStorageManager. 

121 # Start by initializing governor dimensions; those go both in the main 

122 # 'records' mapping we'll pass to init, and a local dictionary that we 

123 # can pass in when initializing storage for DatabaseDimensionElements. 

124 governors = NamedKeyDict[GovernorDimension, GovernorDimensionRecordStorage]() 

125 records = NamedKeyDict[DimensionElement, DimensionRecordStorage]() 

126 for dimension in universe.governor_dimensions: 

127 governorStorage = dimension.makeStorage(db, context=context) 

128 governors[dimension] = governorStorage 

129 records[dimension] = governorStorage 

130 # Next we initialize storage for DatabaseDimensionElements. Some 

131 # elements' storage may be views into anothers; we'll do a first pass 

132 # to gather a mapping from the names of those targets back to their 

133 # views. 

134 view_targets = { 

135 element.viewOf: element for element in universe.database_elements if element.viewOf is not None 

136 } 

137 # We remember the spatial ones (grouped by family) so we can go back 

138 # and initialize overlap storage for them later. 

139 spatial = NamedKeyDict[DatabaseTopologicalFamily, list[DatabaseDimensionRecordStorage]]() 

140 for element in universe.database_elements: 

141 if element.viewOf is not None: 

142 # We'll initialize this storage when the view's target is 

143 # initialized. 

144 continue 

145 elementStorage = element.makeStorage(db, context=context, governors=governors) 

146 records[element] = elementStorage 

147 if element.spatial is not None: 

148 spatial.setdefault(element.spatial, []).append(elementStorage) 

149 if (view_element := view_targets.get(element.name)) is not None: 

150 view_element_storage = view_element.makeStorage( 

151 db, 

152 context=context, 

153 governors=governors, 

154 view_target=elementStorage, 

155 ) 

156 records[view_element] = view_element_storage 

157 if view_element.spatial is not None: 157 ↛ 158line 157 didn't jump to line 158, because the condition on line 157 was never true

158 spatial.setdefault(view_element.spatial, []).append(view_element_storage) 

159 

160 # Finally we initialize overlap storage. The implementation class for 

161 # this is currently hard-coded (it's not obvious there will ever be 

162 # others). Note that overlaps between database-backed dimensions and 

163 # skypix dimensions is internal to `DatabaseDimensionRecordStorage`, 

164 # and hence is not included here. 

165 from ..dimensions.overlaps import CrossFamilyDimensionOverlapStorage 

166 

167 overlaps: dict[ 

168 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage 

169 ] = {} 

170 for (family1, storages1), (family2, storages2) in itertools.combinations(spatial.items(), 2): 

171 for elementStoragePair in itertools.product(storages1, storages2): 

172 governorStoragePair = (governors[family1.governor], governors[family2.governor]) 

173 if elementStoragePair[0].element > elementStoragePair[1].element: 173 ↛ 174line 173 didn't jump to line 174, because the condition on line 173 was never true

174 elementStoragePair = (elementStoragePair[1], elementStoragePair[0]) 

175 governorStoragePair = (governorStoragePair[1], governorStoragePair[1]) 

176 overlapStorage = CrossFamilyDimensionOverlapStorage.initialize( 

177 db, 

178 elementStoragePair, 

179 governorStoragePair, 

180 context=context, 

181 ) 

182 elementStoragePair[0].connect(overlapStorage) 

183 elementStoragePair[1].connect(overlapStorage) 

184 overlaps[overlapStorage.elements] = overlapStorage 

185 # Create table that stores DimensionGraph definitions. 

186 dimension_group_storage = _DimensionGroupStorage.initialize(db, context, universe=universe) 

187 return cls( 

188 db=db, 

189 records=records, 

190 universe=universe, 

191 overlaps=overlaps, 

192 dimension_group_storage=dimension_group_storage, 

193 registry_schema_version=registry_schema_version, 

194 ) 

195 

196 def get(self, element: DimensionElement | str) -> DimensionRecordStorage | None: 

197 # Docstring inherited from DimensionRecordStorageManager. 

198 r = self._records.get(element) 

199 if r is None: 

200 if (dimension := self.universe.skypix_dimensions.get(element)) is not None: 200 ↛ 202line 200 didn't jump to line 202, because the condition on line 200 was never false

201 return dimension.makeStorage() 

202 return r 

203 

204 def register(self, element: DimensionElement) -> DimensionRecordStorage: 

205 # Docstring inherited from DimensionRecordStorageManager. 

206 result = self.get(element) 

207 assert result, "All records instances should be created in initialize()." 

208 return result 

209 

210 def save_dimension_group(self, graph: DimensionGroup) -> int: 

211 # Docstring inherited from DimensionRecordStorageManager. 

212 return self._dimension_group_storage.save(graph) 

213 

214 def load_dimension_group(self, key: int) -> DimensionGroup: 

215 # Docstring inherited from DimensionRecordStorageManager. 

216 return self._dimension_group_storage.load(key) 

217 

218 def clearCaches(self) -> None: 

219 # Docstring inherited from DimensionRecordStorageManager. 

220 for storage in self._records.values(): 

221 storage.clearCaches() 

222 

223 def make_spatial_join_relation( 

224 self, 

225 element1: str, 

226 element2: str, 

227 context: queries.SqlQueryContext, 

228 governor_constraints: Mapping[str, Set[str]], 

229 existing_relationships: Set[frozenset[str]] = frozenset(), 

230 ) -> tuple[Relation, bool]: 

231 # Docstring inherited. 

232 overlap_relationship = frozenset( 

233 self.universe[element1].dimensions.names | self.universe[element2].dimensions.names 

234 ) 

235 if overlap_relationship in existing_relationships: 235 ↛ 236line 235 didn't jump to line 236, because the condition on line 235 was never true

236 return context.preferred_engine.make_join_identity_relation(), False 

237 storage1 = self[element1] 

238 storage2 = self[element2] 

239 overlaps: Relation | None = None 

240 needs_refinement: bool = False 

241 match (storage1, storage2): 

242 case [ 

243 DatabaseDimensionRecordStorage() as db_storage1, 

244 DatabaseDimensionRecordStorage() as db_storage2, 

245 ]: 

246 # Construction guarantees that we only need to try this in one 

247 # direction; either both storage objects know about the other 

248 # or neither do. 

249 overlaps = db_storage1.make_spatial_join_relation( 

250 db_storage2.element, context, governor_constraints 

251 ) 

252 if overlaps is None: 252 ↛ 300line 252 didn't jump to line 300, because the condition on line 252 was never false

253 # No direct materialized overlaps; use commonSkyPix as an 

254 # intermediary. 

255 have_overlap1_already = ( 

256 frozenset( 

257 self.universe[element1].dimensions.names | {self.universe.commonSkyPix.name} 

258 ) 

259 in existing_relationships 

260 ) 

261 have_overlap2_already = ( 

262 frozenset( 

263 self.universe[element2].dimensions.names | {self.universe.commonSkyPix.name} 

264 ) 

265 in existing_relationships 

266 ) 

267 overlap1 = context.preferred_engine.make_join_identity_relation() 

268 overlap2 = context.preferred_engine.make_join_identity_relation() 

269 if not have_overlap1_already: 

270 overlap1 = cast( 

271 Relation, 

272 db_storage1.make_spatial_join_relation( 

273 self.universe.commonSkyPix, context, governor_constraints 

274 ), 

275 ) 

276 if not have_overlap2_already: 

277 overlap2 = cast( 

278 Relation, 

279 db_storage2.make_spatial_join_relation( 

280 self.universe.commonSkyPix, context, governor_constraints 

281 ), 

282 ) 

283 overlaps = overlap1.join(overlap2) 

284 if not have_overlap1_already and not have_overlap2_already: 

285 # Drop the common skypix ID column from the overlap 

286 # relation we return, since we don't want that column 

287 # to be mistakenly equated with any other appearance of 

288 # that column, since this would mangle queries like 

289 # "join visit to tract and tract to healpix10", by 

290 # incorrectly requiring all visits and healpix10 pixels 

291 # share common skypix pixels, not just tracts. 

292 columns = set(overlaps.columns) 

293 columns.remove(DimensionKeyColumnTag(self.universe.commonSkyPix.name)) 

294 overlaps = overlaps.with_only_columns(columns) 

295 needs_refinement = True 

296 case [DatabaseDimensionRecordStorage() as db_storage, other]: 296 ↛ 297line 296 didn't jump to line 297, because the pattern on line 296 never matched

297 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints) 

298 case [other, DatabaseDimensionRecordStorage() as db_storage]: 298 ↛ 300line 298 didn't jump to line 300, because the pattern on line 298 never matched

299 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints) 

300 if overlaps is None: 

301 # In the future, there's a lot more we could try here: 

302 # 

303 # - for skypix dimensions, looking for materialized overlaps at 

304 # smaller spatial scales (higher-levels) and using bit-shifting; 

305 # 

306 # - for non-skypix dimensions, looking for materialized overlaps 

307 # for more finer-grained members of the same family, and then 

308 # doing SELECT DISTINCT (or even tolerating duplicates) on the 

309 # columns we care about (e.g. use patch overlaps to satisfy a 

310 # request for tract overlaps). 

311 # 

312 # It's not obvious that's better than just telling the user to 

313 # materialize more overlaps, though. 

314 raise MissingSpatialOverlapError( 

315 f"No materialized overlaps for spatial join between {element1!r} and {element2!r}." 

316 ) 

317 return overlaps, needs_refinement 

318 

319 @classmethod 

320 def currentVersions(cls) -> list[VersionTuple]: 

321 # Docstring inherited from VersionedExtension. 

322 return [_VERSION] 

323 

324 

class _DimensionGroupStorage:
    """Helper object that manages saved DimensionGroup definitions.

    Should generally be constructed by calling `initialize` instead of invoking
    the constructor directly.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    idTable : `sqlalchemy.schema.Table`
        Table that just holds unique IDs for dimension graphs.
    definitionTable : `sqlalchemy.schema.Table`
        Table that maps dimension names to the IDs of the dimension graphs to
        which they belong.
    universe : `DimensionUniverse`
        All known dimensions.
    """

    def __init__(
        self,
        db: Database,
        idTable: sqlalchemy.schema.Table,
        definitionTable: sqlalchemy.schema.Table,
        universe: DimensionUniverse,
    ):
        self._db = db
        self._idTable = idTable
        self._definitionTable = definitionTable
        self._universe = universe
        # In-memory caches in both directions, pre-seeded with the empty
        # group at key 0; the empty group is never written to the database.
        self._keysByGroup: dict[DimensionGroup, int] = {universe.empty.as_group(): 0}
        self._groupsByKey: dict[int, DimensionGroup] = {0: universe.empty.as_group()}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        universe: DimensionUniverse,
    ) -> _DimensionGroupStorage:
        """Construct a new instance, including creating tables if necessary.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        universe : `DimensionUniverse`
            All known dimensions.

        Returns
        -------
        storage : `_DimensionGroupStorage`
            New instance of this class.
        """
        # We need two tables just so we have one where the autoincrement key is
        # the only primary key column, as is required by (at least) SQLite. In
        # other databases, we might be able to use a Sequence directly.
        idTable = context.addTable(
            "dimension_graph_key",
            ddl.TableSpec(
                fields=[
                    ddl.FieldSpec(
                        name="id",
                        dtype=sqlalchemy.BigInteger,
                        autoincrement=True,
                        primaryKey=True,
                    ),
                ],
            ),
        )
        definitionTable = context.addTable(
            "dimension_graph_definition",
            ddl.TableSpec(
                fields=[
                    ddl.FieldSpec(name="dimension_graph_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                    ddl.FieldSpec(name="dimension_name", dtype=sqlalchemy.Text, primaryKey=True),
                ],
                foreignKeys=[
                    ddl.ForeignKeySpec(
                        "dimension_graph_key",
                        source=("dimension_graph_id",),
                        target=("id",),
                        onDelete="CASCADE",
                    ),
                ],
            ),
        )
        return cls(db, idTable, definitionTable, universe=universe)

    def refresh(self) -> None:
        """Refresh the in-memory cache of saved DimensionGraph definitions.

        This should be done automatically whenever needed, but it can also
        be called explicitly.
        """
        # Gather the dimension names belonging to each saved graph key.
        dimensionNamesByKey: dict[int, set[str]] = defaultdict(set)
        with self._db.query(self._definitionTable.select()) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            key = row[self._definitionTable.columns.dimension_graph_id]
            dimensionNamesByKey[key].add(row[self._definitionTable.columns.dimension_name])
        # Rebuild both caches from scratch (re-seeding the empty group at
        # key 0), then swap them in atomically at the end.
        keysByGraph: dict[DimensionGroup, int] = {self._universe.empty.as_group(): 0}
        graphsByKey: dict[int, DimensionGroup] = {0: self._universe.empty.as_group()}
        for key, dimensionNames in dimensionNamesByKey.items():
            graph = DimensionGroup(self._universe, names=dimensionNames)
            keysByGraph[graph] = key
            graphsByKey[key] = graph
        self._groupsByKey = graphsByKey
        self._keysByGroup = keysByGraph

    def save(self, group: DimensionGroup) -> int:
        """Save a `DimensionGraph` definition to the database, allowing it to
        be retrieved later via the returned key.

        Parameters
        ----------
        group : `DimensionGroup`
            Set of dimensions to save.

        Returns
        -------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.
        """
        # Fast path: already cached from a previous save or refresh.
        key = self._keysByGroup.get(group)
        if key is not None:
            return key
        # Lock tables and then refresh to guard against races where some other
        # process is trying to register the exact same dimension graph. This
        # is probably not the most efficient way to do it, but it should be a
        # rare operation, especially since the short-circuit above will usually
        # work in long-lived data repositories.
        with self._db.transaction(lock=[self._idTable, self._definitionTable]):
            self.refresh()
            key = self._keysByGroup.get(group)
            if key is None:
                # Still absent after refresh: allocate a new ID and write one
                # definition row per required dimension name.
                (key,) = self._db.insert(self._idTable, {}, returnIds=True)  # type: ignore
                self._db.insert(
                    self._definitionTable,
                    *[{"dimension_graph_id": key, "dimension_name": name} for name in group.required],
                )
                self._keysByGroup[group] = key
                self._groupsByKey[key] = group
        return key

    def load(self, key: int) -> DimensionGroup:
        """Retrieve a `DimensionGraph` that was previously saved in the
        database.

        Parameters
        ----------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.

        Returns
        -------
        graph : `DimensionGraph`
            Retrieved graph.
        """
        graph = self._groupsByKey.get(key)
        if graph is None:
            # Cache miss: the graph may have been saved by another process
            # since our last refresh.  Raises KeyError if the key is unknown
            # even after refreshing.
            self.refresh()
            graph = self._groupsByKey[key]
        return graph