Coverage for python/lsst/daf/butler/registry/dimensions/static.py: 93%

156 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-12 10:56 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23import itertools 

24from collections import defaultdict 

25from collections.abc import Mapping, Set 

26from typing import TYPE_CHECKING, cast 

27 

28import sqlalchemy 

29from lsst.daf.relation import Relation 

30 

31from ...core import ( 

32 DatabaseDimensionElement, 

33 DatabaseTopologicalFamily, 

34 DimensionElement, 

35 DimensionGraph, 

36 DimensionKeyColumnTag, 

37 DimensionUniverse, 

38 GovernorDimension, 

39 NamedKeyDict, 

40 SkyPixDimension, 

41 ddl, 

42) 

43from .._exceptions import MissingSpatialOverlapError 

44from ..interfaces import ( 

45 Database, 

46 DatabaseDimensionOverlapStorage, 

47 DatabaseDimensionRecordStorage, 

48 DimensionRecordStorage, 

49 DimensionRecordStorageManager, 

50 GovernorDimensionRecordStorage, 

51 StaticTablesContext, 

52 VersionTuple, 

53) 

54 

55if TYPE_CHECKING: 

56 from .. import queries 

57 

58 

# Schema version reported by
# `StaticDimensionRecordStorageManager.currentVersions`.
# This has to be updated on every schema change.
_VERSION = VersionTuple(6, 0, 2)

61 

62 

63class StaticDimensionRecordStorageManager(DimensionRecordStorageManager): 

64 """An implementation of `DimensionRecordStorageManager` for single-layer 

65 `Registry` and the base layers of multi-layer `Registry`. 

66 

67 This manager creates `DimensionRecordStorage` instances for all elements 

68 in the `DimensionUniverse` in its own `initialize` method, as part of 

69 static table creation, so it never needs to manage any dynamic registry 

70 tables. 

71 

72 Parameters 

73 ---------- 

74 db : `Database` 

75 Interface to the underlying database engine and namespace. 

76 records : `NamedKeyDict` 

77 Mapping from `DimensionElement` to `DimensionRecordStorage` for that 

78 element. 

79 overlaps : `list` [ `DatabaseDimensionOverlapStorage` ] 

80 Objects that manage materialized overlaps between database-backed 

81 dimensions. 

82 dimensionGraphStorage : `_DimensionGraphStorage` 

83 Object that manages saved `DimensionGraph` definitions. 

84 universe : `DimensionUniverse` 

85 All known dimensions. 

86 """ 

87 

88 def __init__( 

89 self, 

90 db: Database, 

91 *, 

92 records: NamedKeyDict[DimensionElement, DimensionRecordStorage], 

93 overlaps: dict[ 

94 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage 

95 ], 

96 dimensionGraphStorage: _DimensionGraphStorage, 

97 universe: DimensionUniverse, 

98 registry_schema_version: VersionTuple | None = None, 

99 ): 

100 super().__init__(universe=universe, registry_schema_version=registry_schema_version) 

101 self._db = db 

102 self._records = records 

103 self._overlaps = overlaps 

104 self._dimensionGraphStorage = dimensionGraphStorage 

105 

106 @classmethod 

107 def initialize( 

108 cls, 

109 db: Database, 

110 context: StaticTablesContext, 

111 *, 

112 universe: DimensionUniverse, 

113 registry_schema_version: VersionTuple | None = None, 

114 ) -> DimensionRecordStorageManager: 

115 # Docstring inherited from DimensionRecordStorageManager. 

116 # Start by initializing governor dimensions; those go both in the main 

117 # 'records' mapping we'll pass to init, and a local dictionary that we 

118 # can pass in when initializing storage for DatabaseDimensionElements. 

119 governors = NamedKeyDict[GovernorDimension, GovernorDimensionRecordStorage]() 

120 records = NamedKeyDict[DimensionElement, DimensionRecordStorage]() 

121 for dimension in universe.getGovernorDimensions(): 

122 governorStorage = dimension.makeStorage(db, context=context) 

123 governors[dimension] = governorStorage 

124 records[dimension] = governorStorage 

125 # Next we initialize storage for DatabaseDimensionElements. Some 

126 # elements' storage may be views into anothers; we'll do a first pass 

127 # to gather a mapping from the names of those targets back to their 

128 # views. 

129 view_targets = { 

130 element.viewOf: element 

131 for element in universe.getDatabaseElements() 

132 if element.viewOf is not None 

133 } 

134 # We remember the spatial ones (grouped by family) so we can go back 

135 # and initialize overlap storage for them later. 

136 spatial = NamedKeyDict[DatabaseTopologicalFamily, list[DatabaseDimensionRecordStorage]]() 

137 for element in universe.getDatabaseElements(): 

138 if element.viewOf is not None: 

139 # We'll initialize this storage when the view's target is 

140 # initialized. 

141 continue 

142 elementStorage = element.makeStorage(db, context=context, governors=governors) 

143 records[element] = elementStorage 

144 if element.spatial is not None: 

145 spatial.setdefault(element.spatial, []).append(elementStorage) 

146 if (view_element := view_targets.get(element.name)) is not None: 

147 view_element_storage = view_element.makeStorage( 

148 db, 

149 context=context, 

150 governors=governors, 

151 view_target=elementStorage, 

152 ) 

153 records[view_element] = view_element_storage 

154 if view_element.spatial is not None: 154 ↛ 155line 154 didn't jump to line 155, because the condition on line 154 was never true

155 spatial.setdefault(view_element.spatial, []).append(view_element_storage) 

156 

157 # Finally we initialize overlap storage. The implementation class for 

158 # this is currently hard-coded (it's not obvious there will ever be 

159 # others). Note that overlaps between database-backed dimensions and 

160 # skypix dimensions is internal to `DatabaseDimensionRecordStorage`, 

161 # and hence is not included here. 

162 from ..dimensions.overlaps import CrossFamilyDimensionOverlapStorage 

163 

164 overlaps: dict[ 

165 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage 

166 ] = {} 

167 for (family1, storages1), (family2, storages2) in itertools.combinations(spatial.items(), 2): 

168 for elementStoragePair in itertools.product(storages1, storages2): 

169 governorStoragePair = (governors[family1.governor], governors[family2.governor]) 

170 if elementStoragePair[0].element > elementStoragePair[1].element: 170 ↛ 171line 170 didn't jump to line 171, because the condition on line 170 was never true

171 elementStoragePair = (elementStoragePair[1], elementStoragePair[0]) 

172 governorStoragePair = (governorStoragePair[1], governorStoragePair[1]) 

173 overlapStorage = CrossFamilyDimensionOverlapStorage.initialize( 

174 db, 

175 elementStoragePair, 

176 governorStoragePair, 

177 context=context, 

178 ) 

179 elementStoragePair[0].connect(overlapStorage) 

180 elementStoragePair[1].connect(overlapStorage) 

181 overlaps[overlapStorage.elements] = overlapStorage 

182 # Create table that stores DimensionGraph definitions. 

183 dimensionGraphStorage = _DimensionGraphStorage.initialize(db, context, universe=universe) 

184 return cls( 

185 db=db, 

186 records=records, 

187 universe=universe, 

188 overlaps=overlaps, 

189 dimensionGraphStorage=dimensionGraphStorage, 

190 registry_schema_version=registry_schema_version, 

191 ) 

192 

193 def get(self, element: DimensionElement | str) -> DimensionRecordStorage | None: 

194 # Docstring inherited from DimensionRecordStorageManager. 

195 r = self._records.get(element) 

196 if r is None: 

197 if isinstance(element, str): 

198 element = self.universe[element] 

199 if isinstance(element, SkyPixDimension): 199 ↛ 201line 199 didn't jump to line 201, because the condition on line 199 was never false

200 return self.universe.skypix[element.system][element.level].makeStorage() 

201 return r 

202 

203 def register(self, element: DimensionElement) -> DimensionRecordStorage: 

204 # Docstring inherited from DimensionRecordStorageManager. 

205 result = self.get(element) 

206 assert result, "All records instances should be created in initialize()." 

207 return result 

208 

209 def saveDimensionGraph(self, graph: DimensionGraph) -> int: 

210 # Docstring inherited from DimensionRecordStorageManager. 

211 return self._dimensionGraphStorage.save(graph) 

212 

213 def loadDimensionGraph(self, key: int) -> DimensionGraph: 

214 # Docstring inherited from DimensionRecordStorageManager. 

215 return self._dimensionGraphStorage.load(key) 

216 

217 def clearCaches(self) -> None: 

218 # Docstring inherited from DimensionRecordStorageManager. 

219 for storage in self._records.values(): 

220 storage.clearCaches() 

221 

222 def make_spatial_join_relation( 

223 self, 

224 element1: str, 

225 element2: str, 

226 context: queries.SqlQueryContext, 

227 governor_constraints: Mapping[str, Set[str]], 

228 existing_relationships: Set[frozenset[str]] = frozenset(), 

229 ) -> tuple[Relation, bool]: 

230 # Docstring inherited. 

231 overlap_relationship = frozenset( 

232 self.universe[element1].dimensions.names | self.universe[element2].dimensions.names 

233 ) 

234 if overlap_relationship in existing_relationships: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true

235 return context.preferred_engine.make_join_identity_relation(), False 

236 storage1 = self[element1] 

237 storage2 = self[element2] 

238 overlaps: Relation | None = None 

239 needs_refinement: bool = False 

240 match (storage1, storage2): 

241 case [ 

242 DatabaseDimensionRecordStorage() as db_storage1, 

243 DatabaseDimensionRecordStorage() as db_storage2, 

244 ]: 

245 # Construction guarantees that we only need to try this in one 

246 # direction; either both storage objects know about the other 

247 # or neither do. 

248 overlaps = db_storage1.make_spatial_join_relation( 

249 db_storage2.element, context, governor_constraints 

250 ) 

251 if overlaps is None: 251 ↛ 299line 251 didn't jump to line 299, because the condition on line 251 was never false

252 # No direct materialized overlaps; use commonSkyPix as an 

253 # intermediary. 

254 have_overlap1_already = ( 

255 frozenset( 

256 self.universe[element1].dimensions.names | {self.universe.commonSkyPix.name} 

257 ) 

258 in existing_relationships 

259 ) 

260 have_overlap2_already = ( 

261 frozenset( 

262 self.universe[element2].dimensions.names | {self.universe.commonSkyPix.name} 

263 ) 

264 in existing_relationships 

265 ) 

266 overlap1 = context.preferred_engine.make_join_identity_relation() 

267 overlap2 = context.preferred_engine.make_join_identity_relation() 

268 if not have_overlap1_already: 

269 overlap1 = cast( 

270 Relation, 

271 db_storage1.make_spatial_join_relation( 

272 self.universe.commonSkyPix, context, governor_constraints 

273 ), 

274 ) 

275 if not have_overlap2_already: 

276 overlap2 = cast( 

277 Relation, 

278 db_storage2.make_spatial_join_relation( 

279 self.universe.commonSkyPix, context, governor_constraints 

280 ), 

281 ) 

282 overlaps = overlap1.join(overlap2) 

283 if not have_overlap1_already and not have_overlap2_already: 

284 # Drop the common skypix ID column from the overlap 

285 # relation we return, since we don't want that column 

286 # to be mistakenly equated with any other appearance of 

287 # that column, since this would mangle queries like 

288 # "join visit to tract and tract to healpix10", by 

289 # incorrectly requiring all visits and healpix10 pixels 

290 # share common skypix pixels, not just tracts. 

291 columns = set(overlaps.columns) 

292 columns.remove(DimensionKeyColumnTag(self.universe.commonSkyPix.name)) 

293 overlaps = overlaps.with_only_columns(columns) 

294 needs_refinement = True 

295 case [DatabaseDimensionRecordStorage() as db_storage, other]: 295 ↛ 296line 295 didn't jump to line 296, because the pattern on line 295 never matched

296 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints) 

297 case [other, DatabaseDimensionRecordStorage() as db_storage]: 297 ↛ 299line 297 didn't jump to line 299, because the pattern on line 297 never matched

298 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints) 

299 if overlaps is None: 

300 # In the future, there's a lot more we could try here: 

301 # 

302 # - for skypix dimensions, looking for materialized overlaps at 

303 # smaller spatial scales (higher-levels) and using bit-shifting; 

304 # 

305 # - for non-skypix dimensions, looking for materialized overlaps 

306 # for more finer-grained members of the same family, and then 

307 # doing SELECT DISTINCT (or even tolerating duplicates) on the 

308 # columns we care about (e.g. use patch overlaps to satisfy a 

309 # request for tract overlaps). 

310 # 

311 # It's not obvious that's better than just telling the user to 

312 # materialize more overlaps, though. 

313 raise MissingSpatialOverlapError( 

314 f"No materialized overlaps for spatial join between {element1!r} and {element2!r}." 

315 ) 

316 return overlaps, needs_refinement 

317 

318 @classmethod 

319 def currentVersions(cls) -> list[VersionTuple]: 

320 # Docstring inherited from VersionedExtension. 

321 return [_VERSION] 

322 

323 

class _DimensionGraphStorage:
    """Helper that persists `DimensionGraph` definitions and caches them.

    Instances are normally obtained from `initialize` rather than by calling
    the constructor directly.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    idTable : `sqlalchemy.schema.Table`
        Table that just holds unique IDs for dimension graphs.
    definitionTable : `sqlalchemy.schema.Table`
        Table that maps dimension names to the IDs of the dimension graphs to
        which they belong.
    universe : `DimensionUniverse`
        All known dimensions.
    """

    def __init__(
        self,
        db: Database,
        idTable: sqlalchemy.schema.Table,
        definitionTable: sqlalchemy.schema.Table,
        universe: DimensionUniverse,
    ):
        self._db = db
        self._idTable = idTable
        self._definitionTable = definitionTable
        self._universe = universe
        # The empty graph is always present in both caches under key 0.
        self._keysByGraph: dict[DimensionGraph, int] = {universe.empty: 0}
        self._graphsByKey: dict[int, DimensionGraph] = {0: universe.empty}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        universe: DimensionUniverse,
    ) -> _DimensionGraphStorage:
        """Declare the backing tables and build a new instance around them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        universe : `DimensionUniverse`
            All known dimensions.

        Returns
        -------
        storage : `_DimensionGraphStorage`
            New instance of this class.
        """
        # Two tables are needed so that one of them has the autoincrement key
        # as its only primary key column, as is required by (at least) SQLite.
        # In other databases, we might be able to use a Sequence directly.
        key_spec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(
                    name="id",
                    dtype=sqlalchemy.BigInteger,
                    autoincrement=True,
                    primaryKey=True,
                ),
            ],
        )
        idTable = context.addTable("dimension_graph_key", key_spec)
        definition_spec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(name="dimension_graph_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                ddl.FieldSpec(name="dimension_name", dtype=sqlalchemy.Text, primaryKey=True),
            ],
            foreignKeys=[
                ddl.ForeignKeySpec(
                    "dimension_graph_key",
                    source=("dimension_graph_id",),
                    target=("id",),
                    onDelete="CASCADE",
                ),
            ],
        )
        definitionTable = context.addTable("dimension_graph_definition", definition_spec)
        return cls(db, idTable, definitionTable, universe=universe)

    def refresh(self) -> None:
        """Rebuild the in-memory caches from the definition table.

        This is invoked internally when a lookup misses, but it may also be
        called explicitly.
        """
        id_column = self._definitionTable.columns.dimension_graph_id
        name_column = self._definitionTable.columns.dimension_name
        with self._db.query(self._definitionTable.select()) as result:
            rows = result.mappings().fetchall()
        # Group the dimension names of each saved graph under its key.
        names_by_key: dict[int, set[str]] = defaultdict(set)
        for row in rows:
            names_by_key[row[id_column]].add(row[name_column])
        # Build fresh caches, seeded with the empty graph, and only swap them
        # in once they are complete.
        fresh_keys: dict[DimensionGraph, int] = {self._universe.empty: 0}
        fresh_graphs: dict[int, DimensionGraph] = {0: self._universe.empty}
        for graph_key, names in names_by_key.items():
            loaded = DimensionGraph(self._universe, names=names)
            fresh_keys[loaded] = graph_key
            fresh_graphs[graph_key] = loaded
        self._graphsByKey = fresh_graphs
        self._keysByGraph = fresh_keys

    def save(self, graph: DimensionGraph) -> int:
        """Store a `DimensionGraph` definition and return its key.

        Parameters
        ----------
        graph : `DimensionGraph`
            Set of dimensions to save.

        Returns
        -------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.
        """
        if (cached := self._keysByGraph.get(graph)) is not None:
            return cached
        # Lock tables and then refresh to guard against races where some other
        # process is trying to register the exact same dimension graph.  This
        # is probably not the most efficient way to do it, but it should be a
        # rare operation, especially since the short-circuit above will
        # usually work in long-lived data repositories.
        with self._db.transaction(lock=[self._idTable, self._definitionTable]):
            self.refresh()
            key = self._keysByGraph.get(graph)
            if key is None:
                # Still unknown after the refresh: allocate a new ID, then
                # record one definition row per required dimension name.
                (key,) = self._db.insert(self._idTable, {}, returnIds=True)  # type: ignore
                definition_rows = [
                    {"dimension_graph_id": key, "dimension_name": name} for name in graph.required.names
                ]
                self._db.insert(self._definitionTable, *definition_rows)
            self._keysByGraph[graph] = key
            self._graphsByKey[key] = graph
        return key

    def load(self, key: int) -> DimensionGraph:
        """Return the `DimensionGraph` previously saved under ``key``.

        Parameters
        ----------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.

        Returns
        -------
        graph : `DimensionGraph`
            Retrieved graph.

        Raises
        ------
        KeyError
            Raised if ``key`` is still unknown after refreshing the cache.
        """
        if self._graphsByKey.get(key) is None:
            # Cache miss: reload from the database once before giving up.
            self.refresh()
            return self._graphsByKey[key]
        return self._graphsByKey[key]