Coverage for python/lsst/daf/butler/registry/dimensions/static.py: 93%

156 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-10-02 07:59 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29import itertools 

30from collections import defaultdict 

31from collections.abc import Mapping, Set 

32from typing import TYPE_CHECKING, cast 

33 

34import sqlalchemy 

35from lsst.daf.relation import Relation 

36 

37from ...core import ( 

38 DatabaseDimensionElement, 

39 DatabaseTopologicalFamily, 

40 DimensionElement, 

41 DimensionGraph, 

42 DimensionKeyColumnTag, 

43 DimensionUniverse, 

44 GovernorDimension, 

45 NamedKeyDict, 

46 SkyPixDimension, 

47 ddl, 

48) 

49from .._exceptions import MissingSpatialOverlapError 

50from ..interfaces import ( 

51 Database, 

52 DatabaseDimensionOverlapStorage, 

53 DatabaseDimensionRecordStorage, 

54 DimensionRecordStorage, 

55 DimensionRecordStorageManager, 

56 GovernorDimensionRecordStorage, 

57 StaticTablesContext, 

58 VersionTuple, 

59) 

60 

61if TYPE_CHECKING: 

62 from .. import queries 

63 

64 

# Version reported by `StaticDimensionRecordStorageManager.currentVersions`.
# This has to be updated on every schema change.
_VERSION = VersionTuple(6, 0, 2)

67 

68 

69class StaticDimensionRecordStorageManager(DimensionRecordStorageManager): 

70 """An implementation of `DimensionRecordStorageManager` for single-layer 

71 `Registry` and the base layers of multi-layer `Registry`. 

72 

73 This manager creates `DimensionRecordStorage` instances for all elements 

74 in the `DimensionUniverse` in its own `initialize` method, as part of 

75 static table creation, so it never needs to manage any dynamic registry 

76 tables. 

77 

78 Parameters 

79 ---------- 

80 db : `Database` 

81 Interface to the underlying database engine and namespace. 

82 records : `NamedKeyDict` 

83 Mapping from `DimensionElement` to `DimensionRecordStorage` for that 

84 element. 

85 overlaps : `list` [ `DatabaseDimensionOverlapStorage` ] 

86 Objects that manage materialized overlaps between database-backed 

87 dimensions. 

88 dimensionGraphStorage : `_DimensionGraphStorage` 

89 Object that manages saved `DimensionGraph` definitions. 

90 universe : `DimensionUniverse` 

91 All known dimensions. 

92 """ 

93 

94 def __init__( 

95 self, 

96 db: Database, 

97 *, 

98 records: NamedKeyDict[DimensionElement, DimensionRecordStorage], 

99 overlaps: dict[ 

100 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage 

101 ], 

102 dimensionGraphStorage: _DimensionGraphStorage, 

103 universe: DimensionUniverse, 

104 registry_schema_version: VersionTuple | None = None, 

105 ): 

106 super().__init__(universe=universe, registry_schema_version=registry_schema_version) 

107 self._db = db 

108 self._records = records 

109 self._overlaps = overlaps 

110 self._dimensionGraphStorage = dimensionGraphStorage 

111 

112 @classmethod 

113 def initialize( 

114 cls, 

115 db: Database, 

116 context: StaticTablesContext, 

117 *, 

118 universe: DimensionUniverse, 

119 registry_schema_version: VersionTuple | None = None, 

120 ) -> DimensionRecordStorageManager: 

121 # Docstring inherited from DimensionRecordStorageManager. 

122 # Start by initializing governor dimensions; those go both in the main 

123 # 'records' mapping we'll pass to init, and a local dictionary that we 

124 # can pass in when initializing storage for DatabaseDimensionElements. 

125 governors = NamedKeyDict[GovernorDimension, GovernorDimensionRecordStorage]() 

126 records = NamedKeyDict[DimensionElement, DimensionRecordStorage]() 

127 for dimension in universe.getGovernorDimensions(): 

128 governorStorage = dimension.makeStorage(db, context=context) 

129 governors[dimension] = governorStorage 

130 records[dimension] = governorStorage 

131 # Next we initialize storage for DatabaseDimensionElements. Some 

132 # elements' storage may be views into anothers; we'll do a first pass 

133 # to gather a mapping from the names of those targets back to their 

134 # views. 

135 view_targets = { 

136 element.viewOf: element 

137 for element in universe.getDatabaseElements() 

138 if element.viewOf is not None 

139 } 

140 # We remember the spatial ones (grouped by family) so we can go back 

141 # and initialize overlap storage for them later. 

142 spatial = NamedKeyDict[DatabaseTopologicalFamily, list[DatabaseDimensionRecordStorage]]() 

143 for element in universe.getDatabaseElements(): 

144 if element.viewOf is not None: 

145 # We'll initialize this storage when the view's target is 

146 # initialized. 

147 continue 

148 elementStorage = element.makeStorage(db, context=context, governors=governors) 

149 records[element] = elementStorage 

150 if element.spatial is not None: 

151 spatial.setdefault(element.spatial, []).append(elementStorage) 

152 if (view_element := view_targets.get(element.name)) is not None: 

153 view_element_storage = view_element.makeStorage( 

154 db, 

155 context=context, 

156 governors=governors, 

157 view_target=elementStorage, 

158 ) 

159 records[view_element] = view_element_storage 

160 if view_element.spatial is not None: 160 ↛ 161line 160 didn't jump to line 161, because the condition on line 160 was never true

161 spatial.setdefault(view_element.spatial, []).append(view_element_storage) 

162 

163 # Finally we initialize overlap storage. The implementation class for 

164 # this is currently hard-coded (it's not obvious there will ever be 

165 # others). Note that overlaps between database-backed dimensions and 

166 # skypix dimensions is internal to `DatabaseDimensionRecordStorage`, 

167 # and hence is not included here. 

168 from ..dimensions.overlaps import CrossFamilyDimensionOverlapStorage 

169 

170 overlaps: dict[ 

171 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage 

172 ] = {} 

173 for (family1, storages1), (family2, storages2) in itertools.combinations(spatial.items(), 2): 

174 for elementStoragePair in itertools.product(storages1, storages2): 

175 governorStoragePair = (governors[family1.governor], governors[family2.governor]) 

176 if elementStoragePair[0].element > elementStoragePair[1].element: 176 ↛ 177line 176 didn't jump to line 177, because the condition on line 176 was never true

177 elementStoragePair = (elementStoragePair[1], elementStoragePair[0]) 

178 governorStoragePair = (governorStoragePair[1], governorStoragePair[1]) 

179 overlapStorage = CrossFamilyDimensionOverlapStorage.initialize( 

180 db, 

181 elementStoragePair, 

182 governorStoragePair, 

183 context=context, 

184 ) 

185 elementStoragePair[0].connect(overlapStorage) 

186 elementStoragePair[1].connect(overlapStorage) 

187 overlaps[overlapStorage.elements] = overlapStorage 

188 # Create table that stores DimensionGraph definitions. 

189 dimensionGraphStorage = _DimensionGraphStorage.initialize(db, context, universe=universe) 

190 return cls( 

191 db=db, 

192 records=records, 

193 universe=universe, 

194 overlaps=overlaps, 

195 dimensionGraphStorage=dimensionGraphStorage, 

196 registry_schema_version=registry_schema_version, 

197 ) 

198 

199 def get(self, element: DimensionElement | str) -> DimensionRecordStorage | None: 

200 # Docstring inherited from DimensionRecordStorageManager. 

201 r = self._records.get(element) 

202 if r is None: 

203 if isinstance(element, str): 

204 element = self.universe[element] 

205 if isinstance(element, SkyPixDimension): 205 ↛ 207line 205 didn't jump to line 207, because the condition on line 205 was never false

206 return self.universe.skypix[element.system][element.level].makeStorage() 

207 return r 

208 

209 def register(self, element: DimensionElement) -> DimensionRecordStorage: 

210 # Docstring inherited from DimensionRecordStorageManager. 

211 result = self.get(element) 

212 assert result, "All records instances should be created in initialize()." 

213 return result 

214 

215 def saveDimensionGraph(self, graph: DimensionGraph) -> int: 

216 # Docstring inherited from DimensionRecordStorageManager. 

217 return self._dimensionGraphStorage.save(graph) 

218 

219 def loadDimensionGraph(self, key: int) -> DimensionGraph: 

220 # Docstring inherited from DimensionRecordStorageManager. 

221 return self._dimensionGraphStorage.load(key) 

222 

223 def clearCaches(self) -> None: 

224 # Docstring inherited from DimensionRecordStorageManager. 

225 for storage in self._records.values(): 

226 storage.clearCaches() 

227 

228 def make_spatial_join_relation( 

229 self, 

230 element1: str, 

231 element2: str, 

232 context: queries.SqlQueryContext, 

233 governor_constraints: Mapping[str, Set[str]], 

234 existing_relationships: Set[frozenset[str]] = frozenset(), 

235 ) -> tuple[Relation, bool]: 

236 # Docstring inherited. 

237 overlap_relationship = frozenset( 

238 self.universe[element1].dimensions.names | self.universe[element2].dimensions.names 

239 ) 

240 if overlap_relationship in existing_relationships: 240 ↛ 241line 240 didn't jump to line 241, because the condition on line 240 was never true

241 return context.preferred_engine.make_join_identity_relation(), False 

242 storage1 = self[element1] 

243 storage2 = self[element2] 

244 overlaps: Relation | None = None 

245 needs_refinement: bool = False 

246 match (storage1, storage2): 

247 case [ 

248 DatabaseDimensionRecordStorage() as db_storage1, 

249 DatabaseDimensionRecordStorage() as db_storage2, 

250 ]: 

251 # Construction guarantees that we only need to try this in one 

252 # direction; either both storage objects know about the other 

253 # or neither do. 

254 overlaps = db_storage1.make_spatial_join_relation( 

255 db_storage2.element, context, governor_constraints 

256 ) 

257 if overlaps is None: 257 ↛ 305line 257 didn't jump to line 305, because the condition on line 257 was never false

258 # No direct materialized overlaps; use commonSkyPix as an 

259 # intermediary. 

260 have_overlap1_already = ( 

261 frozenset( 

262 self.universe[element1].dimensions.names | {self.universe.commonSkyPix.name} 

263 ) 

264 in existing_relationships 

265 ) 

266 have_overlap2_already = ( 

267 frozenset( 

268 self.universe[element2].dimensions.names | {self.universe.commonSkyPix.name} 

269 ) 

270 in existing_relationships 

271 ) 

272 overlap1 = context.preferred_engine.make_join_identity_relation() 

273 overlap2 = context.preferred_engine.make_join_identity_relation() 

274 if not have_overlap1_already: 

275 overlap1 = cast( 

276 Relation, 

277 db_storage1.make_spatial_join_relation( 

278 self.universe.commonSkyPix, context, governor_constraints 

279 ), 

280 ) 

281 if not have_overlap2_already: 

282 overlap2 = cast( 

283 Relation, 

284 db_storage2.make_spatial_join_relation( 

285 self.universe.commonSkyPix, context, governor_constraints 

286 ), 

287 ) 

288 overlaps = overlap1.join(overlap2) 

289 if not have_overlap1_already and not have_overlap2_already: 

290 # Drop the common skypix ID column from the overlap 

291 # relation we return, since we don't want that column 

292 # to be mistakenly equated with any other appearance of 

293 # that column, since this would mangle queries like 

294 # "join visit to tract and tract to healpix10", by 

295 # incorrectly requiring all visits and healpix10 pixels 

296 # share common skypix pixels, not just tracts. 

297 columns = set(overlaps.columns) 

298 columns.remove(DimensionKeyColumnTag(self.universe.commonSkyPix.name)) 

299 overlaps = overlaps.with_only_columns(columns) 

300 needs_refinement = True 

301 case [DatabaseDimensionRecordStorage() as db_storage, other]: 301 ↛ 302line 301 didn't jump to line 302, because the pattern on line 301 never matched

302 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints) 

303 case [other, DatabaseDimensionRecordStorage() as db_storage]: 303 ↛ 305line 303 didn't jump to line 305, because the pattern on line 303 never matched

304 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints) 

305 if overlaps is None: 

306 # In the future, there's a lot more we could try here: 

307 # 

308 # - for skypix dimensions, looking for materialized overlaps at 

309 # smaller spatial scales (higher-levels) and using bit-shifting; 

310 # 

311 # - for non-skypix dimensions, looking for materialized overlaps 

312 # for more finer-grained members of the same family, and then 

313 # doing SELECT DISTINCT (or even tolerating duplicates) on the 

314 # columns we care about (e.g. use patch overlaps to satisfy a 

315 # request for tract overlaps). 

316 # 

317 # It's not obvious that's better than just telling the user to 

318 # materialize more overlaps, though. 

319 raise MissingSpatialOverlapError( 

320 f"No materialized overlaps for spatial join between {element1!r} and {element2!r}." 

321 ) 

322 return overlaps, needs_refinement 

323 

324 @classmethod 

325 def currentVersions(cls) -> list[VersionTuple]: 

326 # Docstring inherited from VersionedExtension. 

327 return [_VERSION] 

328 

329 

class _DimensionGraphStorage:
    """Helper that persists `DimensionGraph` definitions and caches them.

    Instances should normally be obtained from `initialize` rather than by
    calling the constructor directly.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    idTable : `sqlalchemy.schema.Table`
        Table that just holds unique IDs for dimension graphs.
    definitionTable : `sqlalchemy.schema.Table`
        Table that maps dimension names to the IDs of the dimension graphs to
        which they belong.
    universe : `DimensionUniverse`
        All known dimensions.
    """

    def __init__(
        self,
        db: Database,
        idTable: sqlalchemy.schema.Table,
        definitionTable: sqlalchemy.schema.Table,
        universe: DimensionUniverse,
    ):
        self._db = db
        self._idTable = idTable
        self._definitionTable = definitionTable
        self._universe = universe
        # Both caches start out holding only the empty graph, under key 0.
        empty_graph = universe.empty
        self._keysByGraph: dict[DimensionGraph, int] = {empty_graph: 0}
        self._graphsByKey: dict[int, DimensionGraph] = {0: empty_graph}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        universe: DimensionUniverse,
    ) -> _DimensionGraphStorage:
        """Declare the backing tables and return a new storage object.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        universe : `DimensionUniverse`
            All known dimensions.

        Returns
        -------
        storage : `_DimensionGraphStorage`
            New instance of this class.
        """
        # Two tables are needed so that one of them has the autoincrement key
        # as its only primary key column, as (at least) SQLite requires.  In
        # other databases, we might be able to use a Sequence directly.
        key_spec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(
                    name="id",
                    dtype=sqlalchemy.BigInteger,
                    autoincrement=True,
                    primaryKey=True,
                ),
            ],
        )
        key_table = context.addTable("dimension_graph_key", key_spec)
        definition_spec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(name="dimension_graph_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                ddl.FieldSpec(name="dimension_name", dtype=sqlalchemy.Text, primaryKey=True),
            ],
            foreignKeys=[
                ddl.ForeignKeySpec(
                    "dimension_graph_key",
                    source=("dimension_graph_id",),
                    target=("id",),
                    onDelete="CASCADE",
                ),
            ],
        )
        definition_table = context.addTable("dimension_graph_definition", definition_spec)
        return cls(db, key_table, definition_table, universe=universe)

    def refresh(self) -> None:
        """Rebuild the in-memory caches from the definition table.

        This is invoked internally by `save` and `load` when they need it,
        but it may also be called explicitly.
        """
        with self._db.query(self._definitionTable.select()) as sql_result:
            fetched = sql_result.mappings().fetchall()
        # Collect the dimension names recorded for each graph key.
        names_by_key: dict[int, set[str]] = {}
        for record in fetched:
            graph_key = record[self._definitionTable.columns.dimension_graph_id]
            dimension_name = record[self._definitionTable.columns.dimension_name]
            names_by_key.setdefault(graph_key, set()).add(dimension_name)
        # Build replacement caches, seeded with the empty graph at key 0, and
        # only swap them in once both are complete.
        fresh_keys: dict[DimensionGraph, int] = {self._universe.empty: 0}
        fresh_graphs: dict[int, DimensionGraph] = {0: self._universe.empty}
        for graph_key, names in names_by_key.items():
            rebuilt = DimensionGraph(self._universe, names=names)
            fresh_keys[rebuilt] = graph_key
            fresh_graphs[graph_key] = rebuilt
        self._graphsByKey = fresh_graphs
        self._keysByGraph = fresh_keys

    def save(self, graph: DimensionGraph) -> int:
        """Store a `DimensionGraph` definition and return its key.

        Parameters
        ----------
        graph : `DimensionGraph`
            Set of dimensions to save.

        Returns
        -------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.
        """
        cached_key = self._keysByGraph.get(graph)
        if cached_key is not None:
            return cached_key
        # Lock tables and then refresh to guard against races where some other
        # process is trying to register the exact same dimension graph.  This
        # is probably not the most efficient way to do it, but it should be a
        # rare operation, especially since the short-circuit above will usually
        # work in long-lived data repositories.
        with self._db.transaction(lock=[self._idTable, self._definitionTable]):
            self.refresh()
            key = self._keysByGraph.get(graph)
            if key is None:
                (key,) = self._db.insert(self._idTable, {}, returnIds=True)  # type: ignore
                # Only the required dimension names are written out.
                definition_rows = [
                    {"dimension_graph_id": key, "dimension_name": name} for name in graph.required.names
                ]
                self._db.insert(self._definitionTable, *definition_rows)
            self._keysByGraph[graph] = key
            self._graphsByKey[key] = graph
        return key

    def load(self, key: int) -> DimensionGraph:
        """Return the `DimensionGraph` previously saved under ``key``.

        Parameters
        ----------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.

        Returns
        -------
        graph : `DimensionGraph`
            Retrieved graph.
        """
        if key not in self._graphsByKey:
            # Cache miss: re-read the definitions before looking again.
            self.refresh()
        return self._graphsByKey[key]