Coverage for python/lsst/daf/butler/registry/dimensions/static.py: 93%
156 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-28 10:09 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23import itertools
24from collections import defaultdict
25from collections.abc import Mapping, Set
26from typing import TYPE_CHECKING, cast
28import sqlalchemy
29from lsst.daf.relation import Relation
31from ...core import (
32 DatabaseDimensionElement,
33 DatabaseTopologicalFamily,
34 DimensionElement,
35 DimensionGraph,
36 DimensionKeyColumnTag,
37 DimensionUniverse,
38 GovernorDimension,
39 NamedKeyDict,
40 SkyPixDimension,
41 ddl,
42)
43from .._exceptions import MissingSpatialOverlapError
44from ..interfaces import (
45 Database,
46 DatabaseDimensionOverlapStorage,
47 DatabaseDimensionRecordStorage,
48 DimensionRecordStorage,
49 DimensionRecordStorageManager,
50 GovernorDimensionRecordStorage,
51 StaticTablesContext,
52 VersionTuple,
53)
55if TYPE_CHECKING:
56 from .. import queries
# Version reported by `StaticDimensionRecordStorageManager.currentVersions`;
# it must be bumped whenever the schema declared in this module changes.
_VERSION = VersionTuple(6, 0, 2)
63class StaticDimensionRecordStorageManager(DimensionRecordStorageManager):
64 """An implementation of `DimensionRecordStorageManager` for single-layer
65 `Registry` and the base layers of multi-layer `Registry`.
67 This manager creates `DimensionRecordStorage` instances for all elements
68 in the `DimensionUniverse` in its own `initialize` method, as part of
69 static table creation, so it never needs to manage any dynamic registry
70 tables.
72 Parameters
73 ----------
74 db : `Database`
75 Interface to the underlying database engine and namespace.
76 records : `NamedKeyDict`
77 Mapping from `DimensionElement` to `DimensionRecordStorage` for that
78 element.
79 overlaps : `list` [ `DatabaseDimensionOverlapStorage` ]
80 Objects that manage materialized overlaps between database-backed
81 dimensions.
82 dimensionGraphStorage : `_DimensionGraphStorage`
83 Object that manages saved `DimensionGraph` definitions.
84 universe : `DimensionUniverse`
85 All known dimensions.
86 """
88 def __init__(
89 self,
90 db: Database,
91 *,
92 records: NamedKeyDict[DimensionElement, DimensionRecordStorage],
93 overlaps: dict[
94 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage
95 ],
96 dimensionGraphStorage: _DimensionGraphStorage,
97 universe: DimensionUniverse,
98 registry_schema_version: VersionTuple | None = None,
99 ):
100 super().__init__(universe=universe, registry_schema_version=registry_schema_version)
101 self._db = db
102 self._records = records
103 self._overlaps = overlaps
104 self._dimensionGraphStorage = dimensionGraphStorage
106 @classmethod
107 def initialize(
108 cls,
109 db: Database,
110 context: StaticTablesContext,
111 *,
112 universe: DimensionUniverse,
113 registry_schema_version: VersionTuple | None = None,
114 ) -> DimensionRecordStorageManager:
115 # Docstring inherited from DimensionRecordStorageManager.
116 # Start by initializing governor dimensions; those go both in the main
117 # 'records' mapping we'll pass to init, and a local dictionary that we
118 # can pass in when initializing storage for DatabaseDimensionElements.
119 governors = NamedKeyDict[GovernorDimension, GovernorDimensionRecordStorage]()
120 records = NamedKeyDict[DimensionElement, DimensionRecordStorage]()
121 for dimension in universe.getGovernorDimensions():
122 governorStorage = dimension.makeStorage(db, context=context)
123 governors[dimension] = governorStorage
124 records[dimension] = governorStorage
125 # Next we initialize storage for DatabaseDimensionElements. Some
126 # elements' storage may be views into anothers; we'll do a first pass
127 # to gather a mapping from the names of those targets back to their
128 # views.
129 view_targets = {
130 element.viewOf: element
131 for element in universe.getDatabaseElements()
132 if element.viewOf is not None
133 }
134 # We remember the spatial ones (grouped by family) so we can go back
135 # and initialize overlap storage for them later.
136 spatial = NamedKeyDict[DatabaseTopologicalFamily, list[DatabaseDimensionRecordStorage]]()
137 for element in universe.getDatabaseElements():
138 if element.viewOf is not None:
139 # We'll initialize this storage when the view's target is
140 # initialized.
141 continue
142 elementStorage = element.makeStorage(db, context=context, governors=governors)
143 records[element] = elementStorage
144 if element.spatial is not None:
145 spatial.setdefault(element.spatial, []).append(elementStorage)
146 if (view_element := view_targets.get(element.name)) is not None:
147 view_element_storage = view_element.makeStorage(
148 db,
149 context=context,
150 governors=governors,
151 view_target=elementStorage,
152 )
153 records[view_element] = view_element_storage
154 if view_element.spatial is not None: 154 ↛ 155line 154 didn't jump to line 155, because the condition on line 154 was never true
155 spatial.setdefault(view_element.spatial, []).append(view_element_storage)
157 # Finally we initialize overlap storage. The implementation class for
158 # this is currently hard-coded (it's not obvious there will ever be
159 # others). Note that overlaps between database-backed dimensions and
160 # skypix dimensions is internal to `DatabaseDimensionRecordStorage`,
161 # and hence is not included here.
162 from ..dimensions.overlaps import CrossFamilyDimensionOverlapStorage
164 overlaps: dict[
165 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage
166 ] = {}
167 for (family1, storages1), (family2, storages2) in itertools.combinations(spatial.items(), 2):
168 for elementStoragePair in itertools.product(storages1, storages2):
169 governorStoragePair = (governors[family1.governor], governors[family2.governor])
170 if elementStoragePair[0].element > elementStoragePair[1].element: 170 ↛ 171line 170 didn't jump to line 171, because the condition on line 170 was never true
171 elementStoragePair = (elementStoragePair[1], elementStoragePair[0])
172 governorStoragePair = (governorStoragePair[1], governorStoragePair[1])
173 overlapStorage = CrossFamilyDimensionOverlapStorage.initialize(
174 db,
175 elementStoragePair,
176 governorStoragePair,
177 context=context,
178 )
179 elementStoragePair[0].connect(overlapStorage)
180 elementStoragePair[1].connect(overlapStorage)
181 overlaps[overlapStorage.elements] = overlapStorage
182 # Create table that stores DimensionGraph definitions.
183 dimensionGraphStorage = _DimensionGraphStorage.initialize(db, context, universe=universe)
184 return cls(
185 db=db,
186 records=records,
187 universe=universe,
188 overlaps=overlaps,
189 dimensionGraphStorage=dimensionGraphStorage,
190 registry_schema_version=registry_schema_version,
191 )
193 def get(self, element: DimensionElement | str) -> DimensionRecordStorage | None:
194 # Docstring inherited from DimensionRecordStorageManager.
195 r = self._records.get(element)
196 if r is None:
197 if isinstance(element, str):
198 element = self.universe[element]
199 if isinstance(element, SkyPixDimension): 199 ↛ 201line 199 didn't jump to line 201, because the condition on line 199 was never false
200 return self.universe.skypix[element.system][element.level].makeStorage()
201 return r
203 def register(self, element: DimensionElement) -> DimensionRecordStorage:
204 # Docstring inherited from DimensionRecordStorageManager.
205 result = self.get(element)
206 assert result, "All records instances should be created in initialize()."
207 return result
209 def saveDimensionGraph(self, graph: DimensionGraph) -> int:
210 # Docstring inherited from DimensionRecordStorageManager.
211 return self._dimensionGraphStorage.save(graph)
213 def loadDimensionGraph(self, key: int) -> DimensionGraph:
214 # Docstring inherited from DimensionRecordStorageManager.
215 return self._dimensionGraphStorage.load(key)
217 def clearCaches(self) -> None:
218 # Docstring inherited from DimensionRecordStorageManager.
219 for storage in self._records.values():
220 storage.clearCaches()
222 def make_spatial_join_relation(
223 self,
224 element1: str,
225 element2: str,
226 context: queries.SqlQueryContext,
227 governor_constraints: Mapping[str, Set[str]],
228 existing_relationships: Set[frozenset[str]] = frozenset(),
229 ) -> tuple[Relation, bool]:
230 # Docstring inherited.
231 overlap_relationship = frozenset(
232 self.universe[element1].dimensions.names | self.universe[element2].dimensions.names
233 )
234 if overlap_relationship in existing_relationships: 234 ↛ 235line 234 didn't jump to line 235, because the condition on line 234 was never true
235 return context.preferred_engine.make_join_identity_relation(), False
236 storage1 = self[element1]
237 storage2 = self[element2]
238 overlaps: Relation | None = None
239 needs_refinement: bool = False
240 match (storage1, storage2):
241 case [
242 DatabaseDimensionRecordStorage() as db_storage1,
243 DatabaseDimensionRecordStorage() as db_storage2,
244 ]:
245 # Construction guarantees that we only need to try this in one
246 # direction; either both storage objects know about the other
247 # or neither do.
248 overlaps = db_storage1.make_spatial_join_relation(
249 db_storage2.element, context, governor_constraints
250 )
251 if overlaps is None: 251 ↛ 299line 251 didn't jump to line 299, because the condition on line 251 was never false
252 # No direct materialized overlaps; use commonSkyPix as an
253 # intermediary.
254 have_overlap1_already = (
255 frozenset(
256 self.universe[element1].dimensions.names | {self.universe.commonSkyPix.name}
257 )
258 in existing_relationships
259 )
260 have_overlap2_already = (
261 frozenset(
262 self.universe[element2].dimensions.names | {self.universe.commonSkyPix.name}
263 )
264 in existing_relationships
265 )
266 overlap1 = context.preferred_engine.make_join_identity_relation()
267 overlap2 = context.preferred_engine.make_join_identity_relation()
268 if not have_overlap1_already:
269 overlap1 = cast(
270 Relation,
271 db_storage1.make_spatial_join_relation(
272 self.universe.commonSkyPix, context, governor_constraints
273 ),
274 )
275 if not have_overlap2_already:
276 overlap2 = cast(
277 Relation,
278 db_storage2.make_spatial_join_relation(
279 self.universe.commonSkyPix, context, governor_constraints
280 ),
281 )
282 overlaps = overlap1.join(overlap2)
283 if not have_overlap1_already and not have_overlap2_already:
284 # Drop the common skypix ID column from the overlap
285 # relation we return, since we don't want that column
286 # to be mistakenly equated with any other appearance of
287 # that column, since this would mangle queries like
288 # "join visit to tract and tract to healpix10", by
289 # incorrectly requiring all visits and healpix10 pixels
290 # share common skypix pixels, not just tracts.
291 columns = set(overlaps.columns)
292 columns.remove(DimensionKeyColumnTag(self.universe.commonSkyPix.name))
293 overlaps = overlaps.with_only_columns(columns)
294 needs_refinement = True
295 case [DatabaseDimensionRecordStorage() as db_storage, other]: 295 ↛ 296line 295 didn't jump to line 296, because the pattern on line 295 never matched
296 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints)
297 case [other, DatabaseDimensionRecordStorage() as db_storage]: 297 ↛ 299line 297 didn't jump to line 299, because the pattern on line 297 never matched
298 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints)
299 if overlaps is None:
300 # In the future, there's a lot more we could try here:
301 #
302 # - for skypix dimensions, looking for materialized overlaps at
303 # smaller spatial scales (higher-levels) and using bit-shifting;
304 #
305 # - for non-skypix dimensions, looking for materialized overlaps
306 # for more finer-grained members of the same family, and then
307 # doing SELECT DISTINCT (or even tolerating duplicates) on the
308 # columns we care about (e.g. use patch overlaps to satisfy a
309 # request for tract overlaps).
310 #
311 # It's not obvious that's better than just telling the user to
312 # materialize more overlaps, though.
313 raise MissingSpatialOverlapError(
314 f"No materialized overlaps for spatial join between {element1!r} and {element2!r}."
315 )
316 return overlaps, needs_refinement
318 @classmethod
319 def currentVersions(cls) -> list[VersionTuple]:
320 # Docstring inherited from VersionedExtension.
321 return [_VERSION]
class _DimensionGraphStorage:
    """Persistence helper for saved `DimensionGraph` definitions.

    Instances are normally obtained from `initialize` rather than by calling
    the constructor directly.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    idTable : `sqlalchemy.schema.Table`
        Table whose only role is to hand out unique dimension-graph IDs.
    definitionTable : `sqlalchemy.schema.Table`
        Table associating dimension names with the ID of each dimension
        graph they belong to.
    universe : `DimensionUniverse`
        All known dimensions.
    """

    def __init__(
        self,
        db: Database,
        idTable: sqlalchemy.schema.Table,
        definitionTable: sqlalchemy.schema.Table,
        universe: DimensionUniverse,
    ):
        self._db = db
        self._idTable = idTable
        self._definitionTable = definitionTable
        self._universe = universe
        # The empty graph is always known, under key 0, without touching the
        # database.
        empty = universe.empty
        self._keysByGraph: dict[DimensionGraph, int] = {empty: 0}
        self._graphsByKey: dict[int, DimensionGraph] = {0: empty}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        universe: DimensionUniverse,
    ) -> _DimensionGraphStorage:
        """Build a new instance, declaring its tables along the way.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        universe : `DimensionUniverse`
            All known dimensions.

        Returns
        -------
        storage : `_DimensionGraphStorage`
            New instance of this class.
        """
        # Two tables are used so that one of them has the autoincrement key
        # as its sole primary key column, which (at least) SQLite requires.
        # Other databases might allow a Sequence to be used directly instead.
        key_spec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(
                    name="id",
                    dtype=sqlalchemy.BigInteger,
                    autoincrement=True,
                    primaryKey=True,
                ),
            ],
        )
        definition_spec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(name="dimension_graph_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                ddl.FieldSpec(name="dimension_name", dtype=sqlalchemy.Text, primaryKey=True),
            ],
            foreignKeys=[
                ddl.ForeignKeySpec(
                    "dimension_graph_key",
                    source=("dimension_graph_id",),
                    target=("id",),
                    onDelete="CASCADE",
                ),
            ],
        )
        key_table = context.addTable("dimension_graph_key", key_spec)
        definition_table = context.addTable("dimension_graph_definition", definition_spec)
        return cls(db, key_table, definition_table, universe=universe)

    def refresh(self) -> None:
        """Rebuild the in-memory cache of saved DimensionGraph definitions.

        This is invoked automatically when needed, but calling it explicitly
        is also allowed.
        """
        with self._db.query(self._definitionTable.select()) as sql_result:
            fetched = sql_result.mappings().fetchall()
        id_column = self._definitionTable.columns.dimension_graph_id
        name_column = self._definitionTable.columns.dimension_name
        # Collect the dimension names recorded under each graph ID.
        names_by_key: dict[int, set[str]] = defaultdict(set)
        for record in fetched:
            names_by_key[record[id_column]].add(record[name_column])
        loaded = {
            graph_key: DimensionGraph(self._universe, names=names)
            for graph_key, names in names_by_key.items()
        }
        # The empty graph (key 0) is always seeded first, then the database
        # contents are layered on top.
        empty = self._universe.empty
        self._graphsByKey = {0: empty, **loaded}
        self._keysByGraph = {empty: 0, **{graph: graph_key for graph_key, graph in loaded.items()}}

    def save(self, graph: DimensionGraph) -> int:
        """Store a `DimensionGraph` definition in the database so that it can
        be fetched again later with the returned key.

        Parameters
        ----------
        graph : `DimensionGraph`
            Set of dimensions to save.

        Returns
        -------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.
        """
        cached = self._keysByGraph.get(graph)
        if cached is not None:
            return cached
        # Take a lock on both tables and refresh before inserting, to guard
        # against another process registering the very same dimension graph
        # concurrently.  This is probably not the most efficient approach,
        # but the operation should be rare, especially since the cache lookup
        # above usually succeeds in long-lived data repositories.
        with self._db.transaction(lock=[self._idTable, self._definitionTable]):
            self.refresh()
            key = self._keysByGraph.get(graph)
            if key is None:
                (key,) = self._db.insert(self._idTable, {}, returnIds=True)  # type: ignore
                definition_rows = [
                    {"dimension_graph_id": key, "dimension_name": name} for name in graph.required.names
                ]
                self._db.insert(self._definitionTable, *definition_rows)
            self._keysByGraph[graph] = key
            self._graphsByKey[key] = graph
        return key

    def load(self, key: int) -> DimensionGraph:
        """Look up a `DimensionGraph` that was saved in the database earlier.

        Parameters
        ----------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.

        Returns
        -------
        graph : `DimensionGraph`
            Retrieved graph.
        """
        if self._graphsByKey.get(key) is None:
            # Not cached yet; reload from the database and look again (a
            # still-missing key surfaces as `KeyError`).
            self.refresh()
        return self._graphsByKey[key]