Coverage for python/lsst/daf/butler/registry/dimensions/static.py: 93%
157 statements
coverage.py v7.3.2, created at 2023-12-01 10:59 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29import itertools
30from collections import defaultdict
31from collections.abc import Mapping, Set
32from typing import TYPE_CHECKING, cast
34import sqlalchemy
35from lsst.daf.relation import Relation
37from ... import ddl
38from ..._column_tags import DimensionKeyColumnTag
39from ..._named import NamedKeyDict
40from ...dimensions import (
41 DatabaseDimensionElement,
42 DatabaseTopologicalFamily,
43 DimensionElement,
44 DimensionGroup,
45 DimensionUniverse,
46 GovernorDimension,
47)
48from .._exceptions import MissingSpatialOverlapError
49from ..interfaces import (
50 Database,
51 DatabaseDimensionOverlapStorage,
52 DatabaseDimensionRecordStorage,
53 DimensionRecordStorage,
54 DimensionRecordStorageManager,
55 GovernorDimensionRecordStorage,
56 StaticTablesContext,
57 VersionTuple,
58)
60if TYPE_CHECKING:
61 from .. import queries
64# This has to be updated on every schema change
65_VERSION = VersionTuple(6, 0, 2)
68class StaticDimensionRecordStorageManager(DimensionRecordStorageManager):
69 """An implementation of `DimensionRecordStorageManager` for single-layer
70 `Registry` and the base layers of multi-layer `Registry`.
72 This manager creates `DimensionRecordStorage` instances for all elements
73 in the `DimensionUniverse` in its own `initialize` method, as part of
74 static table creation, so it never needs to manage any dynamic registry
75 tables.
77 Parameters
78 ----------
79 db : `Database`
80 Interface to the underlying database engine and namespace.
81 records : `NamedKeyDict`
82 Mapping from `DimensionElement` to `DimensionRecordStorage` for that
83 element.
84 overlaps : `list` [ `DatabaseDimensionOverlapStorage` ]
85 Objects that manage materialized overlaps between database-backed
86 dimensions.
87 dimension_group_storage : `_DimensionGroupStorage`
88 Object that manages saved `DimensionGroup` definitions.
89 universe : `DimensionUniverse`
90 All known dimensions.
91 """
93 def __init__(
94 self,
95 db: Database,
96 *,
97 records: NamedKeyDict[DimensionElement, DimensionRecordStorage],
98 overlaps: dict[
99 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage
100 ],
101 dimension_group_storage: _DimensionGroupStorage,
102 universe: DimensionUniverse,
103 registry_schema_version: VersionTuple | None = None,
104 ):
105 super().__init__(universe=universe, registry_schema_version=registry_schema_version)
106 self._db = db
107 self._records = records
108 self._overlaps = overlaps
109 self._dimension_group_storage = dimension_group_storage
111 @classmethod
112 def initialize(
113 cls,
114 db: Database,
115 context: StaticTablesContext,
116 *,
117 universe: DimensionUniverse,
118 registry_schema_version: VersionTuple | None = None,
119 ) -> DimensionRecordStorageManager:
120 # Docstring inherited from DimensionRecordStorageManager.
121 # Start by initializing governor dimensions; those go both in the main
122 # 'records' mapping we'll pass to init, and a local dictionary that we
123 # can pass in when initializing storage for DatabaseDimensionElements.
124 governors = NamedKeyDict[GovernorDimension, GovernorDimensionRecordStorage]()
125 records = NamedKeyDict[DimensionElement, DimensionRecordStorage]()
126 for dimension in universe.governor_dimensions:
127 governorStorage = dimension.makeStorage(db, context=context)
128 governors[dimension] = governorStorage
129 records[dimension] = governorStorage
130 # Next we initialize storage for DatabaseDimensionElements. Some
131 # elements' storage may be views of other elements' storage; we'll do a first pass
132 # to gather a mapping from the names of those targets back to their
133 # views.
134 view_targets = {
135 element.viewOf: element for element in universe.database_elements if element.viewOf is not None
136 }
137 # We remember the spatial ones (grouped by family) so we can go back
138 # and initialize overlap storage for them later.
139 spatial = NamedKeyDict[DatabaseTopologicalFamily, list[DatabaseDimensionRecordStorage]]()
140 for element in universe.database_elements:
141 if element.viewOf is not None:
142 # We'll initialize this storage when the view's target is
143 # initialized.
144 continue
145 elementStorage = element.makeStorage(db, context=context, governors=governors)
146 records[element] = elementStorage
147 if element.spatial is not None:
148 spatial.setdefault(element.spatial, []).append(elementStorage)
149 if (view_element := view_targets.get(element.name)) is not None:
150 view_element_storage = view_element.makeStorage(
151 db,
152 context=context,
153 governors=governors,
154 view_target=elementStorage,
155 )
156 records[view_element] = view_element_storage
157 if view_element.spatial is not None: 157 ↛ 158 (the condition on line 157 was never true)
158 spatial.setdefault(view_element.spatial, []).append(view_element_storage)
160 # Finally we initialize overlap storage. The implementation class for
161 # this is currently hard-coded (it's not obvious there will ever be
162 # others). Note that overlaps between database-backed dimensions and
163 # skypix dimensions are internal to `DatabaseDimensionRecordStorage`,
164 # and hence are not included here.
165 from ..dimensions.overlaps import CrossFamilyDimensionOverlapStorage
167 overlaps: dict[
168 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage
169 ] = {}
170 for (family1, storages1), (family2, storages2) in itertools.combinations(spatial.items(), 2):
171 for elementStoragePair in itertools.product(storages1, storages2):
172 governorStoragePair = (governors[family1.governor], governors[family2.governor])
173 if elementStoragePair[0].element > elementStoragePair[1].element: 173 ↛ 174 (the condition on line 173 was never true)
174 elementStoragePair = (elementStoragePair[1], elementStoragePair[0])
175 governorStoragePair = (governorStoragePair[1], governorStoragePair[0])
176 overlapStorage = CrossFamilyDimensionOverlapStorage.initialize(
177 db,
178 elementStoragePair,
179 governorStoragePair,
180 context=context,
181 )
182 elementStoragePair[0].connect(overlapStorage)
183 elementStoragePair[1].connect(overlapStorage)
184 overlaps[overlapStorage.elements] = overlapStorage
185 # Create table that stores DimensionGraph definitions.
186 dimension_group_storage = _DimensionGroupStorage.initialize(db, context, universe=universe)
187 return cls(
188 db=db,
189 records=records,
190 universe=universe,
191 overlaps=overlaps,
192 dimension_group_storage=dimension_group_storage,
193 registry_schema_version=registry_schema_version,
194 )
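
The pairing logic above is worth seeing in isolation: itertools.combinations enumerates each unordered pair of spatial families exactly once, and itertools.product then enumerates every cross-family pair of storage objects within it. A minimal standalone sketch of that pattern, using plain strings in place of the real storage objects (the family and element names below are illustrative only):

    import itertools

    # Hypothetical spatial families and their member elements, standing in for
    # the NamedKeyDict of DatabaseDimensionRecordStorage lists built above.
    spatial = {
        "skymap_regions": ["tract", "patch"],
        "observation_regions": ["visit", "visit_detector_region"],
    }

    # combinations() yields each unordered pair of families exactly once, and
    # product() then yields every cross-family pair of members within it.
    for (family1, members1), (family2, members2) in itertools.combinations(spatial.items(), 2):
        for element1, element2 in itertools.product(members1, members2):
            print(f"materialized overlap needed for ({element1}, {element2})")
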
196 def get(self, element: DimensionElement | str) -> DimensionRecordStorage | None:
197 # Docstring inherited from DimensionRecordStorageManager.
198 r = self._records.get(element)
199 if r is None:
200 if (dimension := self.universe.skypix_dimensions.get(element)) is not None: 200 ↛ 202 (the condition on line 200 was never false)
201 return dimension.makeStorage()
202 return r
204 def register(self, element: DimensionElement) -> DimensionRecordStorage:
205 # Docstring inherited from DimensionRecordStorageManager.
206 result = self.get(element)
207 assert result, "All record storage instances should be created in initialize()."
208 return result
210 def save_dimension_group(self, graph: DimensionGroup) -> int:
211 # Docstring inherited from DimensionRecordStorageManager.
212 return self._dimension_group_storage.save(graph)
214 def load_dimension_group(self, key: int) -> DimensionGroup:
215 # Docstring inherited from DimensionRecordStorageManager.
216 return self._dimension_group_storage.load(key)
218 def clearCaches(self) -> None:
219 # Docstring inherited from DimensionRecordStorageManager.
220 for storage in self._records.values():
221 storage.clearCaches()
223 def make_spatial_join_relation(
224 self,
225 element1: str,
226 element2: str,
227 context: queries.SqlQueryContext,
228 governor_constraints: Mapping[str, Set[str]],
229 existing_relationships: Set[frozenset[str]] = frozenset(),
230 ) -> tuple[Relation, bool]:
231 # Docstring inherited.
232 overlap_relationship = frozenset(
233 self.universe[element1].dimensions.names | self.universe[element2].dimensions.names
234 )
235 if overlap_relationship in existing_relationships: 235 ↛ 236 (the condition on line 235 was never true)
236 return context.preferred_engine.make_join_identity_relation(), False
237 storage1 = self[element1]
238 storage2 = self[element2]
239 overlaps: Relation | None = None
240 needs_refinement: bool = False
241 match (storage1, storage2):
242 case [
243 DatabaseDimensionRecordStorage() as db_storage1,
244 DatabaseDimensionRecordStorage() as db_storage2,
245 ]:
246 # Construction guarantees that we only need to try this in one
247 # direction; either both storage objects know about the other
248 # or neither do.
249 overlaps = db_storage1.make_spatial_join_relation(
250 db_storage2.element, context, governor_constraints
251 )
252 if overlaps is None: 252 ↛ 300 (the condition on line 252 was never false)
253 # No direct materialized overlaps; use commonSkyPix as an
254 # intermediary.
255 have_overlap1_already = (
256 frozenset(
257 self.universe[element1].dimensions.names | {self.universe.commonSkyPix.name}
258 )
259 in existing_relationships
260 )
261 have_overlap2_already = (
262 frozenset(
263 self.universe[element2].dimensions.names | {self.universe.commonSkyPix.name}
264 )
265 in existing_relationships
266 )
267 overlap1 = context.preferred_engine.make_join_identity_relation()
268 overlap2 = context.preferred_engine.make_join_identity_relation()
269 if not have_overlap1_already:
270 overlap1 = cast(
271 Relation,
272 db_storage1.make_spatial_join_relation(
273 self.universe.commonSkyPix, context, governor_constraints
274 ),
275 )
276 if not have_overlap2_already:
277 overlap2 = cast(
278 Relation,
279 db_storage2.make_spatial_join_relation(
280 self.universe.commonSkyPix, context, governor_constraints
281 ),
282 )
283 overlaps = overlap1.join(overlap2)
284 if not have_overlap1_already and not have_overlap2_already:
285 # Drop the common skypix ID column from the overlap
286 # relation we return, since we don't want that column
287 # to be mistakenly equated with any other appearance of
288 # that column; this would mangle queries like
289 # "join visit to tract and tract to healpix10" by
290 # incorrectly requiring all visits and healpix10 pixels
291 # to share common skypix pixels, not just tracts.
292 columns = set(overlaps.columns)
293 columns.remove(DimensionKeyColumnTag(self.universe.commonSkyPix.name))
294 overlaps = overlaps.with_only_columns(columns)
295 needs_refinement = True
296 case [DatabaseDimensionRecordStorage() as db_storage, other]: 296 ↛ 297 (the pattern on line 296 never matched)
297 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints)
298 case [other, DatabaseDimensionRecordStorage() as db_storage]: 298 ↛ 300 (the pattern on line 298 never matched)
299 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints)
300 if overlaps is None:
301 # In the future, there's a lot more we could try here:
302 #
303 # - for skypix dimensions, looking for materialized overlaps at
304 # smaller spatial scales (higher levels) and using bit-shifting;
305 #
306 # - for non-skypix dimensions, looking for materialized overlaps
307 # for finer-grained members of the same family, and then
308 # doing SELECT DISTINCT (or even tolerating duplicates) on the
309 # columns we care about (e.g. use patch overlaps to satisfy a
310 # request for tract overlaps).
311 #
312 # It's not obvious that's better than just telling the user to
313 # materialize more overlaps, though.
314 raise MissingSpatialOverlapError(
315 f"No materialized overlaps for spatial join between {element1!r} and {element2!r}."
316 )
317 return overlaps, needs_refinement
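
The short-circuit at the top of this method identifies a spatial relationship by the frozenset union of the two elements' dimension names, so the check is independent of argument order. A small standalone sketch of that bookkeeping (the dimension names below are illustrative; the real sets come from the universe's elements):

    # Relationships assumed to be present already in the query under
    # construction (hypothetical names; the real sets come from
    # DimensionElement.dimensions.names).
    existing_relationships = {
        frozenset({"skymap", "tract", "instrument", "visit"}),
    }

    def already_joined(dims1: set[str], dims2: set[str]) -> bool:
        # A frozenset of the union makes the check hashable and order-independent.
        return frozenset(dims1 | dims2) in existing_relationships

    print(already_joined({"instrument", "visit"}, {"skymap", "tract"}))  # True
    print(already_joined({"skymap", "tract"}, {"instrument", "visit"}))  # True, order does not matter
    print(already_joined({"instrument", "visit"}, {"htm7"}))             # False
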
319 @classmethod
320 def currentVersions(cls) -> list[VersionTuple]:
321 # Docstring inherited from VersionedExtension.
322 return [_VERSION]
325class _DimensionGroupStorage:
326 """Helper object that manages saved DimensionGroup definitions.
328 Should generally be constructed by calling `initialize` instead of invoking
329 the constructor directly.
331 Parameters
332 ----------
333 db : `Database`
334 Interface to the underlying database engine and namespace.
335 idTable : `sqlalchemy.schema.Table`
336 Table that just holds unique IDs for dimension graphs.
337 definitionTable : `sqlalchemy.schema.Table`
338 Table that maps dimension names to the IDs of the dimension graphs to
339 which they belong.
340 universe : `DimensionUniverse`
341 All known dimensions.
342 """
344 def __init__(
345 self,
346 db: Database,
347 idTable: sqlalchemy.schema.Table,
348 definitionTable: sqlalchemy.schema.Table,
349 universe: DimensionUniverse,
350 ):
351 self._db = db
352 self._idTable = idTable
353 self._definitionTable = definitionTable
354 self._universe = universe
355 self._keysByGroup: dict[DimensionGroup, int] = {universe.empty.as_group(): 0}
356 self._groupsByKey: dict[int, DimensionGroup] = {0: universe.empty.as_group()}
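
The constructor seeds both cache dictionaries with the empty dimension group under the reserved key 0, so the empty group never needs a database lookup, and the two dictionaries act as a simple bidirectional map. A reduced sketch of that idea, with strings standing in for DimensionGroup objects:

    # Key 0 is reserved for the empty dimension group in both directions.
    keys_by_group: dict[str, int] = {"<empty>": 0}
    groups_by_key: dict[int, str] = {0: "<empty>"}

    def remember(group: str, key: int) -> None:
        # Keep both directions of the cache in sync when a new definition arrives.
        keys_by_group[group] = key
        groups_by_key[key] = group

    remember("instrument,visit,detector", 1)
    assert groups_by_key[keys_by_group["instrument,visit,detector"]] == "instrument,visit,detector"
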
358 @classmethod
359 def initialize(
360 cls,
361 db: Database,
362 context: StaticTablesContext,
363 *,
364 universe: DimensionUniverse,
365 ) -> _DimensionGroupStorage:
366 """Construct a new instance, including creating tables if necessary.
368 Parameters
369 ----------
370 db : `Database`
371 Interface to the underlying database engine and namespace.
372 context : `StaticTablesContext`
373 Context object obtained from `Database.declareStaticTables`; used
374 to declare any tables that should always be present.
375 universe : `DimensionUniverse`
376 All known dimensions.
378 Returns
379 -------
380 storage : `_DimensionGroupStorage`
381 New instance of this class.
382 """
383 # We need two tables just so we have one where the autoincrement key is
384 # the only primary key column, as is required by (at least) SQLite. In
385 # other databases, we might be able to use a Sequence directly.
386 idTable = context.addTable(
387 "dimension_graph_key",
388 ddl.TableSpec(
389 fields=[
390 ddl.FieldSpec(
391 name="id",
392 dtype=sqlalchemy.BigInteger,
393 autoincrement=True,
394 primaryKey=True,
395 ),
396 ],
397 ),
398 )
399 definitionTable = context.addTable(
400 "dimension_graph_definition",
401 ddl.TableSpec(
402 fields=[
403 ddl.FieldSpec(name="dimension_graph_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
404 ddl.FieldSpec(name="dimension_name", dtype=sqlalchemy.Text, primaryKey=True),
405 ],
406 foreignKeys=[
407 ddl.ForeignKeySpec(
408 "dimension_graph_key",
409 source=("dimension_graph_id",),
410 target=("id",),
411 onDelete="CASCADE",
412 ),
413 ],
414 ),
415 )
416 return cls(db, idTable, definitionTable, universe=universe)
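
The two TableSpec declarations above correspond roughly to the following SQLAlchemy table definitions. This is an orientation sketch only, not the exact DDL that Database.declareStaticTables emits; constraint naming, dialect options, and schema namespacing are omitted:

    import sqlalchemy

    metadata = sqlalchemy.MetaData()

    # One table whose sole primary-key column is the autoincrement ID, which is
    # what (at least) SQLite requires for autoincrement to work.
    dimension_graph_key = sqlalchemy.Table(
        "dimension_graph_key",
        metadata,
        sqlalchemy.Column("id", sqlalchemy.BigInteger, primary_key=True, autoincrement=True),
    )

    # One row per (graph ID, dimension name) pair; the composite primary key
    # keeps each name listed at most once per graph.
    dimension_graph_definition = sqlalchemy.Table(
        "dimension_graph_definition",
        metadata,
        sqlalchemy.Column(
            "dimension_graph_id",
            sqlalchemy.BigInteger,
            sqlalchemy.ForeignKey("dimension_graph_key.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sqlalchemy.Column("dimension_name", sqlalchemy.Text, primary_key=True),
    )
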
418 def refresh(self) -> None:
419 """Refresh the in-memory cache of saved DimensionGraph definitions.
421 This should be done automatically whenever needed, but it can also
422 be called explicitly.
423 """
424 dimensionNamesByKey: dict[int, set[str]] = defaultdict(set)
425 with self._db.query(self._definitionTable.select()) as sql_result:
426 sql_rows = sql_result.mappings().fetchall()
427 for row in sql_rows:
428 key = row[self._definitionTable.columns.dimension_graph_id]
429 dimensionNamesByKey[key].add(row[self._definitionTable.columns.dimension_name])
430 keysByGraph: dict[DimensionGroup, int] = {self._universe.empty.as_group(): 0}
431 graphsByKey: dict[int, DimensionGroup] = {0: self._universe.empty.as_group()}
432 for key, dimensionNames in dimensionNamesByKey.items():
433 graph = DimensionGroup(self._universe, names=dimensionNames)
434 keysByGraph[graph] = key
435 graphsByKey[key] = graph
436 self._groupsByKey = graphsByKey
437 self._keysByGroup = keysByGraph
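
The grouping step in refresh collapses the flat (dimension_graph_id, dimension_name) rows into one name set per key before the two caches are rebuilt. The same step in isolation, with literal tuples standing in for the fetched rows:

    from collections import defaultdict

    # Literal (dimension_graph_id, dimension_name) rows, standing in for the
    # mappings fetched from the dimension_graph_definition table.
    rows = [(1, "instrument"), (1, "visit"), (1, "detector"), (2, "skymap"), (2, "tract")]

    names_by_key: dict[int, set[str]] = defaultdict(set)
    for key, name in rows:
        names_by_key[key].add(name)

    print({k: sorted(v) for k, v in names_by_key.items()})
    # {1: ['detector', 'instrument', 'visit'], 2: ['skymap', 'tract']}
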
439 def save(self, group: DimensionGroup) -> int:
440 """Save a `DimensionGraph` definition to the database, allowing it to
441 be retrieved later via the returned key.
443 Parameters
444 ----------
445 group : `DimensionGroup`
446 Set of dimensions to save.
448 Returns
449 -------
450 key : `int`
451 Integer used as the unique key for this `DimensionGroup` in the
452 database.
453 """
454 key = self._keysByGroup.get(group)
455 if key is not None:
456 return key
457 # Lock tables and then refresh to guard against races where some other
458 # process is trying to register the exact same dimension graph. This
459 # is probably not the most efficient way to do it, but it should be a
460 # rare operation, especially since the short-circuit above will usually
461 # work in long-lived data repositories.
462 with self._db.transaction(lock=[self._idTable, self._definitionTable]):
463 self.refresh()
464 key = self._keysByGroup.get(group)
465 if key is None:
466 (key,) = self._db.insert(self._idTable, {}, returnIds=True) # type: ignore
467 self._db.insert(
468 self._definitionTable,
469 *[{"dimension_graph_id": key, "dimension_name": name} for name in group.required],
470 )
471 self._keysByGroup[group] = key
472 self._groupsByKey[key] = group
473 return key
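
The structure of save is a double-checked registration: consult the in-memory cache first, and only on a miss take the table locks, refresh, and re-check before inserting. The same shape reduced to an in-process dictionary guarded by a threading lock; this is purely illustrative, since the real method relies on database table locks rather than a thread lock:

    import threading

    _cache: dict[frozenset[str], int] = {}
    _lock = threading.Lock()
    _next_key = 1

    def register(names: frozenset[str]) -> int:
        global _next_key
        # Fast path: already known, no locking needed.
        key = _cache.get(names)
        if key is not None:
            return key
        with _lock:
            # Re-check under the lock in case another writer got there first.
            key = _cache.get(names)
            if key is None:
                key = _next_key
                _next_key += 1
                _cache[names] = key
            return key

    print(register(frozenset({"instrument", "visit"})))  # 1
    print(register(frozenset({"instrument", "visit"})))  # 1 again: short-circuit
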
475 def load(self, key: int) -> DimensionGroup:
476 """Retrieve a `DimensionGraph` that was previously saved in the
477 database.
479 Parameters
480 ----------
481 key : `int`
482 Integer used as the unique key for this `DimensionGroup` in the
483 database.
485 Returns
486 -------
487 graph : `DimensionGroup`
488 Retrieved dimension group.
489 """
490 graph = self._groupsByKey.get(key)
491 if graph is None:
492 self.refresh()
493 graph = self._groupsByKey[key]
494 return graph
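
Taken together, the save and load methods give a stable integer handle for a set of dimensions, and the manager's save_dimension_group and load_dimension_group methods shown earlier simply delegate here. A hedged usage sketch, assuming an already-initialized manager (called manager below) and its DimensionUniverse (called universe); both names are placeholders for objects normally reached through a Registry, so the snippet is not runnable on its own:

    # `manager` and `universe` are assumed to exist; see the note above.
    group = DimensionGroup(universe, names={"instrument", "visit", "detector"})
    key = manager.save_dimension_group(group)   # returns the existing key if already saved
    assert manager.load_dimension_group(key) == group
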