Coverage for python/lsst/daf/butler/registry/dimensions/static.py: 93%
159 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:43 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:43 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29import itertools
30from collections import defaultdict
31from collections.abc import Mapping, Set
32from typing import TYPE_CHECKING, cast
34import sqlalchemy
35from lsst.daf.relation import Relation
37from ... import ddl
38from ..._column_tags import DimensionKeyColumnTag
39from ..._named import NamedKeyDict
40from ...dimensions import (
41 DatabaseDimensionElement,
42 DatabaseTopologicalFamily,
43 DimensionElement,
44 DimensionGraph,
45 DimensionUniverse,
46 GovernorDimension,
47 SkyPixDimension,
48)
49from .._exceptions import MissingSpatialOverlapError
50from ..interfaces import (
51 Database,
52 DatabaseDimensionOverlapStorage,
53 DatabaseDimensionRecordStorage,
54 DimensionRecordStorage,
55 DimensionRecordStorageManager,
56 GovernorDimensionRecordStorage,
57 StaticTablesContext,
58 VersionTuple,
59)
61if TYPE_CHECKING:
62 from .. import queries
# Version reported by `StaticDimensionRecordStorageManager.currentVersions`.
# This has to be updated on every schema change.
_VERSION = VersionTuple(6, 0, 2)
69class StaticDimensionRecordStorageManager(DimensionRecordStorageManager):
70 """An implementation of `DimensionRecordStorageManager` for single-layer
71 `Registry` and the base layers of multi-layer `Registry`.
73 This manager creates `DimensionRecordStorage` instances for all elements
74 in the `DimensionUniverse` in its own `initialize` method, as part of
75 static table creation, so it never needs to manage any dynamic registry
76 tables.
78 Parameters
79 ----------
80 db : `Database`
81 Interface to the underlying database engine and namespace.
82 records : `NamedKeyDict`
83 Mapping from `DimensionElement` to `DimensionRecordStorage` for that
84 element.
85 overlaps : `list` [ `DatabaseDimensionOverlapStorage` ]
86 Objects that manage materialized overlaps between database-backed
87 dimensions.
88 dimensionGraphStorage : `_DimensionGraphStorage`
89 Object that manages saved `DimensionGraph` definitions.
90 universe : `DimensionUniverse`
91 All known dimensions.
92 """
94 def __init__(
95 self,
96 db: Database,
97 *,
98 records: NamedKeyDict[DimensionElement, DimensionRecordStorage],
99 overlaps: dict[
100 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage
101 ],
102 dimensionGraphStorage: _DimensionGraphStorage,
103 universe: DimensionUniverse,
104 registry_schema_version: VersionTuple | None = None,
105 ):
106 super().__init__(universe=universe, registry_schema_version=registry_schema_version)
107 self._db = db
108 self._records = records
109 self._overlaps = overlaps
110 self._dimensionGraphStorage = dimensionGraphStorage
112 @classmethod
113 def initialize(
114 cls,
115 db: Database,
116 context: StaticTablesContext,
117 *,
118 universe: DimensionUniverse,
119 registry_schema_version: VersionTuple | None = None,
120 ) -> DimensionRecordStorageManager:
121 # Docstring inherited from DimensionRecordStorageManager.
122 # Start by initializing governor dimensions; those go both in the main
123 # 'records' mapping we'll pass to init, and a local dictionary that we
124 # can pass in when initializing storage for DatabaseDimensionElements.
125 governors = NamedKeyDict[GovernorDimension, GovernorDimensionRecordStorage]()
126 records = NamedKeyDict[DimensionElement, DimensionRecordStorage]()
127 for dimension in universe.getGovernorDimensions():
128 governorStorage = dimension.makeStorage(db, context=context)
129 governors[dimension] = governorStorage
130 records[dimension] = governorStorage
131 # Next we initialize storage for DatabaseDimensionElements. Some
132 # elements' storage may be views into anothers; we'll do a first pass
133 # to gather a mapping from the names of those targets back to their
134 # views.
135 view_targets = {
136 element.viewOf: element
137 for element in universe.getDatabaseElements()
138 if element.viewOf is not None
139 }
140 # We remember the spatial ones (grouped by family) so we can go back
141 # and initialize overlap storage for them later.
142 spatial = NamedKeyDict[DatabaseTopologicalFamily, list[DatabaseDimensionRecordStorage]]()
143 for element in universe.getDatabaseElements():
144 if element.viewOf is not None:
145 # We'll initialize this storage when the view's target is
146 # initialized.
147 continue
148 elementStorage = element.makeStorage(db, context=context, governors=governors)
149 records[element] = elementStorage
150 if element.spatial is not None:
151 spatial.setdefault(element.spatial, []).append(elementStorage)
152 if (view_element := view_targets.get(element.name)) is not None:
153 view_element_storage = view_element.makeStorage(
154 db,
155 context=context,
156 governors=governors,
157 view_target=elementStorage,
158 )
159 records[view_element] = view_element_storage
160 if view_element.spatial is not None: 160 ↛ 161line 160 didn't jump to line 161, because the condition on line 160 was never true
161 spatial.setdefault(view_element.spatial, []).append(view_element_storage)
163 # Finally we initialize overlap storage. The implementation class for
164 # this is currently hard-coded (it's not obvious there will ever be
165 # others). Note that overlaps between database-backed dimensions and
166 # skypix dimensions is internal to `DatabaseDimensionRecordStorage`,
167 # and hence is not included here.
168 from ..dimensions.overlaps import CrossFamilyDimensionOverlapStorage
170 overlaps: dict[
171 tuple[DatabaseDimensionElement, DatabaseDimensionElement], DatabaseDimensionOverlapStorage
172 ] = {}
173 for (family1, storages1), (family2, storages2) in itertools.combinations(spatial.items(), 2):
174 for elementStoragePair in itertools.product(storages1, storages2):
175 governorStoragePair = (governors[family1.governor], governors[family2.governor])
176 if elementStoragePair[0].element > elementStoragePair[1].element: 176 ↛ 177line 176 didn't jump to line 177, because the condition on line 176 was never true
177 elementStoragePair = (elementStoragePair[1], elementStoragePair[0])
178 governorStoragePair = (governorStoragePair[1], governorStoragePair[1])
179 overlapStorage = CrossFamilyDimensionOverlapStorage.initialize(
180 db,
181 elementStoragePair,
182 governorStoragePair,
183 context=context,
184 )
185 elementStoragePair[0].connect(overlapStorage)
186 elementStoragePair[1].connect(overlapStorage)
187 overlaps[overlapStorage.elements] = overlapStorage
188 # Create table that stores DimensionGraph definitions.
189 dimensionGraphStorage = _DimensionGraphStorage.initialize(db, context, universe=universe)
190 return cls(
191 db=db,
192 records=records,
193 universe=universe,
194 overlaps=overlaps,
195 dimensionGraphStorage=dimensionGraphStorage,
196 registry_schema_version=registry_schema_version,
197 )
199 def get(self, element: DimensionElement | str) -> DimensionRecordStorage | None:
200 # Docstring inherited from DimensionRecordStorageManager.
201 r = self._records.get(element)
202 if r is None:
203 if isinstance(element, str):
204 element = self.universe[element]
205 if isinstance(element, SkyPixDimension): 205 ↛ 207line 205 didn't jump to line 207, because the condition on line 205 was never false
206 return self.universe.skypix[element.system][element.level].makeStorage()
207 return r
209 def register(self, element: DimensionElement) -> DimensionRecordStorage:
210 # Docstring inherited from DimensionRecordStorageManager.
211 result = self.get(element)
212 assert result, "All records instances should be created in initialize()."
213 return result
215 def saveDimensionGraph(self, graph: DimensionGraph) -> int:
216 # Docstring inherited from DimensionRecordStorageManager.
217 return self._dimensionGraphStorage.save(graph)
219 def loadDimensionGraph(self, key: int) -> DimensionGraph:
220 # Docstring inherited from DimensionRecordStorageManager.
221 return self._dimensionGraphStorage.load(key)
223 def clearCaches(self) -> None:
224 # Docstring inherited from DimensionRecordStorageManager.
225 for storage in self._records.values():
226 storage.clearCaches()
228 def make_spatial_join_relation(
229 self,
230 element1: str,
231 element2: str,
232 context: queries.SqlQueryContext,
233 governor_constraints: Mapping[str, Set[str]],
234 existing_relationships: Set[frozenset[str]] = frozenset(),
235 ) -> tuple[Relation, bool]:
236 # Docstring inherited.
237 overlap_relationship = frozenset(
238 self.universe[element1].dimensions.names | self.universe[element2].dimensions.names
239 )
240 if overlap_relationship in existing_relationships: 240 ↛ 241line 240 didn't jump to line 241, because the condition on line 240 was never true
241 return context.preferred_engine.make_join_identity_relation(), False
242 storage1 = self[element1]
243 storage2 = self[element2]
244 overlaps: Relation | None = None
245 needs_refinement: bool = False
246 match (storage1, storage2):
247 case [
248 DatabaseDimensionRecordStorage() as db_storage1,
249 DatabaseDimensionRecordStorage() as db_storage2,
250 ]:
251 # Construction guarantees that we only need to try this in one
252 # direction; either both storage objects know about the other
253 # or neither do.
254 overlaps = db_storage1.make_spatial_join_relation(
255 db_storage2.element, context, governor_constraints
256 )
257 if overlaps is None: 257 ↛ 305line 257 didn't jump to line 305, because the condition on line 257 was never false
258 # No direct materialized overlaps; use commonSkyPix as an
259 # intermediary.
260 have_overlap1_already = (
261 frozenset(
262 self.universe[element1].dimensions.names | {self.universe.commonSkyPix.name}
263 )
264 in existing_relationships
265 )
266 have_overlap2_already = (
267 frozenset(
268 self.universe[element2].dimensions.names | {self.universe.commonSkyPix.name}
269 )
270 in existing_relationships
271 )
272 overlap1 = context.preferred_engine.make_join_identity_relation()
273 overlap2 = context.preferred_engine.make_join_identity_relation()
274 if not have_overlap1_already:
275 overlap1 = cast(
276 Relation,
277 db_storage1.make_spatial_join_relation(
278 self.universe.commonSkyPix, context, governor_constraints
279 ),
280 )
281 if not have_overlap2_already:
282 overlap2 = cast(
283 Relation,
284 db_storage2.make_spatial_join_relation(
285 self.universe.commonSkyPix, context, governor_constraints
286 ),
287 )
288 overlaps = overlap1.join(overlap2)
289 if not have_overlap1_already and not have_overlap2_already:
290 # Drop the common skypix ID column from the overlap
291 # relation we return, since we don't want that column
292 # to be mistakenly equated with any other appearance of
293 # that column, since this would mangle queries like
294 # "join visit to tract and tract to healpix10", by
295 # incorrectly requiring all visits and healpix10 pixels
296 # share common skypix pixels, not just tracts.
297 columns = set(overlaps.columns)
298 columns.remove(DimensionKeyColumnTag(self.universe.commonSkyPix.name))
299 overlaps = overlaps.with_only_columns(columns)
300 needs_refinement = True
301 case [DatabaseDimensionRecordStorage() as db_storage, other]: 301 ↛ 302line 301 didn't jump to line 302, because the pattern on line 301 never matched
302 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints)
303 case [other, DatabaseDimensionRecordStorage() as db_storage]: 303 ↛ 305line 303 didn't jump to line 305, because the pattern on line 303 never matched
304 overlaps = db_storage.make_spatial_join_relation(other.element, context, governor_constraints)
305 if overlaps is None:
306 # In the future, there's a lot more we could try here:
307 #
308 # - for skypix dimensions, looking for materialized overlaps at
309 # smaller spatial scales (higher-levels) and using bit-shifting;
310 #
311 # - for non-skypix dimensions, looking for materialized overlaps
312 # for more finer-grained members of the same family, and then
313 # doing SELECT DISTINCT (or even tolerating duplicates) on the
314 # columns we care about (e.g. use patch overlaps to satisfy a
315 # request for tract overlaps).
316 #
317 # It's not obvious that's better than just telling the user to
318 # materialize more overlaps, though.
319 raise MissingSpatialOverlapError(
320 f"No materialized overlaps for spatial join between {element1!r} and {element2!r}."
321 )
322 return overlaps, needs_refinement
324 @classmethod
325 def currentVersions(cls) -> list[VersionTuple]:
326 # Docstring inherited from VersionedExtension.
327 return [_VERSION]
class _DimensionGraphStorage:
    """Persistence helper for `DimensionGraph` definitions.

    Instances keep an in-memory two-way mapping between graphs and integer
    keys, backed by two database tables.  Use `initialize` rather than the
    constructor in most cases.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    idTable : `sqlalchemy.schema.Table`
        Table that just holds unique IDs for dimension graphs.
    definitionTable : `sqlalchemy.schema.Table`
        Table that maps dimension names to the IDs of the dimension graphs to
        which they belong.
    universe : `DimensionUniverse`
        All known dimensions.
    """

    def __init__(
        self,
        db: Database,
        idTable: sqlalchemy.schema.Table,
        definitionTable: sqlalchemy.schema.Table,
        universe: DimensionUniverse,
    ):
        self._db = db
        self._idTable = idTable
        self._definitionTable = definitionTable
        self._universe = universe
        # The empty graph is always present under key 0, without needing a
        # database row.
        self._keysByGraph: dict[DimensionGraph, int] = {universe.empty: 0}
        self._graphsByKey: dict[int, DimensionGraph] = {0: universe.empty}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        universe: DimensionUniverse,
    ) -> _DimensionGraphStorage:
        """Declare the backing tables and build a new instance.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        universe : `DimensionUniverse`
            All known dimensions.

        Returns
        -------
        storage : `_DimensionGraphStorage`
            New instance of this class.
        """
        # Two tables are needed so that one of them has the autoincrement key
        # as its only primary key column, as is required by (at least) SQLite.
        # In other databases, we might be able to use a Sequence directly.
        key_spec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(
                    name="id",
                    dtype=sqlalchemy.BigInteger,
                    autoincrement=True,
                    primaryKey=True,
                ),
            ],
        )
        definition_spec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(name="dimension_graph_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                ddl.FieldSpec(name="dimension_name", dtype=sqlalchemy.Text, primaryKey=True),
            ],
            foreignKeys=[
                ddl.ForeignKeySpec(
                    "dimension_graph_key",
                    source=("dimension_graph_id",),
                    target=("id",),
                    onDelete="CASCADE",
                ),
            ],
        )
        key_table = context.addTable("dimension_graph_key", key_spec)
        definition_table = context.addTable("dimension_graph_definition", definition_spec)
        return cls(db, key_table, definition_table, universe=universe)

    def refresh(self) -> None:
        """Rebuild the in-memory cache from the definition table.

        This is invoked by `save` and `load` when they need it, but it can
        also be called explicitly.
        """
        with self._db.query(self._definitionTable.select()) as sql_result:
            fetched = sql_result.mappings().fetchall()
        # Group the dimension names by the graph key they belong to.
        names_by_key: dict[int, set[str]] = defaultdict(set)
        for record in fetched:
            graph_key = record[self._definitionTable.columns.dimension_graph_id]
            names_by_key[graph_key].add(record[self._definitionTable.columns.dimension_name])
        # Build fresh mappings, seeded with the empty graph, and only swap
        # them in once they are complete.
        fresh_keys: dict[DimensionGraph, int] = {self._universe.empty: 0}
        fresh_graphs: dict[int, DimensionGraph] = {0: self._universe.empty}
        for graph_key, names in names_by_key.items():
            rebuilt = DimensionGraph(self._universe, names=names)
            fresh_keys[rebuilt] = graph_key
            fresh_graphs[graph_key] = rebuilt
        self._graphsByKey = fresh_graphs
        self._keysByGraph = fresh_keys

    def save(self, graph: DimensionGraph) -> int:
        """Store a `DimensionGraph` definition and return its key.

        Parameters
        ----------
        graph : `DimensionGraph`
            Set of dimensions to save.

        Returns
        -------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.
        """
        cached = self._keysByGraph.get(graph)
        if cached is not None:
            return cached
        # Lock tables and then refresh to guard against races where some other
        # process is trying to register the exact same dimension graph.  This
        # is probably not the most efficient way to do it, but it should be a
        # rare operation, especially since the short-circuit above will usually
        # work in long-lived data repositories.
        locked = [self._idTable, self._definitionTable]
        with self._db.transaction(lock=locked):
            self.refresh()
            key = self._keysByGraph.get(graph)
            if key is None:
                (key,) = self._db.insert(self._idTable, {}, returnIds=True)  # type: ignore
                # Only the required dimension names are written.
                definition_rows = [
                    {"dimension_graph_id": key, "dimension_name": name} for name in graph.required.names
                ]
                self._db.insert(self._definitionTable, *definition_rows)
            self._keysByGraph[graph] = key
            self._graphsByKey[key] = graph
        return key

    def load(self, key: int) -> DimensionGraph:
        """Return the `DimensionGraph` previously saved under ``key``.

        Parameters
        ----------
        key : `int`
            Integer used as the unique key for this `DimensionGraph` in the
            database.

        Returns
        -------
        graph : `DimensionGraph`
            Retrieved graph.
        """
        cached = self._graphsByKey.get(key)
        if cached is not None:
            return cached
        # Cache miss: reload from the database and look again.
        self.refresh()
        return self._graphsByKey[key]