Coverage for python/lsst/daf/butler/registry/dimensions/static.py: 95%
417 statements
coverage.py v7.4.4, created at 2024-04-19 10:52 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29import dataclasses
30import itertools
31import logging
32from collections import defaultdict
33from collections.abc import Iterable, Mapping, Sequence, Set
34from typing import TYPE_CHECKING, Any
36import sqlalchemy
37from lsst.daf.relation import Calculation, ColumnExpression, Join, Relation, sql
38from lsst.sphgeom import Region
40from ... import ddl
41from ..._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag
42from ..._column_type_info import LogicalColumn
43from ..._named import NamedKeyDict
44from ...dimensions import (
45 DatabaseDimensionElement,
46 DatabaseTopologicalFamily,
47 DataCoordinate,
48 Dimension,
49 DimensionElement,
50 DimensionGroup,
51 DimensionRecord,
52 DimensionRecordSet,
53 DimensionUniverse,
54 SkyPixDimension,
55 addDimensionForeignKey,
56)
57from ...dimensions.record_cache import DimensionRecordCache
58from ...direct_query_driver import QueryBuilder, QueryJoiner # Future query system (direct,server).
59from ...queries import tree as qt # Future query system (direct,client,server)
60from ...queries.overlaps import OverlapsVisitor
61from ...queries.visitors import PredicateVisitFlags
62from .._exceptions import MissingSpatialOverlapError
63from ..interfaces import Database, DimensionRecordStorageManager, StaticTablesContext, VersionTuple
65if TYPE_CHECKING:
66 from .. import queries # Current Registry.query* system.
69# This has to be updated on every schema change
70_VERSION = VersionTuple(6, 0, 2)
72_LOG = logging.getLogger(__name__)
75class StaticDimensionRecordStorageManager(DimensionRecordStorageManager):
76 """An implementation of `DimensionRecordStorageManager` for single-layer
77 `Registry` and the base layers of multi-layer `Registry`.
79 This manager creates `DimensionRecordStorage` instances for all elements
80 in the `DimensionUniverse` in its own `initialize` method, as part of
81 static table creation, so it never needs to manage any dynamic registry
82 tables.
84 Parameters
85 ----------
86 db : `Database`
87 Interface to the underlying database engine and namespace.
88 tables : `dict` [ `str`, `sqlalchemy.Table` ]
89 Mapping from dimension element name to SQL table, for all elements that
90 have `DimensionElement.has_own_table` `True`.
91 overlap_tables : `dict` [ `str`, `tuple` [ `sqlalchemy.Table`, \
92 `sqlalchemy.Table` ] ]
93        Mapping from dimension element name to a (summary, overlap) pair of SQL
94        tables holding overlaps between the common skypix dimension and that
95        element, for all elements that have `DimensionElement.has_own_table`
96        `True` and `DimensionElement.spatial` not `None`.
97 dimension_group_storage : `_DimensionGroupStorage`
98 Object that manages saved `DimensionGroup` definitions.
99 universe : `DimensionUniverse`
100 All known dimensions.
101 registry_schema_version : `VersionTuple` or `None`, optional
102 Version of registry schema.
103 """
105 def __init__(
106 self,
107 db: Database,
108 *,
109 tables: dict[str, sqlalchemy.Table],
110 overlap_tables: dict[str, tuple[sqlalchemy.Table, sqlalchemy.Table]],
111 dimension_group_storage: _DimensionGroupStorage,
112 universe: DimensionUniverse,
113 registry_schema_version: VersionTuple | None = None,
114 ):
115 super().__init__(universe=universe, registry_schema_version=registry_schema_version)
116 self._db = db
117 self._tables = tables
118 self._overlap_tables = overlap_tables
119 self._dimension_group_storage = dimension_group_storage
121 def clone(self, db: Database) -> StaticDimensionRecordStorageManager:
122 return StaticDimensionRecordStorageManager(
123 db,
124 tables=self._tables,
125 overlap_tables=self._overlap_tables,
126 dimension_group_storage=self._dimension_group_storage.clone(db),
127 universe=self.universe,
128 registry_schema_version=self._registry_schema_version,
129 )
131 @classmethod
132 def initialize(
133 cls,
134 db: Database,
135 context: StaticTablesContext,
136 *,
137 universe: DimensionUniverse,
138 registry_schema_version: VersionTuple | None = None,
139 ) -> DimensionRecordStorageManager:
140 # Docstring inherited from DimensionRecordStorageManager.
141 tables: dict[str, sqlalchemy.Table] = {}
142 # Define tables for governor dimensions, which are never spatial or
143 # temporal and always have tables.
144 for dimension in universe.governor_dimensions:
145 spec = dimension.RecordClass.fields.makeTableSpec(
146 TimespanReprClass=db.getTimespanRepresentation()
147 )
148 tables[dimension.name] = context.addTable(dimension.name, spec)
149 # Define tables for database dimension elements, which may or may not
150 # have their own tables and may be spatial or temporal.
151 spatial = NamedKeyDict[DatabaseTopologicalFamily, list[DimensionElement]]()
152 overlap_tables: dict[str, tuple[sqlalchemy.Table, sqlalchemy.Table]] = {}
153 for element in universe.database_elements:
154 if not element.has_own_table:
155 continue
156 spec = element.RecordClass.fields.makeTableSpec(TimespanReprClass=db.getTimespanRepresentation())
157 tables[element.name] = context.addTable(element.name, spec)
158 if element.spatial is not None:
159 spatial.setdefault(element.spatial, []).append(element)
160 overlap_tables[element.name] = cls._make_skypix_overlap_tables(context, element)
161 for field_name in spec.fields.names:
162                if (
163 len(qt.ColumnSet.get_qualified_name(element.name, field_name))
164 >= db.dialect.max_identifier_length
165 ):
166 # Being able to assume that all dimension fields fit inside
167 # the DB's identifier limit is really convenient and very
168 # unlikely to cause trouble in practice. We'll just make
169 # sure we catch any such trouble as early as possible.
170 raise RuntimeError(
171                        f"Dimension field '{element.name}.{field_name}' is too long for this database. "
172 "Please file a ticket for long-field support if this was not a mistake."
173 )
174 # Add some tables for materialized overlaps between database
175 # dimensions. We've never used these and no longer plan to, but we
176 # have to keep creating them to keep schema versioning consistent.
177 cls._make_legacy_overlap_tables(context, spatial)
178        # Create tables that store DimensionGroup definitions.
179 dimension_group_storage = _DimensionGroupStorage.initialize(db, context, universe=universe)
180 return cls(
181 db=db,
182 tables=tables,
183 overlap_tables=overlap_tables,
184 universe=universe,
185 dimension_group_storage=dimension_group_storage,
186 registry_schema_version=registry_schema_version,
187 )
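    # Illustrative usage sketch (not part of this module; the surrounding calls
    # are assumptions about how a registry layer wires things together): the
    # manager is normally constructed while static tables are being declared,
    # e.g.
    #
    #     with db.declareStaticTables(create=True) as context:
    #         manager = StaticDimensionRecordStorageManager.initialize(
    #             db, context, universe=universe
    #         )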
189 def fetch_cache_dict(self) -> dict[str, DimensionRecordSet]:
190 # Docstring inherited.
191 result: dict[str, DimensionRecordSet] = {}
192 with self._db.transaction():
193 for element in self.universe.elements:
194 if not element.is_cached:
195 continue
196 assert not element.temporal, (
197 "Cached dimension elements should not be spatial or temporal, as that "
198 "suggests a large number of records."
199 )
200 if element.implied_union_target is not None:
201 assert isinstance(element, Dimension), "Only dimensions can be implied dependencies."
202 table = self._tables[element.implied_union_target.name]
203 sql = sqlalchemy.select(
204 table.columns[element.name].label(element.primary_key.name)
205 ).distinct()
206 else:
207 table = self._tables[element.name]
208 sql = table.select()
209 with self._db.query(sql) as results:
210 result[element.name] = DimensionRecordSet(
211 element=element,
212 records=[element.RecordClass(**row) for row in results.mappings()],
213 )
214 return result
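    # Hedged sketch of the SQL built above for an implied-union element.  In
    # the default universe "band" has no table of its own and is defined by the
    # union of the ``band`` column of ``physical_filter``, so its cache query
    # is roughly (names illustrative of that configuration):
    #
    #     SELECT DISTINCT physical_filter.band AS name FROM physical_filter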
216 def insert(
217 self,
218 element: DimensionElement,
219 *records: DimensionRecord,
220 replace: bool = False,
221 skip_existing: bool = False,
222 ) -> None:
223 # Docstring inherited.
224 if not element.has_own_table:
225 raise TypeError(f"Cannot insert {element.name} records.")
226 db_rows = self._make_record_db_rows(element, records, replace=replace)
227 table = self._tables[element.name]
228 with self._db.transaction():
229 if replace:
230 self._db.replace(table, *db_rows.main_rows)
231 elif skip_existing:
232 self._db.ensure(table, *db_rows.main_rows, primary_key_only=True)
233 else:
234 self._db.insert(table, *db_rows.main_rows)
235 self._insert_overlaps(
236 element, db_rows.overlap_insert_rows, db_rows.overlap_delete_rows, skip_existing=skip_existing
237 )
238 for related_element_name, summary_rows in db_rows.overlap_summary_rows.items():
239 self._db.ensure(self._overlap_tables[related_element_name][0], *summary_rows)
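    # Caller-side sketch (record values are illustrative assumptions, not taken
    # from a real repository):
    #
    #     detector = universe["detector"]
    #     manager.insert(
    #         detector,
    #         detector.RecordClass(instrument="DummyCam", id=1, full_name="D1"),
    #         skip_existing=True,
    #     )
    #
    # ``replace=True`` rewrites matching rows (and recomputes overlap rows for
    # spatial elements); ``skip_existing=True`` silently keeps existing rows.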
241 def sync(self, record: DimensionRecord, update: bool = False) -> bool | dict[str, Any]:
242 # Docstring inherited.
243        if not record.definition.has_own_table:
244 raise TypeError(f"Cannot sync {record.definition.name} records.")
245 # We might not need the overlap rows at all; we won't know until we try
246 # to insert the main row. But we figure it's better to spend the time
247 # to compute them in advance always *outside* the database transaction
248 # than to compute them only as-needed inside the database transaction,
249 # since in-transaction time is especially precious.
250 db_rows = self._make_record_db_rows(record.definition, [record], replace=True)
251 (compared,) = db_rows.main_rows
252 keys = {}
253 for name in record.fields.required.names:
254 keys[name] = compared.pop(name)
255 with self._db.transaction():
256 _, inserted_or_updated = self._db.sync(
257 self._tables[record.definition.name],
258 keys=keys,
259 compared=compared,
260 update=update,
261 )
262 if inserted_or_updated:
263 if inserted_or_updated is True:
264 # Inserted a new row, so we just need to insert new
265 # overlap rows (if there are any).
266 self._insert_overlaps(
267 record.definition, db_rows.overlap_insert_rows, overlap_delete_rows=[]
268 )
269                elif "region" in inserted_or_updated:
270 # Updated the region, so we need to delete old overlap
271 # rows and insert new ones.
272 self._insert_overlaps(
273 record.definition, db_rows.overlap_insert_rows, db_rows.overlap_delete_rows
274 )
275 for related_element_name, summary_rows in db_rows.overlap_summary_rows.items():
276 self._db.ensure(self._overlap_tables[related_element_name][0], *summary_rows)
277 return inserted_or_updated
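    # Interpretation of the return value, as used above: `True` means a new row
    # was inserted (so only new overlap rows are needed), a `dict` names the
    # updated columns (old overlap rows are deleted and re-inserted when it
    # includes "region"), and a false value means nothing changed.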
279 def fetch_one(
280 self,
281 element_name: str,
282 data_id: DataCoordinate,
283 cache: DimensionRecordCache,
284 ) -> DimensionRecord | None:
285 # Docstring inherited.
286 element = self.universe[element_name]
287 if element_name in cache:
288 try:
289 return cache[element_name].find(data_id)
290 except LookupError:
291 return None
292        if element.implied_union_target is not None:
293 assert isinstance(element, Dimension), "Only dimensions can be implied dependencies."
294 table = self._tables[element.implied_union_target.name]
295 sql = sqlalchemy.select(table.columns[element.name].label(element.primary_key.name)).where(
296 table.columns[element_name] == data_id[element_name]
297 )
298 elif isinstance(element, SkyPixDimension):
299 id = data_id[element_name]
300 return element.RecordClass(id=id, region=element.pixelization.pixel(id))
301 else:
302 table = self._tables[element.name]
303 sql = table.select().where(
304 *[
305 table.columns[column_name] == data_id[dimension_name]
306 for column_name, dimension_name in zip(
307 element.schema.required.names, element.required.names
308 )
309 ]
310 )
311 with self._db.query(sql) as results:
312 row = results.fetchone()
313 if row is None:
314 return None
315 mapping: Mapping
316 if element.temporal is not None:
317 mapping = dict(**row._mapping)
318 timespan = self._db.getTimespanRepresentation().extract(mapping)
319 for name in self._db.getTimespanRepresentation().getFieldNames():
320 del mapping[name]
321 mapping["timespan"] = timespan
322 else:
323 mapping = row._mapping
324 return element.RecordClass(**mapping)
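    # Note on the skypix branch above: skypix records are never stored in the
    # database; they are synthesized from the pixelization, e.g. (illustrative,
    # assuming an "htm7" dimension exists in the universe):
    #
    #     htm7 = universe["htm7"]
    #     record = htm7.RecordClass(id=pixel_id, region=htm7.pixelization.pixel(pixel_id))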
326 def save_dimension_group(self, graph: DimensionGroup) -> int:
327 # Docstring inherited from DimensionRecordStorageManager.
328 return self._dimension_group_storage.save(graph)
330 def load_dimension_group(self, key: int) -> DimensionGroup:
331 # Docstring inherited from DimensionRecordStorageManager.
332 return self._dimension_group_storage.load(key)
334 def join(
335 self,
336 element_name: str,
337 target: Relation,
338 join: Join,
339 context: queries.SqlQueryContext,
340 ) -> Relation:
341 # Docstring inherited.
342 element = self.universe[element_name]
343 # We use Join.partial(...).apply(...) instead of Join.apply(..., ...)
344 # for the "backtracking" insertion capabilities of the former; more
345 # specifically, if `target` is a tree that starts with SQL relations
346 # and ends with iteration-engine operations (e.g. region-overlap
347 # postprocessing), this will try to perform the join upstream in the
348 # SQL engine before the transfer to iteration.
349 if element.has_own_table:
350 return join.partial(self._make_relation(element, context)).apply(target)
351 elif element.implied_union_target is not None:
352 columns = DimensionKeyColumnTag(element.name)
353 return join.partial(
354 self._make_relation(element.implied_union_target, context)
355 .with_only_columns(
356 {columns},
357 preferred_engine=context.preferred_engine,
358 require_preferred_engine=True,
359 )
360 .without_duplicates()
361 ).apply(target)
362 elif isinstance(element, SkyPixDimension):
363 assert join.predicate.as_trivial(), "Expected trivial join predicate for skypix relation."
364 id_column = DimensionKeyColumnTag(element.name)
365 assert id_column in target.columns, "Guaranteed by QueryBuilder.make_dimension_target."
366 function_name = f"{element.name}_region"
367 context.iteration_engine.functions[function_name] = element.pixelization.pixel
368 calculation = Calculation(
369 tag=DimensionRecordColumnTag(element.name, "region"),
370 expression=ColumnExpression.function(function_name, ColumnExpression.reference(id_column)),
371 )
372 return calculation.apply(
373 target, preferred_engine=context.iteration_engine, transfer=True, backtrack=True
374 )
375 else:
376 raise AssertionError(f"Unexpected definition of {element_name!r}.")
378 def make_spatial_join_relation(
379 self,
380 element1: str,
381 element2: str,
382 context: queries.SqlQueryContext,
383 existing_relationships: Set[frozenset[str]] = frozenset(),
384 ) -> tuple[Relation, bool]:
385 # Docstring inherited.
386 overlap_relationship = frozenset(
387 self.universe[element1].dimensions.names | self.universe[element2].dimensions.names
388 )
389        if overlap_relationship in existing_relationships:
390 return context.preferred_engine.make_join_identity_relation(), False
391 overlaps: Relation | None = None
392 needs_refinement: bool = False
393 if element1 == self.universe.commonSkyPix.name:
394 (element1, element2) = (element2, element1)
396 if element1 in self._overlap_tables:
397 if element2 in self._overlap_tables:
398 # Use commonSkyPix as an intermediary with post-query
399 # refinement.
400 have_overlap1_already = (
401 frozenset(self.universe[element1].dimensions.names | {self.universe.commonSkyPix.name})
402 in existing_relationships
403 )
404 have_overlap2_already = (
405 frozenset(self.universe[element2].dimensions.names | {self.universe.commonSkyPix.name})
406 in existing_relationships
407 )
408 overlap1 = context.preferred_engine.make_join_identity_relation()
409 overlap2 = context.preferred_engine.make_join_identity_relation()
410 if not have_overlap1_already:
411 overlap1 = self._make_common_skypix_join_relation(self.universe[element1], context)
412 if not have_overlap2_already:
413 overlap2 = self._make_common_skypix_join_relation(self.universe[element2], context)
414 overlaps = overlap1.join(overlap2)
415 if not have_overlap1_already and not have_overlap2_already:
416 # Drop the common skypix ID column from the overlap
417 # relation we return, since we don't want that column
418 # to be mistakenly equated with any other appearance of
419                    # that column, as this would mangle queries like
420 # "join visit to tract and tract to healpix10", by
421 # incorrectly requiring all visits and healpix10 pixels
422                    # to share common skypix pixels, not just tracts.
423 columns = set(overlaps.columns)
424 columns.remove(DimensionKeyColumnTag(self.universe.commonSkyPix.name))
425 overlaps = overlaps.with_only_columns(columns)
426 needs_refinement = True
427        elif element2 == self.universe.commonSkyPix.name:
428 overlaps = self._make_common_skypix_join_relation(self.universe[element1], context)
429 if overlaps is None:
430 # In the future, there's a lot more we could try here:
431 #
432 # - for skypix dimensions, looking for materialized overlaps at
433            #   smaller spatial scales (higher levels) and using bit-shifting;
434 #
435 # - for non-skypix dimensions, looking for materialized overlaps
436            #   for finer-grained members of the same family, and then
437 # doing SELECT DISTINCT (or even tolerating duplicates) on the
438 # columns we care about (e.g. use patch overlaps to satisfy a
439 # request for tract overlaps).
440 #
441 # It's not obvious that's better than just telling the user to
442 # materialize more overlaps, though.
443 raise MissingSpatialOverlapError(
444 f"No materialized overlaps for spatial join between {element1!r} and {element2!r}."
445 )
446 return overlaps, needs_refinement
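    # Hedged sketch of the SQL this yields for a visit-tract join in the
    # default universe (identifiers illustrative; the real query is assembled
    # through the relation system above):
    #
    #     SELECT v.instrument, v.visit, t.skymap, t.tract
    #     FROM visit_skypix_overlap v
    #     JOIN tract_skypix_overlap t ON v.skypix_index = t.skypix_index
    #     WHERE v.skypix_system = 'htm' AND v.skypix_level = 7
    #       AND t.skypix_system = 'htm' AND t.skypix_level = 7
    #
    # with ``needs_refinement=True`` telling the caller to re-test the regions
    # afterwards, because sharing a common skypix pixel is necessary but not
    # sufficient for a true overlap.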
448 def make_query_joiner(self, element: DimensionElement, fields: Set[str]) -> QueryJoiner:
449 if element.implied_union_target is not None:
450 assert not fields, "Dimensions with implied-union storage never have fields."
451 return QueryBuilder(
452 self.make_query_joiner(element.implied_union_target, fields),
453 columns=qt.ColumnSet(element.minimal_group).drop_implied_dimension_keys(),
454 distinct=True,
455 ).to_joiner()
456 if not element.has_own_table:
457 raise NotImplementedError(f"Cannot join dimension element {element} with no table.")
458 table = self._tables[element.name]
459 result = QueryJoiner(self._db, table)
460 for dimension_name, column_name in zip(element.required.names, element.schema.required.names):
461 result.dimension_keys[dimension_name].append(table.columns[column_name])
462 result.extract_dimensions(element.implied.names)
463 for field in fields:
464 if field == "timespan":
465 result.timespans[element.name] = self._db.getTimespanRepresentation().from_columns(
466 table.columns
467 )
468 else:
469 result.fields[element.name][field] = table.columns[field]
470 return result
472 def process_query_overlaps(
473 self,
474 dimensions: DimensionGroup,
475 predicate: qt.Predicate,
476 join_operands: Iterable[DimensionGroup],
477 ) -> tuple[qt.Predicate, QueryBuilder]:
478 overlaps_visitor = _CommonSkyPixMediatedOverlapsVisitor(self._db, dimensions, self._overlap_tables)
479 new_predicate = overlaps_visitor.run(predicate, join_operands)
480 return new_predicate, overlaps_visitor.builder
482 def _make_relation(
483 self,
484 element: DimensionElement,
485 context: queries.SqlQueryContext,
486 ) -> Relation:
487 table = self._tables[element.name]
488 payload = sql.Payload[LogicalColumn](table)
489 for tag, field_name in element.RecordClass.fields.columns.items():
490 if field_name == "timespan":
491 payload.columns_available[tag] = self._db.getTimespanRepresentation().from_columns(
492 table.columns, name=field_name
493 )
494 else:
495 payload.columns_available[tag] = table.columns[field_name]
496 return context.sql_engine.make_leaf(
497 payload.columns_available.keys(),
498 name=element.name,
499 payload=payload,
500 )
502 def _make_common_skypix_join_relation(
503 self,
504 element: DimensionElement,
505 context: queries.SqlQueryContext,
506 ) -> Relation:
507 """Construct a subquery expression containing overlaps between the
508 common skypix dimension and the given dimension element.
510 Parameters
511 ----------
512 element : `DimensionElement`
513 Spatial dimension element whose overlaps with the common skypix
514 system are represented by the returned relation.
515 context : `.queries.SqlQueryContext`
516 Object that manages relation engines and database-side state
517 (e.g. temporary tables) for the query.
519 Returns
520 -------
521 relation : `sql.Relation`
522 Join relation.
523 """
524 assert element.spatial is not None, "Only called for spatial dimension elements."
525 assert element.has_own_table, "Only called for dimension elements with their own tables."
526 _, table = self._overlap_tables[element.name]
527 payload = sql.Payload[LogicalColumn](table)
528 payload.columns_available[DimensionKeyColumnTag(self.universe.commonSkyPix.name)] = (
529 payload.from_clause.columns.skypix_index
530 )
531 for dimension_name in element.graph.required.names:
532 payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[
533 dimension_name
534 ]
535 payload.where.append(table.columns.skypix_system == self.universe.commonSkyPix.system.name)
536 payload.where.append(table.columns.skypix_level == self.universe.commonSkyPix.level)
537 leaf = context.sql_engine.make_leaf(
538 payload.columns_available.keys(),
539 name=f"{element.name}_{self.universe.commonSkyPix.name}_overlap",
540 payload=payload,
541 )
542 return leaf
544 @classmethod
545 def currentVersions(cls) -> list[VersionTuple]:
546 # Docstring inherited from VersionedExtension.
547 return [_VERSION]
549 @classmethod
550 def _make_skypix_overlap_tables(
551 cls, context: StaticTablesContext, element: DimensionElement
552 ) -> tuple[sqlalchemy.Table, sqlalchemy.Table]:
553 assert element.governor is not None
554 summary_spec = ddl.TableSpec(
555 fields=[
556 ddl.FieldSpec(
557 name="skypix_system",
558 dtype=sqlalchemy.String,
559 length=16,
560 nullable=False,
561 primaryKey=True,
562 ),
563 ddl.FieldSpec(
564 name="skypix_level",
565 dtype=sqlalchemy.SmallInteger,
566 nullable=False,
567 primaryKey=True,
568 ),
569 ]
570 )
571 addDimensionForeignKey(summary_spec, element.governor, primaryKey=True)
572 overlap_spec = ddl.TableSpec(
573 fields=[
574 ddl.FieldSpec(
575 name="skypix_system",
576 dtype=sqlalchemy.String,
577 length=16,
578 nullable=False,
579 primaryKey=True,
580 ),
581 ddl.FieldSpec(
582 name="skypix_level",
583 dtype=sqlalchemy.SmallInteger,
584 nullable=False,
585 primaryKey=True,
586 ),
587 # (more columns added below)
588 ],
589 unique=set(),
590 indexes={
591 # This index has the same fields as the PK, in a different
592 # order, to facilitate queries that know skypix_index and want
593 # to find the other element.
594 ddl.IndexSpec(
595 "skypix_system",
596 "skypix_level",
597 "skypix_index",
598 *element.graph.required.names,
599 ),
600 },
601 foreignKeys=[
602 # Foreign key to summary table. This makes sure we don't
603 # materialize any overlaps without remembering that we've done
604 # so in the summary table, though it can't prevent the converse
605                # of adding a summary row without adding an overlap row (either of
606 # those is a logic bug, of course, but we want to be defensive
607 # about those). Using ON DELETE CASCADE, it'd be very easy to
608 # implement "disabling" an overlap materialization, because we
609 # can just delete the summary row.
610 # Note that the governor dimension column is added below, in
611 # the call to addDimensionForeignKey.
612 ddl.ForeignKeySpec(
613 f"{element.name}_skypix_overlap_summary",
614 source=("skypix_system", "skypix_level", element.governor.name),
615 target=("skypix_system", "skypix_level", element.governor.name),
616 onDelete="CASCADE",
617 ),
618 ],
619 )
620 # Add fields for the standard element this class manages overlaps for.
621 # This is guaranteed to add a column for the governor dimension,
622 # because that's a required dependency of element.
623 for dimension in element.required:
624 addDimensionForeignKey(overlap_spec, dimension, primaryKey=True)
625 # Add field for the actual skypix index. We do this later because I
626 # think we care (at least a bit) about the order in which the primary
627 # key is defined, in that we want a non-summary column like this one
628 # to appear after the governor dimension column.
629 overlap_spec.fields.add(
630 ddl.FieldSpec(
631 name="skypix_index",
632 dtype=sqlalchemy.BigInteger,
633 nullable=False,
634 primaryKey=True,
635 )
636 )
637 return (
638 context.addTable(f"{element.name}_skypix_overlap_summary", summary_spec),
639 context.addTable(f"{element.name}_skypix_overlap", overlap_spec),
640 )
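    # Rough shape of the pair of tables declared above for, e.g., ``tract``
    # (whose governor is ``skymap``); columns follow the specs built in this
    # method and the DDL below is only a sketch:
    #
    #     tract_skypix_overlap_summary(skypix_system, skypix_level, skymap)
    #     tract_skypix_overlap(skypix_system, skypix_level, skymap, tract,
    #                          skypix_index)
    #
    # The overlap table's primary key spans all of its columns, it carries an
    # extra index on (skypix_system, skypix_level, skypix_index, skymap, tract),
    # and it has a foreign key on (skypix_system, skypix_level, skymap) to the
    # summary table with ON DELETE CASCADE.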
642 @classmethod
643 def _make_legacy_overlap_tables(
644 cls,
645 context: StaticTablesContext,
646 spatial: NamedKeyDict[DatabaseTopologicalFamily, list[DimensionElement]],
647 ) -> None:
648 for (_, elements1), (_, elements2) in itertools.combinations(spatial.items(), 2):
649 for element1, element2 in itertools.product(elements1, elements2):
650                if element1 > element2:
651 (element2, element1) = (element1, element2)
652 assert element1.spatial is not None and element2.spatial is not None
653 assert element1.governor != element2.governor
654 assert element1.governor is not None and element2.governor is not None
655 summary_spec = ddl.TableSpec(fields=[])
656 addDimensionForeignKey(summary_spec, element1.governor, primaryKey=True)
657 addDimensionForeignKey(summary_spec, element2.governor, primaryKey=True)
658 context.addTable(f"{element1.name}_{element2.name}_overlap_summary", summary_spec)
659 overlap_spec = ddl.TableSpec(fields=[])
660 addDimensionForeignKey(overlap_spec, element1.governor, primaryKey=True)
661 addDimensionForeignKey(overlap_spec, element2.governor, primaryKey=True)
662 for dimension in element1.required:
663 if dimension != element1.governor:
664 addDimensionForeignKey(overlap_spec, dimension, primaryKey=True)
665 for dimension in element2.required:
666 if dimension != element2.governor:
667 addDimensionForeignKey(overlap_spec, dimension, primaryKey=True)
668 context.addTable(f"{element1.name}_{element2.name}_overlap", overlap_spec)
670 def _make_record_db_rows(
671 self, element: DimensionElement, records: Sequence[DimensionRecord], replace: bool
672 ) -> _DimensionRecordDatabaseRows:
673 result = _DimensionRecordDatabaseRows()
674 result.main_rows = [record.toDict() for record in records]
675 if element.temporal is not None:
676 TimespanReprClass = self._db.getTimespanRepresentation()
677 for row in result.main_rows:
678 timespan = row.pop("timespan")
679 TimespanReprClass.update(timespan, result=row)
680 if element.spatial is not None:
681 result.overlap_insert_rows = self._compute_common_skypix_overlap_inserts(element, records)
682 if replace:
683 result.overlap_delete_rows = self._compute_common_skypix_overlap_deletes(records)
684 if element in self.universe.governor_dimensions:
685 for related_element_name in self._overlap_tables.keys():
686 if self.universe[related_element_name].governor == element:
687 result.overlap_summary_rows[related_element_name] = [
688 {
689 "skypix_system": self.universe.commonSkyPix.system.name,
690 "skypix_level": self.universe.commonSkyPix.level,
691 element.name: record.dataId[element.name],
692 }
693 for record in records
694 ]
695 return result
697 def _compute_common_skypix_overlap_deletes(
698 self, records: Sequence[DimensionRecord]
699 ) -> list[dict[str, Any]]:
700 return [
701 {
702 "skypix_system": self.universe.commonSkyPix.system.name,
703 "skypix_level": self.universe.commonSkyPix.level,
704 **record.dataId.required,
705 }
706 for record in records
707 ]
709 def _compute_common_skypix_overlap_inserts(
710 self,
711 element: DimensionElement,
712 records: Sequence[DimensionRecord],
713 ) -> list[dict[str, Any]]:
714 _LOG.debug("Precomputing common skypix overlaps for %s.", element.name)
715 overlap_records: list[dict[str, Any]] = []
716 for record in records:
717 if record.region is None:
718 continue
719 base_overlap_record = dict(record.dataId.required)
720 base_overlap_record["skypix_system"] = self.universe.commonSkyPix.system.name
721 base_overlap_record["skypix_level"] = self.universe.commonSkyPix.level
722 for begin, end in self.universe.commonSkyPix.pixelization.envelope(record.region):
723 for index in range(begin, end):
724 overlap_records.append({"skypix_index": index, **base_overlap_record})
725 return overlap_records
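    # Worked example (values illustrative; assumes the common skypix dimension
    # is htm7, as in the default universe): for a tract record whose region
    # envelope is the single range (1000, 1003), this returns
    #
    #     [{"skypix_index": 1000, "skymap": "m", "tract": 4,
    #       "skypix_system": "htm", "skypix_level": 7},
    #      {"skypix_index": 1001, ...},
    #      {"skypix_index": 1002, ...}]
    #
    # i.e. one row per pixel in each half-open [begin, end) range.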
727 def _insert_overlaps(
728 self,
729 element: DimensionElement,
730 overlap_insert_rows: list[dict[str, Any]],
731 overlap_delete_rows: list[dict[str, Any]],
732 skip_existing: bool = False,
733 ) -> None:
734 if overlap_delete_rows:
735 # Since any of the new records might have replaced existing ones
736 # that already have overlap records, and we don't know which, we
737 # have no choice but to delete all overlaps for these records and
738 # recompute them. We include the skypix_system and skypix_level
739 # column values explicitly instead of just letting the query search
740 # for all of those related to the given records, because they are
741 # the first columns in the primary key, and hence searching with
742 # them will be way faster (and we don't want to add a new index
743 # just for this operation).
744 _LOG.debug("Deleting old common skypix overlaps for %s.", element.name)
745 self._db.delete(
746 self._overlap_tables[element.name][1],
747 ["skypix_system", "skypix_level"] + list(element.minimal_group.required),
748 *overlap_delete_rows,
749 )
750 if overlap_insert_rows:
751 _LOG.debug("Inserting %d new skypix overlap rows for %s.", len(overlap_insert_rows), element.name)
752 if skip_existing:
753 self._db.ensure(
754 self._overlap_tables[element.name][1], *overlap_insert_rows, primary_key_only=True
755 )
756 else:
757 self._db.insert(self._overlap_tables[element.name][1], *overlap_insert_rows)
758 # We have only ever put overlaps with the commonSkyPix system into
759 # this table, and *probably* only ever will. But the schema leaves
760 # open the possibility that we should be inserting overlaps for
761 # some other skypix system, as we once thought we'd support. In
762 # case that door opens again in the future, we need to check the
763            # "overlap summary" table to see if there are any skypix systems other
764 # than the common skypix system and raise (rolling back the entire
765 # transaction) if there are.
766 summary_table = self._overlap_tables[element.name][0]
767 check_sql = (
768 sqlalchemy.sql.select(summary_table.columns.skypix_system, summary_table.columns.skypix_level)
769 .select_from(summary_table)
770 .where(
771 sqlalchemy.sql.not_(
772 sqlalchemy.sql.and_(
773 summary_table.columns.skypix_system == self.universe.commonSkyPix.system.name,
774 summary_table.columns.skypix_level == self.universe.commonSkyPix.level,
775 )
776 )
777 )
778 )
779 with self._db.query(check_sql) as sql_result:
780 bad_summary_rows = sql_result.fetchall()
781            if bad_summary_rows:
782                bad_skypix_names = [f"{row.skypix_system}{row.skypix_level}" for row in bad_summary_rows]
783 raise RuntimeError(
784 f"Data repository has overlaps between {element} and {bad_skypix_names} that "
785 "are not supported by this version of daf_butler. Please use a newer version."
786 )
789@dataclasses.dataclass
790class _DimensionRecordDatabaseRows:
791 """Rows to be inserted into the database whenever a DimensionRecord is
792 added.
793 """
795 main_rows: list[dict[str, Any]] = dataclasses.field(default_factory=list)
796 """Rows for the dimension element table itself."""
798 overlap_insert_rows: list[dict[str, Any]] = dataclasses.field(default_factory=list)
799 """Rows for overlaps with the common skypix dimension."""
801 overlap_delete_rows: list[dict[str, Any]] = dataclasses.field(default_factory=list)
802 """Rows for overlaps with the common skypix dimension that should be
803 deleted before inserting new ones.
804 """
806 overlap_summary_rows: dict[str, list[dict[str, Any]]] = dataclasses.field(default_factory=dict)
807    """Rows that record which overlaps between skypix dimensions and other
808 dimension elements are stored.
810 This is populated when inserting governor dimension rows, with keys being
811 the names of spatial dimension elements associated with that governor.
812 """
815class _DimensionGroupStorage:
816 """Helper object that manages saved DimensionGroup definitions.
818 Should generally be constructed by calling `initialize` instead of invoking
819 the constructor directly.
821 Parameters
822 ----------
823 db : `Database`
824 Interface to the underlying database engine and namespace.
825 idTable : `sqlalchemy.schema.Table`
826 Table that just holds unique IDs for dimension graphs.
827 definitionTable : `sqlalchemy.schema.Table`
828 Table that maps dimension names to the IDs of the dimension graphs to
829 which they belong.
830 universe : `DimensionUniverse`
831 All known dimensions.
832 """
834 def __init__(
835 self,
836 db: Database,
837 idTable: sqlalchemy.schema.Table,
838 definitionTable: sqlalchemy.schema.Table,
839 universe: DimensionUniverse,
840 ):
841 self._db = db
842 self._idTable = idTable
843 self._definitionTable = definitionTable
844 self._universe = universe
845 self._keysByGroup: dict[DimensionGroup, int] = {universe.empty.as_group(): 0}
846 self._groupsByKey: dict[int, DimensionGroup] = {0: universe.empty.as_group()}
848 def clone(self, db: Database) -> _DimensionGroupStorage:
849 """Make an independent copy of this manager instance bound to a new
850 `Database` instance.
852 Parameters
853 ----------
854 db : `Database`
855 New `Database` object to use when instantiating the manager.
857 Returns
858 -------
859 instance : `_DimensionGroupStorage`
860 New manager instance with the same configuration as this instance,
861 but bound to a new Database object.
862 """
863 return _DimensionGroupStorage(
864 db=db, idTable=self._idTable, definitionTable=self._definitionTable, universe=self._universe
865 )
867 @classmethod
868 def initialize(
869 cls,
870 db: Database,
871 context: StaticTablesContext,
872 *,
873 universe: DimensionUniverse,
874 ) -> _DimensionGroupStorage:
875 """Construct a new instance, including creating tables if necessary.
877 Parameters
878 ----------
879 db : `Database`
880 Interface to the underlying database engine and namespace.
881 context : `StaticTablesContext`
882 Context object obtained from `Database.declareStaticTables`; used
883 to declare any tables that should always be present.
884 universe : `DimensionUniverse`
885 All known dimensions.
887 Returns
888 -------
889 storage : `_DimensionGroupStorage`
890 New instance of this class.
891 """
892 # We need two tables just so we have one where the autoincrement key is
893 # the only primary key column, as is required by (at least) SQLite. In
894 # other databases, we might be able to use a Sequence directly.
895 idTable = context.addTable(
896 "dimension_graph_key",
897 ddl.TableSpec(
898 fields=[
899 ddl.FieldSpec(
900 name="id",
901 dtype=sqlalchemy.BigInteger,
902 autoincrement=True,
903 primaryKey=True,
904 ),
905 ],
906 ),
907 )
908 definitionTable = context.addTable(
909 "dimension_graph_definition",
910 ddl.TableSpec(
911 fields=[
912 ddl.FieldSpec(name="dimension_graph_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
913 ddl.FieldSpec(name="dimension_name", dtype=sqlalchemy.Text, primaryKey=True),
914 ],
915 foreignKeys=[
916 ddl.ForeignKeySpec(
917 "dimension_graph_key",
918 source=("dimension_graph_id",),
919 target=("id",),
920 onDelete="CASCADE",
921 ),
922 ],
923 ),
924 )
925 return cls(db, idTable, definitionTable, universe=universe)
927 def refresh(self) -> None:
928        """Refresh the in-memory cache of saved DimensionGroup definitions.
930 This should be done automatically whenever needed, but it can also
931 be called explicitly.
932 """
933 dimensionNamesByKey: dict[int, set[str]] = defaultdict(set)
934 with self._db.query(self._definitionTable.select()) as sql_result:
935 sql_rows = sql_result.mappings().fetchall()
936 for row in sql_rows:
937 key = row[self._definitionTable.columns.dimension_graph_id]
938 dimensionNamesByKey[key].add(row[self._definitionTable.columns.dimension_name])
939 keysByGraph: dict[DimensionGroup, int] = {self._universe.empty.as_group(): 0}
940 graphsByKey: dict[int, DimensionGroup] = {0: self._universe.empty.as_group()}
941 for key, dimensionNames in dimensionNamesByKey.items():
942 graph = DimensionGroup(self._universe, names=dimensionNames)
943 keysByGraph[graph] = key
944 graphsByKey[key] = graph
945 self._groupsByKey = graphsByKey
946 self._keysByGroup = keysByGraph
948 def save(self, group: DimensionGroup) -> int:
949        """Save a `DimensionGroup` definition to the database, allowing it to
950 be retrieved later via the returned key.
952 Parameters
953 ----------
954 group : `DimensionGroup`
955 Set of dimensions to save.
957 Returns
958 -------
959 key : `int`
960            Integer used as the unique key for this `DimensionGroup` in the
961 database.
962 """
963 key = self._keysByGroup.get(group)
964 if key is not None:
965 return key
966 # Lock tables and then refresh to guard against races where some other
967 # process is trying to register the exact same dimension graph. This
968 # is probably not the most efficient way to do it, but it should be a
969 # rare operation, especially since the short-circuit above will usually
970 # work in long-lived data repositories.
971 with self._db.transaction(lock=[self._idTable, self._definitionTable]):
972 self.refresh()
973 key = self._keysByGroup.get(group)
974 if key is None:
975 (key,) = self._db.insert(self._idTable, {}, returnIds=True) # type: ignore
976 self._db.insert(
977 self._definitionTable,
978 *[{"dimension_graph_id": key, "dimension_name": name} for name in group.required],
979 )
980 self._keysByGroup[group] = key
981 self._groupsByKey[key] = group
982 return key
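    # Hedged sketch of what gets stored (key value assumed): saving the group
    # {"instrument", "physical_filter", "band"} in the default universe writes
    # one dimension_graph_key row and one dimension_graph_definition row per
    # *required* dimension ("band" is implied by "physical_filter", so it is
    # not stored explicitly):
    #
    #     dimension_graph_key:        (id=7)
    #     dimension_graph_definition: (7, "instrument"), (7, "physical_filter")
    #
    #     group = DimensionGroup(universe, names={"instrument", "physical_filter", "band"})
    #     key = storage.save(group)
    #     assert storage.load(key) == group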
984 def load(self, key: int) -> DimensionGroup:
985        """Retrieve a `DimensionGroup` that was previously saved in the
986 database.
988 Parameters
989 ----------
990 key : `int`
991            Integer used as the unique key for this `DimensionGroup` in the
992 database.
994 Returns
995 -------
996        group : `DimensionGroup`
997            Retrieved dimension group.
998 """
999 graph = self._groupsByKey.get(key)
1000 if graph is None:
1001 self.refresh()
1002 graph = self._groupsByKey[key]
1003 return graph
1006class _CommonSkyPixMediatedOverlapsVisitor(OverlapsVisitor):
1007 def __init__(
1008 self,
1009 db: Database,
1010 dimensions: DimensionGroup,
1011 overlap_tables: Mapping[str, tuple[sqlalchemy.Table, sqlalchemy.Table]],
1012 ):
1013 super().__init__(dimensions)
1014 self.builder: QueryBuilder = QueryJoiner(db).to_builder(qt.ColumnSet(dimensions))
1015 self.common_skypix = dimensions.universe.commonSkyPix
1016 self.overlap_tables: Mapping[str, tuple[sqlalchemy.Table, sqlalchemy.Table]] = overlap_tables
1017 self.common_skypix_overlaps_done: set[DatabaseDimensionElement] = set()
1019 def visit_spatial_constraint(
1020 self,
1021 element: DimensionElement,
1022 region: Region,
1023 flags: PredicateVisitFlags,
1024 ) -> qt.Predicate | None:
1025 # Reject spatial constraints that are nested inside OR or NOT, because
1026 # the postprocessing needed for those would be a lot harder.
1027 if flags & PredicateVisitFlags.INVERTED or flags & PredicateVisitFlags.HAS_OR_SIBLINGS:
1028 raise NotImplementedError(
1029 "Spatial overlap constraints nested inside OR or NOT are not supported."
1030 )
1031 # Delegate to super just because that's good practice with
1032 # OverlapVisitor.
1033 super().visit_spatial_constraint(element, region, flags)
1034 match element:
1035 case DatabaseDimensionElement():
1036 # If this is a database dimension element like tract, patch, or
1037 # visit, we need to:
1038 # - join in the common skypix overlap table for this element;
1039 # - constrain the common skypix index to be inside the
1040 # ranges that overlap the region as a SQL where clause;
1041 # - add postprocessing to reject rows where the database
1042 # dimension element's region doesn't actually overlap the
1043 # region.
1044 self.builder.postprocessing.spatial_where_filtering.append((element, region))
1045                if self.common_skypix.name in self.dimensions:
1046 # The common skypix dimension should be part of the query
1047 # as a first-class dimension, so we can join in the overlap
1048 # table directly, and fall through to the end of this
1049 # function to construct a Predicate that will turn into the
1050 # SQL WHERE clause we want.
1051 self._join_common_skypix_overlap(element)
1052 skypix = self.common_skypix
1053 else:
1054 # We need to hide the common skypix dimension from the
1055 # larger query, so we make a subquery out of the overlap
1056 # table that embeds the SQL WHERE clause we want and then
1057 # projects out that dimension (with SELECT DISTINCT, to
1058 # avoid introducing duplicate rows into the larger query).
1059 joiner = self._make_common_skypix_overlap_joiner(element)
1060 sql_where_or: list[sqlalchemy.ColumnElement[bool]] = []
1061 sql_skypix_col = joiner.dimension_keys[self.common_skypix.name][0]
1062 for begin, end in self.common_skypix.pixelization.envelope(region):
1063 sql_where_or.append(sqlalchemy.and_(sql_skypix_col >= begin, sql_skypix_col < end))
1064 joiner.where(sqlalchemy.or_(*sql_where_or))
1065 self.builder.join(
1066 joiner.to_builder(
1067 qt.ColumnSet(element.minimal_group).drop_implied_dimension_keys(), distinct=True
1068 ).to_joiner()
1069 )
1070 # Short circuit here since the SQL WHERE clause has already
1071 # been embedded in the subquery.
1072 return qt.Predicate.from_bool(True)
1073            case SkyPixDimension():
1074                # If this is a skypix dimension, we can do an index-in-ranges
1075 # test directly on that dimension. Note that this doesn't on
1076 # its own guarantee the skypix dimension column will be in the
1077 # query; that'll be the job of the DirectQueryDriver to sort
1078 # out (generally this will require a dataset using that skypix
1079 # dimension to be joined in, unless this is the common skypix
1080 # system).
1081 assert (
1082 element.name in self.dimensions
1083 ), "QueryTree guarantees dimensions are expanded when constraints are added."
1084 skypix = element
1085 case _:
1086 raise NotImplementedError(
1087 f"Spatial overlap constraint for dimension {element} not supported."
1088 )
1089 # Convert the region-overlap constraint into a skypix
1090 # index range-membership constraint in SQL.
1091 result = qt.Predicate.from_bool(False)
1092 skypix_col_ref = qt.DimensionKeyReference.model_construct(dimension=skypix)
1093 for begin, end in skypix.pixelization.envelope(region):
1094 result = result.logical_or(qt.Predicate.in_range(skypix_col_ref, start=begin, stop=end))
1095 return result
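    # Hedged sketch of the predicate built above: if the skypix envelope of the
    # region is [(b0, e0), (b1, e1)], the returned predicate is logically
    #
    #     (b0 <= skypix < e0) OR (b1 <= skypix < e1)
    #
    # over the skypix key column, which the query driver later renders as SQL
    # range tests (each range is half-open, matching `Predicate.in_range`).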
1097 def visit_spatial_join(
1098 self, a: DimensionElement, b: DimensionElement, flags: PredicateVisitFlags
1099 ) -> qt.Predicate | None:
1100 # Reject spatial joins that are nested inside OR or NOT, because the
1101 # postprocessing needed for those would be a lot harder.
1102 if flags & PredicateVisitFlags.INVERTED or flags & PredicateVisitFlags.HAS_OR_SIBLINGS:
1103 raise NotImplementedError("Spatial overlap joins nested inside OR or NOT are not supported.")
1104 # Delegate to super to check for invalid joins and record this
1105 # "connection" for use when seeing whether to add an automatic join
1106 # later.
1107 super().visit_spatial_join(a, b, flags)
1108 match (a, b):
1109 case (self.common_skypix, DatabaseDimensionElement() as b):
1110 self._join_common_skypix_overlap(b)
1111 case (DatabaseDimensionElement() as a, self.common_skypix):
1112 self._join_common_skypix_overlap(a)
1113            case (DatabaseDimensionElement() as a, DatabaseDimensionElement() as b):
1114                if self.common_skypix.name in self.dimensions:
1115 # We want the common skypix dimension to appear in the
1116 # query as a first-class dimension, so just join in the
1117 # two overlap tables directly.
1118 self._join_common_skypix_overlap(a)
1119 self._join_common_skypix_overlap(b)
1120 else:
1121 # We do not want the common skypix system to appear in the
1122 # query or cause duplicate rows, so we join the two overlap
1123 # tables in a subquery that projects out the common skypix
1124 # index column with SELECT DISTINCT.
1126 self.builder.join(
1127 self._make_common_skypix_overlap_joiner(a)
1128 .join(self._make_common_skypix_overlap_joiner(b))
1129 .to_builder(
1130 qt.ColumnSet(a.minimal_group | b.minimal_group).drop_implied_dimension_keys(),
1131 distinct=True,
1132 )
1133 .to_joiner()
1134 )
1135 # In both cases we add postprocessing to check that the regions
1136 # really do overlap, since overlapping the same common skypix
1137 # tile is necessary but not sufficient for that.
1138 self.builder.postprocessing.spatial_join_filtering.append((a, b))
1139 case _:
1140 raise NotImplementedError(f"Unsupported combination for spatial join: {a, b}.")
1141 return qt.Predicate.from_bool(True)
1143 def _join_common_skypix_overlap(self, element: DatabaseDimensionElement) -> None:
1144        if element not in self.common_skypix_overlaps_done:
1145 self.builder.join(self._make_common_skypix_overlap_joiner(element))
1146 self.common_skypix_overlaps_done.add(element)
1148 def _make_common_skypix_overlap_joiner(self, element: DatabaseDimensionElement) -> QueryJoiner:
1149 _, overlap_table = self.overlap_tables[element.name]
1150 return (
1151 QueryJoiner(self.builder.joiner.db, overlap_table)
1152 .extract_dimensions(element.required.names, skypix_index=self.common_skypix.name)
1153 .where(
1154 sqlalchemy.and_(
1155 overlap_table.c.skypix_system == self.common_skypix.system.name,
1156 overlap_table.c.skypix_level == self.common_skypix.level,
1157 )
1158 )
1159 )