Coverage for python/lsst/daf/butler/registry/dimensions/table.py: 96%
190 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-15 00:09 +0000
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-15 00:09 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["TableDimensionRecordStorage"]
25import dataclasses
26import logging
27import warnings
28from collections.abc import Iterable, Mapping, Sequence
29from typing import Any
31import sqlalchemy
33from ...core import (
34 DatabaseDimensionElement,
35 DataCoordinateIterable,
36 DimensionElement,
37 DimensionRecord,
38 GovernorDimension,
39 NamedKeyDict,
40 NamedKeyMapping,
41 NamedValueSet,
42 SimpleQuery,
43 TimespanDatabaseRepresentation,
44 addDimensionForeignKey,
45 ddl,
46)
47from ..interfaces import (
48 Database,
49 DatabaseDimensionOverlapStorage,
50 DatabaseDimensionRecordStorage,
51 GovernorDimensionRecordStorage,
52 StaticTablesContext,
53)
54from ..queries import QueryBuilder
_LOG: logging.Logger = logging.getLogger(__name__)


MAX_FETCH_CHUNK: int = 1000
"""Upper bound on the number of data IDs whose records are fetched at once.

Unless a database engine offers something more specific, this limits the size
of the SQL query itself and not only the number of rows returned: querying for
several data IDs in one SELECT through SQLAlchemy requires one OR term in the
WHERE clause per data ID.
"""
class TableDimensionRecordStorage(DatabaseDimensionRecordStorage):
    """A record storage implementation that uses a regular database table.

    Parameters
    ----------
    db : `Database`
        Interface to the database engine and namespace that will hold these
        dimension records.
    element : `DatabaseDimensionElement`
        The element whose records this storage will manage.
    table : `sqlalchemy.schema.Table`
        The logical table for the element.
    skypix_overlap_tables : `_SkyPixOverlapTables`, optional
        Object that manages the tables that hold materialized spatial overlap
        joins to skypix dimensions.  Should be `None` if (and only if)
        ``element.spatial is None``.
    """

    def __init__(
        self,
        db: Database,
        element: DatabaseDimensionElement,
        *,
        table: sqlalchemy.schema.Table,
        skypix_overlap_tables: _SkyPixOverlapTables | None = None,
    ):
        self._db = db
        self._table = table
        self._element = element
        # Map each dimension name of the element to the table column that
        # holds it; the column names come from the record class's dimension
        # fields, paired positionally with the element's dimensions.
        self._fetchColumns: dict[str, sqlalchemy.sql.ColumnElement] = {
            dimension.name: self._table.columns[name]
            for dimension, name in zip(
                self._element.dimensions, self._element.RecordClass.fields.dimensions.names
            )
        }
        self._skypix_overlap_tables = skypix_overlap_tables
        self._otherOverlaps: list[DatabaseDimensionOverlapStorage] = []

    @classmethod
    def initialize(
        cls,
        db: Database,
        element: DatabaseDimensionElement,
        *,
        context: StaticTablesContext | None = None,
        config: Mapping[str, Any],
        governors: NamedKeyMapping[GovernorDimension, GovernorDimensionRecordStorage],
    ) -> DatabaseDimensionRecordStorage:
        # Docstring inherited from DatabaseDimensionRecordStorage.
        spec = element.RecordClass.fields.makeTableSpec(TimespanReprClass=db.getTimespanRepresentation())
        if context is not None:
            table = context.addTable(element.name, spec)
        else:
            table = db.ensureTableExists(element.name, spec)
        if element.spatial is None:
            # Non-spatial elements have no skypix overlap tables to manage.
            return cls(db, element, table=table)
        governor = governors[element.spatial.governor]
        skypix_overlap_tables = _SkyPixOverlapTables.initialize(db, element, context=context)
        result = cls(db, element, table=table, skypix_overlap_tables=skypix_overlap_tables)
        # Get notified about new governor values so the overlap summary table
        # can be kept up to date (see `_on_governor_insert`).
        governor.registerInsertionListener(result._on_governor_insert)
        return result

    @property
    def element(self) -> DatabaseDimensionElement:
        # Docstring inherited from DimensionRecordStorage.element.
        return self._element

    def clearCaches(self) -> None:
        # Docstring inherited from DimensionRecordStorage.clearCaches.
        pass

    def join(
        self,
        builder: QueryBuilder,
        *,
        regions: NamedKeyDict[DimensionElement, sqlalchemy.sql.ColumnElement] | None = None,
        timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation] | None = None,
    ) -> None:
        # Docstring inherited from DimensionRecordStorage.
        if regions is not None:
            # Spatial join: bring in the materialized overlaps with the common
            # skypix dimension and expose this element's region column.
            dimensions = NamedValueSet(self.element.required)
            dimensions.add(self.element.universe.commonSkyPix)
            assert self._skypix_overlap_tables is not None
            builder.joinTable(self._select_skypix_overlaps(), dimensions)
            regionsInTable = self._table.columns["region"]
            regions[self.element] = regionsInTable
        joinOn = builder.startJoin(
            self._table, self.element.dimensions, self.element.RecordClass.fields.dimensions.names
        )
        if timespans is not None:
            # Temporal join: require overlap with every timespan already in
            # the query, then register this element's own timespan.
            timespanInTable = self._db.getTimespanRepresentation().from_columns(self._table.columns)
            for timespanInQuery in timespans.values():
                joinOn.append(timespanInQuery.overlaps(timespanInTable))
            timespans[self.element] = timespanInTable
        builder.finishJoin(self._table, joinOn)
        # NOTE(review): the signature is annotated ``-> None`` but the table
        # is returned.  Both are kept unchanged here because callers may rely
        # on either; confirm against the base class declaration.
        return self._table

    def fetch(self, dataIds: DataCoordinateIterable) -> Iterable[DimensionRecord]:
        # Docstring inherited from DimensionRecordStorage.fetch.
        RecordClass = self.element.RecordClass
        query = SimpleQuery()
        query.columns.extend(self._table.columns[name] for name in RecordClass.fields.standard.names)
        if self.element.spatial is not None:
            query.columns.append(self._table.columns["region"])
        if self.element.temporal is not None:
            TimespanReprClass = self._db.getTimespanRepresentation()
            query.columns.extend(self._table.columns[name] for name in TimespanReprClass.getFieldNames())
        query.join(self._table)
        dataIds.constrain(query, lambda name: self._fetchColumns[name])
        with warnings.catch_warnings():
            # Some of our generated queries may contain cartesian joins, this
            # is not a serious issue as it is properly constrained, so we want
            # to suppress sqlalchemy warnings.
            warnings.filterwarnings(
                "ignore",
                message="SELECT statement has a cartesian product",
                category=sqlalchemy.exc.SAWarning,
            )
            with self._db.query(query.combine()) as sql_result:
                rows = sql_result.fetchall()
        # Yield only after leaving both context managers: suspending a
        # generator inside ``catch_warnings`` would leak the modified warning
        # filters to the consumer and restore them at an unpredictable time.
        for row in rows:
            values = row._asdict()
            if self.element.temporal is not None:
                values[TimespanDatabaseRepresentation.NAME] = TimespanReprClass.extract(values)
            yield RecordClass(**values)

    def insert(self, *records: DimensionRecord, replace: bool = False, skip_existing: bool = False) -> None:
        # Docstring inherited from DimensionRecordStorage.insert.
        elementRows = [record.toDict() for record in records]
        if self.element.temporal is not None:
            # Swap the single timespan entry for its database representation.
            TimespanReprClass = self._db.getTimespanRepresentation()
            for row in elementRows:
                timespan = row.pop(TimespanDatabaseRepresentation.NAME)
                TimespanReprClass.update(timespan, result=row)
        with self._db.transaction():
            if replace:
                self._db.replace(self._table, *elementRows)
            elif skip_existing:
                self._db.ensure(self._table, *elementRows, primary_key_only=True)
            else:
                self._db.insert(self._table, *elementRows)
            if self._skypix_overlap_tables is not None:
                self._insert_skypix_overlaps(records, replace=replace, skip_existing=skip_existing)

    def sync(self, record: DimensionRecord, update: bool = False) -> bool | dict[str, Any]:
        # Docstring inherited from DimensionRecordStorage.sync.
        compared = record.toDict()
        # Required fields identify the row; everything else is compared.
        keys = {}
        for name in record.fields.required.names:
            keys[name] = compared.pop(name)
        if self.element.temporal is not None:
            TimespanReprClass = self._db.getTimespanRepresentation()
            timespan = compared.pop(TimespanDatabaseRepresentation.NAME)
            TimespanReprClass.update(timespan, result=compared)
        with self._db.transaction():
            _, inserted_or_updated = self._db.sync(
                self._table,
                keys=keys,
                compared=compared,
                update=update,
            )
            if inserted_or_updated and self._skypix_overlap_tables is not None:
                if inserted_or_updated is True:
                    # Inserted a new row, so we just need to insert new overlap
                    # rows.
                    self._insert_skypix_overlaps([record])
                elif "region" in inserted_or_updated:
                    # Updated the region, so we need to delete old overlap rows
                    # and insert new ones.
                    self._insert_skypix_overlaps([record], replace=True)
                # Otherwise we updated something other than a region and the
                # overlap rows are still valid.
        return inserted_or_updated

    def digestTables(self) -> Iterable[sqlalchemy.schema.Table]:
        # Docstring inherited from DimensionRecordStorage.digestTables.
        result = [self._table]
        if self._skypix_overlap_tables is not None:
            result.append(self._skypix_overlap_tables.summary)
            result.append(self._skypix_overlap_tables.overlaps)
        return result

    def connect(self, overlaps: DatabaseDimensionOverlapStorage) -> None:
        # Docstring inherited from DatabaseDimensionRecordStorage.
        self._otherOverlaps.append(overlaps)

    def _on_governor_insert(self, record: DimensionRecord) -> None:
        """A `GovernorDimensionRecordStorage.registerInsertionListener`
        callback for this element.

        Parameters
        ----------
        record : `DimensionRecord`
            Record for this element's governor dimension.
        """
        # We need to enable overlaps between this new governor dimension value
        # and the common skypix dimension to record that we materialize
        # overlaps for that combination.  Foreign keys guarantee that there
        # can't be any rows of this storage object's own element with that
        # governor value yet, so we know there's nothing to insert into the
        # overlaps table yet.
        skypix = self.element.universe.commonSkyPix
        assert self._element.spatial is not None, "Only called for spatial dimension elements."
        assert (
            self._skypix_overlap_tables is not None
        ), "Spatial dimension elements always have skypix overlap tables."
        governor = self._element.spatial.governor
        self._db.sync(
            self._skypix_overlap_tables.summary,
            keys={
                "skypix_system": skypix.system.name,
                "skypix_level": skypix.level,
                governor.name: record.dataId[governor.name],
            },
        )

    def _insert_skypix_overlaps(
        self, records: Sequence[DimensionRecord], replace: bool = False, skip_existing: bool = False
    ) -> None:
        """Compute and insert overlap rows between this dimension element and
        the common skypix system.

        Parameters
        ----------
        records : `Sequence` [ `DimensionRecord` ]
            Records for ``self.element`` that are being inserted.
        replace : `bool`, optional
            If `True`, the given records are being inserted in a mode that may
            replace existing records, and hence overlap rows may need to be
            replaced as well.
        skip_existing : `bool`, optional
            If `True`, the given records are being inserted in a mode that
            ignores existing records with the same data ID, and hence overlap
            rows need to be inserted this way as well.

        Raises
        ------
        RuntimeError
            Raised if the summary table contains any skypix system/level
            combination other than the common skypix dimension.
        """
        assert self._element.spatial is not None, "Only called for spatial dimension elements."
        assert (
            self._skypix_overlap_tables is not None
        ), "Spatial dimension elements always have skypix overlap tables."
        # At present, only overlaps with the "commonSkyPix" system can be
        # materialized, so we just compute and insert overlaps with those.
        #
        # To guard against this code being used with a data repository in which
        # newer code has enabled other overlaps, we check afterwards that the
        # summary table only contains commonSkyPix for all of these governor
        # dimensions.  In the future, we'll have to think about whether we need
        # some table locking to guarantee consistency for those other overlaps
        # if the summary table is updated at the same time as records are
        # being inserted.  This should happen within the same transaction
        # (handled by the caller) so that previous inserts get rolled back.
        skypix = self._element.universe.commonSkyPix
        if replace:
            # Since any of the new records might have replaced existing ones
            # that already have overlap records, and we don't know which, we
            # have no choice but to delete all overlaps for these records and
            # recompute them.
            # We include the skypix_system and skypix_level column values
            # explicitly instead of just letting the query search for all
            # of those related to the given records, because they are the
            # first columns in the primary key, and hence searching with
            # them will be way faster (and we don't want to add a new index
            # just for this operation).
            to_delete: list[dict[str, Any]] = [
                {"skypix_system": skypix.system.name, "skypix_level": skypix.level, **record.dataId.byName()}
                for record in records
            ]
            _LOG.debug("Deleting old common skypix overlaps for %s.", self.element.name)
            self._db.delete(
                self._skypix_overlap_tables.overlaps,
                ["skypix_system", "skypix_level"] + list(self.element.graph.required.names),
                *to_delete,
            )
        _LOG.debug("Precomputing common skypix overlaps for %s.", self.element.name)
        overlap_records: list[dict[str, Any]] = []
        for record in records:
            if record.region is None:
                # Nothing to overlap with.
                continue
            base_overlap_record = record.dataId.byName()
            base_overlap_record["skypix_system"] = skypix.system.name
            base_overlap_record["skypix_level"] = skypix.level
            # The envelope is iterated as (begin, end) index ranges; emit one
            # overlap row per skypix index in each range.
            for begin, end in skypix.pixelization.envelope(record.region):
                for index in range(begin, end):
                    overlap_records.append({"skypix_index": index, **base_overlap_record})
        _LOG.debug("Inserting %d new skypix overlap rows for %s.", len(overlap_records), self.element.name)
        if skip_existing:
            self._db.ensure(self._skypix_overlap_tables.overlaps, *overlap_records, primary_key_only=True)
        else:
            self._db.insert(self._skypix_overlap_tables.overlaps, *overlap_records)
        # Finally we check for non-commonSkyPix values in the summary table, as
        # noted above.
        summary = self._skypix_overlap_tables.summary
        # Columns are passed positionally, matching the ``select(*columns)``
        # call in `_select_skypix_overlaps`.
        check_sql = (
            sqlalchemy.sql.select(summary.columns.skypix_system, summary.columns.skypix_level)
            .select_from(summary)
            .where(
                sqlalchemy.sql.not_(
                    sqlalchemy.sql.and_(
                        summary.columns.skypix_system == skypix.system.name,
                        summary.columns.skypix_level == skypix.level,
                    )
                )
            )
        )
        with self._db.query(check_sql) as sql_result:
            bad_summary_rows = sql_result.fetchall()
        if bad_summary_rows:
            # The selected column is named ``skypix_level``; rows have no
            # ``skypix`` attribute.
            bad_skypix_names = [f"{row.skypix_system}{row.skypix_level}" for row in bad_summary_rows]
            raise RuntimeError(
                f"Data repository has overlaps between {self._element} and {bad_skypix_names} that "
                "are not supported by this version of daf_butler.  Please use a newer version."
            )

    def _select_skypix_overlaps(self) -> sqlalchemy.sql.FromClause:
        """Construct a subquery expression containing overlaps between common
        skypix dimension and this dimension element.

        Returns
        -------
        subquery : `sqlalchemy.sql.FromClause`
            A SELECT query with an alias, intended for use as a subquery, with
            columns equal to::

                list(self.element.required.names)
                + [self.element.universe.commonSkyPix.name]
        """
        assert self._element.spatial is not None, "Only called for spatial dimension elements."
        assert (
            self._skypix_overlap_tables is not None
        ), "Spatial dimension elements always have skypix overlap tables."
        skypix = self._element.universe.commonSkyPix
        table = self._skypix_overlap_tables.overlaps
        # Expose the skypix index under the name of the common skypix
        # dimension, followed by the element's required dimension columns.
        columns = [table.columns.skypix_index.label(skypix.name)]
        columns.extend(table.columns[name] for name in self.element.graph.required.names)
        query = (
            sqlalchemy.sql.select(*columns)
            .select_from(table)
            .where(
                sqlalchemy.sql.and_(
                    table.columns.skypix_system == skypix.system.name,
                    table.columns.skypix_level == skypix.level,
                )
            )
        )
        return query.alias(f"{self.element.name}_{skypix.name}_overlap")
@dataclasses.dataclass
class _SkyPixOverlapTables:
    """A helper object for `TableDimensionRecordStorage` that manages the
    tables for materialized overlaps with skypix dimensions.

    New instances should be constructed by calling `initialize`, not by calling
    the dataclass-provided constructor directly.

    Notes
    -----
    This class (and the related methods in TableDimensionRecordStorage) can in
    principle manage overlaps between a database dimension element and any
    skypix dimension, but at present it is only being used to manage
    relationships with the special ``commonSkyPix`` dimension, because that's
    all the query system uses.  Eventually, we expect to require users to
    explicitly materialize more relationships.

    Other possible future improvements include:

    - allowing finer-grained skypix dimensions to provide overlap rows for
      coarser ones, by dividing indices by powers of 4 (and possibly doing
      ``SELECT DISTINCT`` in the subquery to remove duplicates);

    - allowing finer-grained database elements (e.g. patch) to provide overlap
      rows for coarser ones (e.g. tract), by ignoring irrelevant columns (e.g.
      the patch IDs) in the subquery (again, possible with ``SELECT
      DISTINCT``).

    But there's no point to doing any of that until the query system can figure
    out how best to ask for overlap rows when an exact match isn't available.
    """

    summary: sqlalchemy.schema.Table
    """Table that records which governor value / skypix combinations have
    materialized overlaps.
    """

    overlaps: sqlalchemy.schema.Table
    """Table that actually holds overlap rows.
    """

    @classmethod
    def initialize(
        cls,
        db: Database,
        element: DatabaseDimensionElement,
        *,
        context: StaticTablesContext | None = None,
    ) -> _SkyPixOverlapTables:
        """Construct a new instance, creating tables as needed.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        element : `DatabaseDimensionElement`
            Dimension element whose overlaps are to be managed.
        context : `StaticTablesContext`, optional
            If provided, an object to use to create any new tables.  If not
            provided, ``db.ensureTableExists`` is used instead.

        Returns
        -------
        tables : `_SkyPixOverlapTables`
            New instance holding the summary and overlap tables.
        """
        # Both tables are created through the same callable so they always
        # end up managed the same way.
        if context is not None:
            op = context.addTable
        else:
            op = db.ensureTableExists
        summary = op(
            cls._SUMMARY_TABLE_NAME_SPEC.format(element=element),
            cls._makeSummaryTableSpec(element),
        )
        overlaps = op(
            cls._OVERLAP_TABLE_NAME_SPEC.format(element=element),
            cls._makeOverlapTableSpec(element),
        )
        return cls(summary=summary, overlaps=overlaps)

    _SUMMARY_TABLE_NAME_SPEC = "{element.name}_skypix_overlap_summary"

    @classmethod
    def _makeSummaryTableSpec(cls, element: DatabaseDimensionElement) -> ddl.TableSpec:
        """Create a specification for the table that records which combinations
        of skypix dimension and governor value have materialized overlaps.

        Parameters
        ----------
        element : `DatabaseDimensionElement`
            Dimension element whose overlaps are to be managed.

        Returns
        -------
        tableSpec : `ddl.TableSpec`
            Table specification.
        """
        assert element.spatial is not None
        tableSpec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(
                    name="skypix_system",
                    dtype=sqlalchemy.String,
                    length=16,
                    nullable=False,
                    primaryKey=True,
                ),
                ddl.FieldSpec(
                    name="skypix_level",
                    dtype=sqlalchemy.SmallInteger,
                    nullable=False,
                    primaryKey=True,
                ),
            ]
        )
        # The governor dimension column completes the primary key.
        addDimensionForeignKey(tableSpec, element.spatial.governor, primaryKey=True)
        return tableSpec

    _OVERLAP_TABLE_NAME_SPEC = "{element.name}_skypix_overlap"

    @classmethod
    def _makeOverlapTableSpec(cls, element: DatabaseDimensionElement) -> ddl.TableSpec:
        """Create a specification for the table that holds materialized
        overlap rows.

        Parameters
        ----------
        element : `DatabaseDimensionElement`
            Dimension element whose overlaps are to be managed.

        Returns
        -------
        tableSpec : `ddl.TableSpec`
            Table specification.
        """
        assert element.spatial is not None
        tableSpec = ddl.TableSpec(
            fields=[
                ddl.FieldSpec(
                    name="skypix_system",
                    dtype=sqlalchemy.String,
                    length=16,
                    nullable=False,
                    primaryKey=True,
                ),
                ddl.FieldSpec(
                    name="skypix_level",
                    dtype=sqlalchemy.SmallInteger,
                    nullable=False,
                    primaryKey=True,
                ),
                # (more columns added below)
            ],
            unique=set(),
            indexes={
                # This index has the same fields as the PK, in a different
                # order, to facilitate queries that know skypix_index and want
                # to find the other element.
                ddl.IndexSpec(
                    "skypix_system",
                    "skypix_level",
                    "skypix_index",
                    *element.graph.required.names,
                ),
            },
            foreignKeys=[
                # Foreign key to summary table.  This makes sure we don't
                # materialize any overlaps without remembering that we've done
                # so in the summary table, though it can't prevent the converse
                # of adding a summary row without adding overlap row (either of
                # those is a logic bug, of course, but we want to be defensive
                # about those).  Using ON DELETE CASCADE, it'd be very easy to
                # implement "disabling" an overlap materialization, because we
                # can just delete the summary row.
                # Note that the governor dimension column is added below, in
                # the call to addDimensionForeignKey.
                ddl.ForeignKeySpec(
                    cls._SUMMARY_TABLE_NAME_SPEC.format(element=element),
                    source=("skypix_system", "skypix_level", element.spatial.governor.name),
                    target=("skypix_system", "skypix_level", element.spatial.governor.name),
                    onDelete="CASCADE",
                ),
            ],
        )
        # Add fields for the standard element this class manages overlaps for.
        # This is guaranteed to add a column for the governor dimension,
        # because that's a required dependency of element.
        for dimension in element.required:
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
        # Add field for the actual skypix index.  We do this later because I
        # think we care (at least a bit) about the order in which the primary
        # key is defined, in that we want a non-summary column like this one
        # to appear after the governor dimension column.
        tableSpec.fields.add(
            ddl.FieldSpec(
                name="skypix_index",
                dtype=sqlalchemy.BigInteger,
                nullable=False,
                primaryKey=True,
            )
        )
        return tableSpec