Coverage for python/lsst/daf/butler/registry/dimensions/table.py: 96%
188 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-12 02:19 -0800
« prev ^ index » next coverage.py v6.5.0, created at 2022-11-12 02:19 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["TableDimensionRecordStorage"]
25import dataclasses
26import logging
27import warnings
28from collections.abc import Iterable, Mapping, Sequence
29from typing import Any
31import sqlalchemy
33from ...core import (
34 DatabaseDimensionElement,
35 DataCoordinateIterable,
36 DimensionElement,
37 DimensionRecord,
38 GovernorDimension,
39 NamedKeyDict,
40 NamedKeyMapping,
41 NamedValueSet,
42 SimpleQuery,
43 TimespanDatabaseRepresentation,
44 addDimensionForeignKey,
45 ddl,
46)
47from ..interfaces import (
48 Database,
49 DatabaseDimensionOverlapStorage,
50 DatabaseDimensionRecordStorage,
51 GovernorDimensionRecordStorage,
52 StaticTablesContext,
53)
54from ..queries import QueryBuilder
# Module-level logger, named after this module.
_LOG = logging.getLogger(__name__)


# NOTE(review): this constant is not referenced anywhere in the visible part
# of this module -- confirm whether it is still used by callers elsewhere.
MAX_FETCH_CHUNK = 1000
"""Maximum number of data IDs we fetch records at a time.

Barring something database-engine-specific, this sets the size of the actual
SQL query, not just the number of result rows, because the only way to query
for multiple data IDs in a single SELECT query via SQLAlchemy is to have an OR
term in the WHERE clause for each one.
"""
class TableDimensionRecordStorage(DatabaseDimensionRecordStorage):
    """A record storage implementation that uses a regular database table.

    Parameters
    ----------
    db : `Database`
        Interface to the database engine and namespace that will hold these
        dimension records.
    element : `DatabaseDimensionElement`
        The element whose records this storage will manage.
    table : `sqlalchemy.schema.Table`
        The logical table for the element.
    skypix_overlap_tables : `_SkyPixOverlapTables`, optional
        Object that manages the tables that hold materialized spatial overlap
        joins to skypix dimensions.  Should be `None` if (and only if)
        ``element.spatial is None``.
    """

    def __init__(
        self,
        db: Database,
        element: DatabaseDimensionElement,
        *,
        table: sqlalchemy.schema.Table,
        skypix_overlap_tables: _SkyPixOverlapTables | None = None,
    ):
        self._db = db
        self._table = table
        self._element = element
        # Map each of the element's dimension names to the table column that
        # holds it; `fetch` uses this to constrain queries by data ID.
        self._fetchColumns: dict[str, sqlalchemy.sql.ColumnElement] = {
            dimension.name: self._table.columns[name]
            for dimension, name in zip(
                self._element.dimensions, self._element.RecordClass.fields.dimensions.names
            )
        }
        self._skypix_overlap_tables = skypix_overlap_tables
        # Overlap storage objects registered later via `connect`.
        self._otherOverlaps: list[DatabaseDimensionOverlapStorage] = []

    @classmethod
    def initialize(
        cls,
        db: Database,
        element: DatabaseDimensionElement,
        *,
        context: StaticTablesContext | None = None,
        config: Mapping[str, Any],
        governors: NamedKeyMapping[GovernorDimension, GovernorDimensionRecordStorage],
    ) -> DatabaseDimensionRecordStorage:
        # Docstring inherited from DatabaseDimensionRecordStorage.
        spec = element.RecordClass.fields.makeTableSpec(TimespanReprClass=db.getTimespanRepresentation())
        # Declare the table through the static-tables context when one is
        # given; otherwise create it directly if it does not exist yet.
        if context is not None:
            table = context.addTable(element.name, spec)
        else:
            table = db.ensureTableExists(element.name, spec)
        if element.spatial is not None:
            # Spatial elements also get skypix overlap tables, and need to be
            # told whenever a new value of their governor dimension appears.
            governor = governors[element.spatial.governor]
            skypix_overlap_tables = _SkyPixOverlapTables.initialize(db, element, context=context)
            result = cls(db, element, table=table, skypix_overlap_tables=skypix_overlap_tables)
            governor.registerInsertionListener(result._on_governor_insert)
            return result
        else:
            return cls(db, element, table=table)

    @property
    def element(self) -> DatabaseDimensionElement:
        # Docstring inherited from DimensionRecordStorage.element.
        return self._element

    def clearCaches(self) -> None:
        # Docstring inherited from DimensionRecordStorage.clearCaches.
        # Nothing to do: this class keeps no record cache.
        pass

    def join(
        self,
        builder: QueryBuilder,
        *,
        regions: NamedKeyDict[DimensionElement, sqlalchemy.sql.ColumnElement] | None = None,
        timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation] | None = None,
    ) -> None:
        # Docstring inherited from DimensionRecordStorage.
        if regions is not None:
            # Join in the materialized overlaps with the common skypix
            # dimension, and report this element's region column to the
            # caller through ``regions``.
            dimensions = NamedValueSet(self.element.required)
            dimensions.add(self.element.universe.commonSkyPix)
            assert self._skypix_overlap_tables is not None
            builder.joinTable(self._select_skypix_overlaps(), dimensions)
            regionsInTable = self._table.columns["region"]
            regions[self.element] = regionsInTable
        joinOn = builder.startJoin(
            self._table, self.element.dimensions, self.element.RecordClass.fields.dimensions.names
        )
        if timespans is not None:
            # Require this element's timespan to overlap every timespan that
            # is already part of the query, then register our own.
            timespanInTable = self._db.getTimespanRepresentation().from_columns(self._table.columns)
            for timespanInQuery in timespans.values():
                joinOn.append(timespanInQuery.overlaps(timespanInTable))
            timespans[self.element] = timespanInTable
        builder.finishJoin(self._table, joinOn)
        # NOTE(review): the signature is annotated ``-> None`` but the table
        # is returned; the return is kept for backward compatibility.
        return self._table

    def fetch(self, dataIds: DataCoordinateIterable) -> Iterable[DimensionRecord]:
        # Docstring inherited from DimensionRecordStorage.fetch.
        RecordClass = self.element.RecordClass
        query = SimpleQuery()
        query.columns.extend(self._table.columns[name] for name in RecordClass.fields.standard.names)
        if self.element.spatial is not None:
            query.columns.append(self._table.columns["region"])
        if self.element.temporal is not None:
            TimespanReprClass = self._db.getTimespanRepresentation()
            query.columns.extend(self._table.columns[name] for name in TimespanReprClass.getFieldNames())
        query.join(self._table)
        dataIds.constrain(query, lambda name: self._fetchColumns[name])
        # This is a generator, so the warning filter below stays installed
        # for as long as the generator is suspended at the ``yield``.
        with warnings.catch_warnings():
            # Some of our generated queries may contain cartesian joins, this
            # is not a serious issue as it is properly constrained, so we want
            # to suppress sqlalchemy warnings.
            warnings.filterwarnings(
                "ignore",
                message="SELECT statement has a cartesian product",
                category=sqlalchemy.exc.SAWarning,
            )
            for row in self._db.query(query.combine()):
                values = row._asdict()
                if self.element.temporal is not None:
                    # Collapse the database timespan columns into a single
                    # value under the standard timespan field name.
                    values[TimespanDatabaseRepresentation.NAME] = TimespanReprClass.extract(values)
                yield RecordClass(**values)

    def insert(self, *records: DimensionRecord, replace: bool = False, skip_existing: bool = False) -> None:
        # Docstring inherited from DimensionRecordStorage.insert.
        elementRows = [record.toDict() for record in records]
        if self.element.temporal is not None:
            # Replace the single timespan value in each row with whatever
            # columns the database's timespan representation uses.
            TimespanReprClass = self._db.getTimespanRepresentation()
            for row in elementRows:
                timespan = row.pop(TimespanDatabaseRepresentation.NAME)
                TimespanReprClass.update(timespan, result=row)
        # Element rows and overlap rows are written in one transaction.
        with self._db.transaction():
            if replace:
                self._db.replace(self._table, *elementRows)
            elif skip_existing:
                self._db.ensure(self._table, *elementRows, primary_key_only=True)
            else:
                self._db.insert(self._table, *elementRows)
            if self._skypix_overlap_tables is not None:
                self._insert_skypix_overlaps(records, replace=replace, skip_existing=skip_existing)

    def sync(self, record: DimensionRecord, update: bool = False) -> bool | dict[str, Any]:
        # Docstring inherited from DimensionRecordStorage.sync.
        compared = record.toDict()
        # Split the record into its required (key) fields and the remaining
        # fields that are compared against any existing row.
        keys = {}
        for name in record.fields.required.names:
            keys[name] = compared.pop(name)
        if self.element.temporal is not None:
            TimespanReprClass = self._db.getTimespanRepresentation()
            timespan = compared.pop(TimespanDatabaseRepresentation.NAME)
            TimespanReprClass.update(timespan, result=compared)
        with self._db.transaction():
            _, inserted_or_updated = self._db.sync(
                self._table,
                keys=keys,
                compared=compared,
                update=update,
            )
            if inserted_or_updated and self._skypix_overlap_tables is not None:
                if inserted_or_updated is True:
                    # Inserted a new row, so we just need to insert new overlap
                    # rows.
                    self._insert_skypix_overlaps([record])
                elif "region" in inserted_or_updated:
                    # Updated the region, so we need to delete old overlap rows
                    # and insert new ones.
                    self._insert_skypix_overlaps([record], replace=True)
                # Otherwise we updated something other than a region, and the
                # overlap rows are still valid.
        return inserted_or_updated

    def digestTables(self) -> Iterable[sqlalchemy.schema.Table]:
        # Docstring inherited from DimensionRecordStorage.digestTables.
        result = [self._table]
        if self._skypix_overlap_tables is not None:
            result.append(self._skypix_overlap_tables.summary)
            result.append(self._skypix_overlap_tables.overlaps)
        return result

    def connect(self, overlaps: DatabaseDimensionOverlapStorage) -> None:
        # Docstring inherited from DatabaseDimensionRecordStorage.
        self._otherOverlaps.append(overlaps)

    def _on_governor_insert(self, record: DimensionRecord) -> None:
        """A `GovernorDimensionRecordStorage.registerInsertionListener`
        callback for this element.

        Parameters
        ----------
        record : `DimensionRecord`
            Record for this element's governor dimension.
        """
        # We need to enable overlaps between this new governor dimension value
        # and the common skypix dimension to record that we materialize
        # overlaps for that combination.  Foreign keys guarantee that there
        # can't be any rows of this storage object's own element with that
        # governor value yet, so we know there's nothing to insert into the
        # overlaps table yet.
        skypix = self.element.universe.commonSkyPix
        assert self._element.spatial is not None, "Only called for spatial dimension elements."
        assert (
            self._skypix_overlap_tables is not None
        ), "Spatial dimension elements always have skypix overlap tables."
        governor = self._element.spatial.governor
        self._db.sync(
            self._skypix_overlap_tables.summary,
            keys={
                "skypix_system": skypix.system.name,
                "skypix_level": skypix.level,
                governor.name: record.dataId[governor.name],
            },
        )

    def _insert_skypix_overlaps(
        self, records: Sequence[DimensionRecord], replace: bool = False, skip_existing: bool = False
    ) -> None:
        """Compute and insert overlap rows between this dimension element and
        the common skypix system.

        Parameters
        ----------
        records : `Sequence` [ `DimensionRecord` ]
            Records for ``self.element`` that are being inserted.
        replace : `bool`, optional
            If `True`, the given records are being inserted in a mode that may
            replace existing records, and hence overlap rows may need to be
            replaced as well.
        skip_existing : `bool`, optional
            If `True`, the given records are being inserted in a mode that
            ignores existing records with the same data ID, and hence overlap
            rows need to be inserted this way as well.

        Raises
        ------
        RuntimeError
            Raised if the summary table contains rows for any skypix
            system/level other than the common skypix dimension.
        """
        assert self._element.spatial is not None, "Only called for spatial dimension elements."
        assert (
            self._skypix_overlap_tables is not None
        ), "Spatial dimension elements always have skypix overlap tables."
        # At present, only overlaps with the "commonSkyPix" system can be
        # materialized, so we just compute and insert overlaps with those.
        #
        # To guard against this code being used with a data repository in which
        # newer code has enabled other overlaps, we check afterwards that the
        # summary table only contains commonSkyPix for all of these governor
        # dimensions.  In the future, we'll have to think about whether we need
        # some table locking to guarantee consistency for those other overlaps
        # if the summary table is updated at the same time as records are
        # being inserted.  This should happen within the same transaction
        # (handled by the caller) so that previous inserts get rolled back.
        skypix = self._element.universe.commonSkyPix
        if replace:
            # Since any of the new records might have replaced existing ones
            # that already have overlap records, and we don't know which, we
            # have no choice but to delete all overlaps for these records and
            # recompute them.
            # We include the skypix_system and skypix_level column values
            # explicitly instead of just letting the query search for all
            # of those related to the given records, because they are the
            # first columns in the primary key, and hence searching with
            # them will be way faster (and we don't want to add a new index
            # just for this operation).
            to_delete: list[dict[str, Any]] = [
                {"skypix_system": skypix.system.name, "skypix_level": skypix.level, **record.dataId.byName()}
                for record in records
            ]
            _LOG.debug("Deleting old common skypix overlaps for %s.", self.element.name)
            self._db.delete(
                self._skypix_overlap_tables.overlaps,
                ["skypix_system", "skypix_level"] + list(self.element.graph.required.names),
                *to_delete,
            )
        _LOG.debug("Precomputing common skypix overlaps for %s.", self.element.name)
        overlap_records: list[dict[str, Any]] = []
        for record in records:
            if record.region is None:
                # Records without a region have no overlaps to materialize.
                continue
            base_overlap_record = record.dataId.byName()
            base_overlap_record["skypix_system"] = skypix.system.name
            base_overlap_record["skypix_level"] = skypix.level
            # The envelope is iterated as (begin, end) index ranges; emit one
            # overlap row per pixel index in each range.
            for begin, end in skypix.pixelization.envelope(record.region):
                for index in range(begin, end):
                    overlap_records.append({"skypix_index": index, **base_overlap_record})
        _LOG.debug("Inserting %d new skypix overlap rows for %s.", len(overlap_records), self.element.name)
        if skip_existing:
            self._db.ensure(self._skypix_overlap_tables.overlaps, *overlap_records, primary_key_only=True)
        else:
            self._db.insert(self._skypix_overlap_tables.overlaps, *overlap_records)
        # Finally we check for non-commonSkyPix values in the summary table, as
        # noted above.
        summary = self._skypix_overlap_tables.summary
        # Columns are passed positionally, matching `_select_skypix_overlaps`.
        check_sql = (
            sqlalchemy.sql.select(summary.columns.skypix_system, summary.columns.skypix_level)
            .select_from(summary)
            .where(
                sqlalchemy.sql.not_(
                    sqlalchemy.sql.and_(
                        summary.columns.skypix_system == skypix.system.name,
                        summary.columns.skypix_level == skypix.level,
                    )
                )
            )
        )
        bad_summary_rows = self._db.query(check_sql).fetchall()
        if bad_summary_rows:
            # The result rows only have the two selected columns, so the
            # level must be read as ``skypix_level``.
            bad_skypix_names = [f"{row.skypix_system}{row.skypix_level}" for row in bad_summary_rows]
            raise RuntimeError(
                f"Data repository has overlaps between {self._element} and {bad_skypix_names} that "
                "are not supported by this version of daf_butler. Please use a newer version."
            )

    def _select_skypix_overlaps(self) -> sqlalchemy.sql.FromClause:
        """Construct a subquery expression containing overlaps between common
        skypix dimension and this dimension element.

        Returns
        -------
        subquery : `sqlalchemy.sql.FromClause`
            A SELECT query with an alias, intended for use as a subquery, with
            columns equal to::

                list(self.element.required.names)
                + [self.element.universe.commonSkyPix.name]
        """
        assert self._element.spatial is not None, "Only called for spatial dimension elements."
        assert (
            self._skypix_overlap_tables is not None
        ), "Spatial dimension elements always have skypix overlap tables."
        skypix = self._element.universe.commonSkyPix
        table = self._skypix_overlap_tables.overlaps
        # Expose the skypix index under the name of the common skypix
        # dimension, followed by the element's required dimension columns.
        columns = [table.columns.skypix_index.label(skypix.name)]
        columns.extend(table.columns[name] for name in self.element.graph.required.names)
        query = (
            sqlalchemy.sql.select(*columns)
            .select_from(table)
            .where(
                sqlalchemy.sql.and_(
                    table.columns.skypix_system == skypix.system.name,
                    table.columns.skypix_level == skypix.level,
                )
            )
        )
        return query.alias(f"{self.element.name}_{skypix.name}_overlap")
@dataclasses.dataclass
class _SkyPixOverlapTables:
    """Helper for `TableDimensionRecordStorage` that owns the tables holding
    materialized overlaps with skypix dimensions.

    Use `initialize` to create instances rather than the constructor that
    the dataclass machinery provides.

    Notes
    -----
    In principle this class (together with the related methods in
    `TableDimensionRecordStorage`) could manage overlaps between a database
    dimension element and any skypix dimension.  At present it only manages
    relationships with the special ``commonSkyPix`` dimension, because that
    is all the query system uses.  Eventually we expect to require users to
    explicitly materialize more relationships.

    Other possible future improvements include:

    - letting finer-grained skypix dimensions supply overlap rows for
      coarser ones, by dividing indices by powers of 4 (possibly with
      ``SELECT DISTINCT`` in the subquery to remove duplicates);

    - letting finer-grained database elements (e.g. patch) supply overlap
      rows for coarser ones (e.g. tract), by ignoring irrelevant columns
      (e.g. the patch IDs) in the subquery (again, possibly with ``SELECT
      DISTINCT``).

    None of that is worthwhile until the query system can work out how best
    to ask for overlap rows when an exact match isn't available.
    """

    summary: sqlalchemy.schema.Table
    """Table that records which governor value / skypix combinations have
    materialized overlaps.
    """

    overlaps: sqlalchemy.schema.Table
    """Table that actually holds overlap rows.
    """

    @classmethod
    def initialize(
        cls,
        db: Database,
        element: DatabaseDimensionElement,
        *,
        context: StaticTablesContext | None,
    ) -> _SkyPixOverlapTables:
        """Build a new instance, creating its tables as needed.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        element : `DatabaseDimensionElement`
            Dimension element whose overlaps are to be managed.
        context : `StaticTablesContext` or `None`
            Object used to create any new tables.  When `None`,
            ``db.ensureTableExists`` is used instead.

        Returns
        -------
        tables : `_SkyPixOverlapTables`
            Instance holding the summary and overlap tables.
        """
        create_table = context.addTable if context is not None else db.ensureTableExists
        # The summary table is created first; the overlap table declares a
        # foreign key to it.
        summary_name = cls._SUMMARY_TABLE_NAME_SPEC.format(element=element)
        summary_table = create_table(summary_name, cls._makeSummaryTableSpec(element))
        overlap_name = cls._OVERLAP_TABLE_NAME_SPEC.format(element=element)
        overlap_table = create_table(overlap_name, cls._makeOverlapTableSpec(element))
        return cls(summary=summary_table, overlaps=overlap_table)

    @staticmethod
    def _makeSkyPixKeyFields() -> list[ddl.FieldSpec]:
        """Return new field specs for the ``skypix_system`` and
        ``skypix_level`` primary-key columns shared by both tables.
        """
        system_field = ddl.FieldSpec(
            name="skypix_system",
            dtype=sqlalchemy.String,
            length=16,
            nullable=False,
            primaryKey=True,
        )
        level_field = ddl.FieldSpec(
            name="skypix_level",
            dtype=sqlalchemy.SmallInteger,
            nullable=False,
            primaryKey=True,
        )
        return [system_field, level_field]

    _SUMMARY_TABLE_NAME_SPEC = "{element.name}_skypix_overlap_summary"

    @classmethod
    def _makeSummaryTableSpec(cls, element: DatabaseDimensionElement) -> ddl.TableSpec:
        """Build the specification for the table recording which combinations
        of skypix dimension and governor value have materialized overlaps.

        Parameters
        ----------
        element : `DatabaseDimensionElement`
            Dimension element whose overlaps are to be managed.

        Returns
        -------
        tableSpec : `ddl.TableSpec`
            Table specification.
        """
        assert element.spatial is not None
        spec = ddl.TableSpec(fields=cls._makeSkyPixKeyFields())
        # The governor dimension column completes the primary key.
        addDimensionForeignKey(spec, element.spatial.governor, primaryKey=True)
        return spec

    _OVERLAP_TABLE_NAME_SPEC = "{element.name}_skypix_overlap"

    @classmethod
    def _makeOverlapTableSpec(cls, element: DatabaseDimensionElement) -> ddl.TableSpec:
        """Build the specification for the table that holds materialized
        overlap rows.

        Parameters
        ----------
        element : `DatabaseDimensionElement`
            Dimension element whose overlaps are to be managed.

        Returns
        -------
        tableSpec : `ddl.TableSpec`
            Table specification.
        """
        assert element.spatial is not None
        governor_name = element.spatial.governor.name
        # Same fields as the primary key but in a different order, to help
        # queries that know skypix_index and want to find the other element.
        reverse_lookup_index = ddl.IndexSpec(
            "skypix_system",
            "skypix_level",
            "skypix_index",
            *element.graph.required.names,
        )
        # Foreign key to the summary table.  It ensures we never materialize
        # overlaps without recording that fact in the summary table, though it
        # cannot prevent the converse (a summary row with no overlap rows);
        # either would be a logic bug, but we want to be defensive.  With ON
        # DELETE CASCADE, "disabling" an overlap materialization would just
        # mean deleting the summary row.
        # The governor dimension column itself is added further down, by the
        # addDimensionForeignKey calls.
        summary_key_columns = ("skypix_system", "skypix_level", governor_name)
        summary_foreign_key = ddl.ForeignKeySpec(
            cls._SUMMARY_TABLE_NAME_SPEC.format(element=element),
            source=summary_key_columns,
            target=summary_key_columns,
            onDelete="CASCADE",
        )
        spec = ddl.TableSpec(
            # (more columns added below)
            fields=cls._makeSkyPixKeyFields(),
            unique=set(),
            indexes={reverse_lookup_index},
            foreignKeys=[summary_foreign_key],
        )
        # Add fields for the standard element this class manages overlaps
        # for.  The governor dimension is a required dependency of element,
        # so this is guaranteed to add its column.
        for dimension in element.required:
            addDimensionForeignKey(spec, dimension, primaryKey=True)
        # The skypix index goes last because the order in which the primary
        # key is defined matters (at least a bit): a non-summary column like
        # this one should come after the governor dimension column.
        index_field = ddl.FieldSpec(
            name="skypix_index",
            dtype=sqlalchemy.BigInteger,
            nullable=False,
            primaryKey=True,
        )
        spec.fields.add(index_field)
        return spec