Coverage for python/lsst/daf/butler/registry/queries/_query.py: 22%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("Query",)
25from abc import ABC, abstractmethod
26from contextlib import contextmanager
27import dataclasses
28import enum
29import itertools
30from typing import (
31 ContextManager,
32 Dict,
33 Iterable,
34 Iterator,
35 Mapping,
36 Optional,
37 Tuple,
38 TYPE_CHECKING,
39)
41import sqlalchemy
43from lsst.sphgeom import Region
45from ...core import (
46 addDimensionForeignKey,
47 DataCoordinate,
48 DatasetRef,
49 DatasetType,
50 ddl,
51 Dimension,
52 DimensionElement,
53 DimensionGraph,
54 DimensionRecord,
55 DimensionUniverse,
56 SpatialRegionDatabaseRepresentation,
57 SimpleQuery,
58)
59from ..interfaces import Database
60from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers
62if TYPE_CHECKING: 62 ↛ 63line 62 didn't jump to line 63, because the condition on line 62 was never true
63 from ._builder import QueryBuilder
@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """Description of one column that appears in an ORDER BY clause.
    """

    column: sqlalchemy.sql.ColumnElement
    """Column expression to sort on (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """`True` to sort ascending, `False` to sort descending (`bool`)."""

    add_to_select: bool
    """`True` if the column is a non-key column that has to be added to the
    SELECT columns explicitly (`bool`)."""

    field_spec: Optional[ddl.FieldSpec]
    """Field specification for this column in a materialized table
    (`ddl.FieldSpec` or `None`).
    """

    dimension: Optional[Dimension]
    """The dimension this column corresponds to, or `None` if it does not
    correspond to one (`Dimension` or `None`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """The column wrapped in its sort direction, ready to be passed to
        ORDER BY (`sqlalchemy.sql.ColumnElement`).
        """
        if self.ordering:
            return self.column.asc()
        return self.column.desc()

    def materialized(self, table: sqlalchemy.schema.Table) -> OrderByColumn:
        """Build the equivalent ordering column for a materialized table.

        Parameters
        ----------
        table : `sqlalchemy.schema.Table`
            Materialized table; it is expected to already contain every
            column of the SELECT clause.

        Returns
        -------
        column : `OrderByColumn`
            Column definition to use with ORDER BY on the materialized table.
        """
        # Dimension columns are looked up by dimension name, everything else
        # by the name of the original column expression.
        lookup_name = self.dimension.name if self.dimension else self.column.name
        table_column = table.columns[lookup_name]
        return OrderByColumn(
            column=table_column,
            ordering=self.ordering,
            add_to_select=False,
            field_spec=None,
            dimension=self.dimension,
        )
class Query(ABC):
    """An abstract base class for queries that return some combination of
    `DatasetRef` and `DataCoordinate` objects.

    Parameters
    ----------
    graph : `DimensionGraph`
        Object describing the dimensions included in the query.
    whereRegion : `lsst.sphgeom.Region`, optional
        Region that all region columns in all returned rows must overlap.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.

    Notes
    -----
    The `Query` hierarchy abstracts over the database/SQL representation of a
    particular set of data IDs or datasets.  It is expected to be used as a
    backend for other objects that provide more natural interfaces for one or
    both of these, not as part of a public interface to query results.
    """

    def __init__(self, *,
                 graph: DimensionGraph,
                 whereRegion: Optional[Region],
                 managers: RegistryManagers,
                 doomed_by: Iterable[str] = (),
                 ):
        self.graph = graph
        self.whereRegion = whereRegion
        self.managers = managers
        # Copy into a tuple so the messages can be tested and re-read freely.
        self._doomed_by = tuple(doomed_by)
        # Numbers of rows dropped by the client-side region checks in `rows`;
        # they stay `None` until `rows` has been iterated.
        self._filtered_by_join: Optional[int] = None
        self._filtered_by_where: Optional[int] = None

    @abstractmethod
    def isUnique(self) -> bool:
        """Return `True` if this query's rows are guaranteed to be unique, and
        `False` otherwise.

        If this query has dataset results (`datasetType` is not `None`),
        uniqueness applies to the `DatasetRef` instances returned by
        `extractDatasetRef` from the result of `rows`.  If it does not have
        dataset results, uniqueness applies to the `DataCoordinate` instances
        returned by `extractDataId`.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return the query column that contains the primary key value for
        the dimension with the given name.

        Parameters
        ----------
        name : `str`
            Name of the dimension.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`.
            SQLAlchemy object representing a column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def spatial(self) -> Iterator[DimensionElement]:
        """An iterator over the dimension element columns used in post-query
        filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]).

        Notes
        -----
        This property is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to access this property.
        """
        raise NotImplementedError()

    @abstractmethod
    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return a region column for one of the dimension elements iterated
        over by `spatial`.

        Parameters
        ----------
        name : `str`
            Name of the element.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy representing a result column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    def datasetType(self) -> Optional[DatasetType]:
        """The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results (`DatasetType` or `None`).
        """
        cols = self.getDatasetColumns()
        if cols is None:
            return None
        return cols.datasetType

    def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        if self._doomed_by:
            return 0
        sql = self.sql
        if sql is None:
            # No columns and exactly one logical row.
            return 1
        # NOTE(review): ``spatial`` is annotated as an iterator, and iterator
        # objects are always truthy, so ``exact=True`` appears to always take
        # the row-by-row path even without spatial elements.  Left as-is
        # because the SQL COUNT path may not agree with `rows` for every
        # query - confirm before changing.
        if exact and self.spatial:
            return sum(1 for _ in self.rows(db, region=region))
        return db.query(
            sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)
        ).scalar()

    def any(
        self,
        db: Database, *,
        region: Optional[Region] = None,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.

        Raises
        ------
        TypeError
            Raised if ``exact=True`` is combined with ``execute=False``.
        """
        if self._doomed_by:
            return False
        sql = self.sql
        if sql is None:
            return True
        if exact and not execute:
            raise TypeError("Cannot obtain exact results without executing the query.")
        # NOTE(review): same always-truthy iterator test as in `count`.
        if exact and self.spatial:
            # Stop at the first row that survives client-side filtering.
            for _ in self.rows(db, region=region):
                return True
            return False
        elif execute:
            return db.query(sql.limit(1)).one_or_none() is not None
        else:
            return True

    def explain_no_results(
        self,
        db: Database, *,
        region: Optional[Region] = None,
        followup: bool = True,
    ) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        followup : `bool`, optional
            If `True` (default) perform inexpensive follow-up queries if no
            diagnostics are available from query generation alone.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if `rows`,
        `any`, or `count` was already called with the same region (with
        ``exact=True`` for the latter two).
        """
        if self._doomed_by:
            yield from self._doomed_by
            return
        if self._filtered_by_where:
            yield (
                f"{self._filtered_by_where} result rows were filtered out because "
                "one or more region did not overlap the WHERE-clause region."
            )
        if self._filtered_by_join:
            yield (
                f"{self._filtered_by_join} result rows were filtered out because "
                "one or more regions did not overlap."
            )
        if (not followup) or self._filtered_by_join or self._filtered_by_where:
            return
        # Query didn't return results even before client-side filtering, and
        # caller says we can do follow-up queries to determine why.
        # Start by seeing if there are _any_ dimension records for each element
        # involved.
        # Imported here rather than at module scope (the module only imports
        # it under TYPE_CHECKING), and only once it is actually needed.
        from ._builder import QueryBuilder
        for element in self.graph.elements:
            summary = QuerySummary(element.graph)
            builder = QueryBuilder(summary, self.managers)
            followup_query = builder.finish()
            if not followup_query.any(db, exact=False):
                yield f"No dimension records for element '{element.name}' found."
                yield from followup_query.explain_no_results(db, region=region, followup=False)
                return

    @abstractmethod
    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        """Return the columns for the datasets returned by this query.

        Returns
        -------
        columns : `DatasetQueryColumns` or `None`
            Struct containing SQLAlchemy representations of the result columns
            for a dataset.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        """A SQLAlchemy object representing the full query
        (`sqlalchemy.sql.FromClause` or `None`).

        This is `None` in the special case where the query has no columns, and
        only one logical row.
        """
        raise NotImplementedError()

    def rows(self, db: Database, *, region: Optional[Region] = None
             ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        """Execute the query and yield result rows, dropping rows whose
        regions fail the client-side overlap checks.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.

        Yields
        ------
        row : `sqlalchemy.engine.RowProxy` or `None`
            Result row from the query.  `None` may be yielded exactly once
            instead of any real rows to indicate an empty query (see
            `EmptyQuery`).

        Notes
        -----
        The per-instance counts of rows filtered out are reset every time
        this generator starts executing; `explain_no_results` reports them.
        """
        if self._doomed_by:
            return
        whereRegion = region if region is not None else self.whereRegion
        self._filtered_by_where = 0
        self._filtered_by_join = 0
        for row in db.query(self.sql):
            rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial]
            # Drop rows with any region that misses the WHERE-clause region.
            if whereRegion and any(r.isDisjointFrom(whereRegion) for r in rowRegions):
                self._filtered_by_where += 1
                continue
            # Drop rows in which any two of the row's own regions do not
            # overlap each other.
            if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
                self._filtered_by_join += 1
                continue
            yield row

    def extractDimensionsTuple(self, row: Optional[sqlalchemy.engine.RowProxy],
                               dimensions: Iterable[Dimension]) -> tuple:
        """Extract a tuple of data ID values from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        dimensions : `Iterable` [ `Dimension` ]
            The dimensions to include in the returned tuple, in order.

        Returns
        -------
        values : `tuple`
            A tuple of dimension primary key values.
        """
        if row is None:
            assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
            return ()
        return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions)

    def extractDataId(self, row: Optional[sqlalchemy.engine.RowProxy], *,
                      graph: Optional[DimensionGraph] = None,
                      records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
                      ) -> DataCoordinate:
        """Extract a data ID from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        graph : `DimensionGraph`, optional
            The dimensions the returned data ID should identify.  If not
            provided, ``self.graph`` is used.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Nested mapping containing records to attach to the returned
            `DataCoordinate`, for which `~DataCoordinate.hasRecords` will
            return `True`.  If provided, outer keys must include all dimension
            element names in ``graph``, and inner keys should be tuples of
            dimension primary key values in the same order as
            ``element.graph.required``.  If not provided,
            `DataCoordinate.hasRecords` will return `False` on the returned
            object.

        Returns
        -------
        dataId : `DataCoordinate`
            A data ID that identifies all required and implied dimensions.  If
            ``records is not None``, this will have
            `~DataCoordinate.hasRecords()` return `True`.
        """
        if graph is None:
            graph = self.graph
        if not graph:
            return DataCoordinate.makeEmpty(self.graph.universe)
        dataId = DataCoordinate.fromFullValues(
            graph,
            self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied))
        )
        if records is not None:
            recordsForRow = {}
            for element in graph.elements:
                key = tuple(dataId.subset(element.graph).values())
                # ``get`` stores `None` when there is no record for this key.
                recordsForRow[element.name] = records[element.name].get(key)
            return dataId.expanded(recordsForRow)
        else:
            return dataId

    def extractDatasetRef(self, row: sqlalchemy.engine.RowProxy,
                          dataId: Optional[DataCoordinate] = None,
                          records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
                          ) -> DatasetRef:
        """Extract a `DatasetRef` from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy`
            A result row from a SQLAlchemy SELECT query.
        dataId : `DataCoordinate`
            Data ID to attach to the `DatasetRef`.  If `None`, one is
            constructed from ``row`` via `extractDataId`.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Records forwarded to `extractDataId` when ``dataId`` is `None`.
            If provided, outer keys must include all dimension element names
            in the dataset type's dimensions, and inner keys should be tuples
            of dimension primary key values in the same order as
            ``element.graph.required``.

        Returns
        -------
        ref : `DatasetRef`
            Reference to the dataset; guaranteed to have `DatasetRef.id` not
            `None`.
        """
        datasetColumns = self.getDatasetColumns()
        assert datasetColumns is not None
        if dataId is None:
            dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records)
        runRecord = self.managers.collections[row._mapping[datasetColumns.runKey]]
        return DatasetRef(datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id],
                          run=runRecord.name)

    def _makeSubsetQueryColumns(self, *, graph: Optional[DimensionGraph] = None,
                                datasets: bool = True,
                                unique: bool = False) -> Tuple[DimensionGraph, Optional[QueryColumns]]:
        """Helper method for subclass implementations of `subset`.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            ``subset`` implementations should generally just forward their
            own ``graph`` argument here.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        graph : `DimensionGraph`
            The dimensions of the new `Query`.  This is exactly the same as
            the argument of the same name, with ``self.graph`` used if that
            argument is `None`.
        columns : `QueryColumns` or `None`
            A struct containing the SQLAlchemy column objects to use in the
            new query, constructed by delegating to other (mostly abstract)
            methods on ``self``.  If `None`, `subset` may return ``self``.
        """
        if graph is None:
            graph = self.graph
        # Nothing was actually changed by the request: same dimensions, no
        # dataset columns to drop, and no new uniqueness requirement.
        if (graph == self.graph and (self.getDatasetColumns() is None or datasets)
                and (self.isUnique() or not unique)):
            return graph, None
        columns = QueryColumns()
        for dimension in graph.dimensions:
            col = self.getDimensionColumn(dimension.name)
            columns.keys[dimension] = [col]
        # Region columns are only carried over when uniqueness is not
        # requested.
        if not unique:
            for element in self.spatial:
                col = self.getRegionColumn(element.name)
                columns.regions[element] = col
        if datasets and self.getDatasetColumns() is not None:
            columns.datasets = self.getDatasetColumns()
        return graph, columns

    @abstractmethod
    def materialize(self, db: Database) -> ContextManager[Query]:
        """Execute this query and insert its results into a temporary table.

        Parameters
        ----------
        db : `Database`
            Database engine to execute the query against.

        Returns
        -------
        context : `typing.ContextManager` [ `MaterializedQuery` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a `MaterializedQuery` object
            backed by that table), and dropped in ``__exit__``.  If ``self``
            is already a `MaterializedQuery`, ``__enter__`` may just return
            ``self`` and ``__exit__`` may do nothing (reflecting the fact that
            an outer context manager should already take care of everything
            else).
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(self, *, graph: Optional[DimensionGraph] = None,
               datasets: bool = True,
               unique: bool = False) -> Query:
        """Return a new `Query` whose columns and/or rows are (mostly) subset
        of this one's.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            If `None` (default), ``self.graph`` is used.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        query : `Query`
            A query object corresponding to the given inputs.  May be ``self``
            if no changes were requested.

        Notes
        -----
        The way spatial overlaps are handled at present makes it impossible to
        fully guarantee in general that the new query's rows are a subset of
        this one's while also returning unique rows.  That's because the
        database is only capable of performing approximate, conservative
        overlaps via the common skypix system; we defer actual region overlap
        operations to per-result-row Python logic.  But including the region
        columns necessary to do that postprocessing in the query makes it
        impossible to do a SELECT DISTINCT on the user-visible dimensions of
        the query.  For example, consider starting with a query with dimensions
        (instrument, skymap, visit, tract).  That involves a spatial join
        between visit and tract, and we include the region columns from both
        tables in the results in order to only actually yield result rows
        (see `rows`) where the regions in those two columns overlap.  If the
        user then wants to subset to just (skymap, tract) with unique results,
        we have two unpalatable options:

        - we can do a SELECT DISTINCT with just the skymap and tract columns
          in the SELECT clause, dropping all detailed overlap information and
          including some tracts that did not actually overlap any of the
          visits in the original query (but were regarded as _possibly_
          overlapping via the coarser, common-skypix relationships);

        - we can include the tract and visit region columns in the query, and
          continue to filter out the non-overlapping pairs, but completely
          disregard the user's request for unique tracts.

        This interface specifies that implementations must do the former, as
        that's what makes things efficient in our most important use case
        (``QuantumGraph`` generation in ``pipe_base``).  We may be able to
        improve this situation in the future by putting exact overlap
        information in the database, either by using built-in (but
        engine-specific) spatial database functionality or (more likely)
        switching to a scheme in which pairwise dimension spatial relationships
        are explicitly precomputed (for e.g. combinations of instruments and
        skymaps).
        """
        raise NotImplementedError()

    @abstractmethod
    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        """Return a `QueryBuilder` that can be used to construct a new `Query`
        that is joined to (and hence constrained by) this one.

        Parameters
        ----------
        summary : `QuerySummary`, optional
            A `QuerySummary` instance that specifies the dimensions and any
            additional constraints to include in the new query being
            constructed, or `None` to use the dimensions of ``self`` with no
            additional constraints.
        """
        raise NotImplementedError()

    graph: DimensionGraph
    """The dimensions identified by this query and included in any data IDs
    created from its result rows (`DimensionGraph`).
    """

    whereRegion: Optional[Region]
    """A spatial region that all regions in all rows returned by this query
    must overlap (`lsst.sphgeom.Region` or `None`).
    """

    managers: RegistryManagers
    """A struct containing `Registry` helper object (`RegistryManagers`).
    """
class DirectQueryUniqueness(enum.Enum):
    """Enumeration of the ways in which a query can (or cannot) have unique
    result rows.
    """

    #: The query is not expected to have unique rows.
    NOT_UNIQUE = enum.auto()

    #: The construction of the query guarantees that it will have unique
    #: result rows, even without SELECT DISTINCT or a GROUP BY clause.
    NATURALLY_UNIQUE = enum.auto()

    #: The query is expected to yield unique result rows, and needs to use
    #: SELECT DISTINCT or an equivalent GROUP BY clause to achieve this.
    NEEDS_DISTINCT = enum.auto()
747class DirectQuery(Query):
748 """A `Query` implementation that represents a direct SELECT query that
749 usually joins many tables.
751 `DirectQuery` objects should generally only be constructed by
752 `QueryBuilder` or the methods of other `Query` objects.
754 Parameters
755 ----------
756 simpleQuery : `SimpleQuery`
757 Struct representing the actual SELECT, FROM, and WHERE clauses.
758 columns : `QueryColumns`
759 Columns that are referenced in the query in any clause.
760 uniqueness : `DirectQueryUniqueness`
761 Enum value indicating whether the query should yield unique result
762 rows, and if so whether that needs to be explicitly requested of the
763 database.
764 graph : `DimensionGraph`
765 Object describing the dimensions included in the query.
766 whereRegion : `lsst.sphgeom.Region`, optional
767 Region that all region columns in all returned rows must overlap.
768 managers : `RegistryManagers`
769 Struct containing the `Registry` manager helper objects, to be
770 forwarded to the `Query` constructor.
771 doomed_by : `Iterable` [ `str` ], optional
772 A list of messages (appropriate for e.g. logging or exceptions) that
773 explain why the query is known to return no results even before it is
774 executed. Queries with a non-empty list will never be executed.
775 """
def __init__(self, *,
             simpleQuery: SimpleQuery,
             columns: QueryColumns,
             uniqueness: DirectQueryUniqueness,
             graph: DimensionGraph,
             whereRegion: Optional[Region],
             managers: RegistryManagers,
             order_by_columns: Iterable[OrderByColumn] = (),
             limit: Optional[Tuple[int, Optional[int]]] = None,
             doomed_by: Iterable[str] = ()):
    # ``order_by_columns`` lists the columns for the ORDER BY clause.
    # ``limit`` is a ``(limit, offset)`` pair applied to the generated SQL;
    # the offset may be `None`.
    super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
    assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
    assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
    self._simpleQuery = simpleQuery
    self._columns = columns
    self._uniqueness = uniqueness
    # Stored as a tuple because the ordering columns are iterated more than
    # once and also tested for truthiness; a one-shot iterable would be
    # exhausted after the first pass and would always test true.
    self._order_by_columns = tuple(order_by_columns)
    self._limit = limit
    # Caches of labeled columns, filled lazily by the get*Column(s) methods.
    self._datasetQueryColumns: Optional[DatasetQueryColumns] = None
    self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
    self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
def isUnique(self) -> bool:
    # Docstring inherited from Query.
    # Both NATURALLY_UNIQUE and NEEDS_DISTINCT count as unique.
    if self._uniqueness is DirectQueryUniqueness.NOT_UNIQUE:
        return False
    return True
def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cache = self._dimensionColumns
    if cache.get(name) is None:
        # First request for this dimension: label the key column with the
        # dimension name and remember it so later calls return the same
        # object.
        cache[name] = self._columns.getKeyColumn(name).label(name)
    return cache[name]
@property
def spatial(self) -> Iterator[DimensionElement]:
    # Docstring inherited from Query.
    # A fresh iterator over the region columns' keys on every access.
    region_columns = self._columns.regions
    return iter(region_columns)
def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cache = self._regionColumns
    if cache.get(name) is None:
        # Label the element's region column "<name>_region" and cache it so
        # repeated calls hand back the same column object.
        region = self._columns.regions[name]
        cache[name] = region.column.label(f"{name}_region")
    return cache[name]
def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
    # Docstring inherited from Query.
    cached = self._datasetQueryColumns
    if cached is not None:
        return cached
    base = self._columns.datasets
    if base is None:
        # No dataset results in this query; nothing to cache.
        return None
    # The ingest-date column is optional; label it only when present.
    ingestDate = base.ingestDate
    if ingestDate is not None:
        ingestDate = ingestDate.label("ingest_date")
    runKeyName = self.managers.collections.getRunForeignKeyName()
    self._datasetQueryColumns = DatasetQueryColumns(
        datasetType=base.datasetType,
        id=base.id.label("dataset_id"),
        runKey=base.runKey.label(runKeyName),
        ingestDate=ingestDate,
    )
    return self._datasetQueryColumns
@property
def sql(self) -> sqlalchemy.sql.FromClause:
    # Docstring inherited from Query.
    # Work on a copy: the stored SimpleQuery never carries SELECT columns.
    query = self._simpleQuery.copy()
    query.columns.extend(self.getDimensionColumn(dimension.name) for dimension in self.graph)
    query.columns.extend(self.getRegionColumn(element.name) for element in self.spatial)
    datasetColumns = self.getDatasetColumns()
    if datasetColumns is not None:
        query.columns.extend(datasetColumns)

    ordering = self._order_by_columns
    if ordering:
        # ORDER BY columns that are not otherwise selected must be added to
        # the SELECT list before the statement is assembled.
        query.columns.extend(entry.column for entry in ordering if entry.add_to_select)
    statement = query.combine()
    if ordering:
        statement = statement.order_by(*[entry.column_order for entry in ordering])

    # ``_limit`` is a (limit, offset) pair; the offset is optional.
    if self._limit:
        statement = statement.limit(self._limit[0])
        if self._limit[1] is not None:
            statement = statement.offset(self._limit[1])

    needs_distinct = self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT
    return statement.distinct() if needs_distinct else statement
def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
    """Build the specification of a table able to hold this query's rows.

    Intended for use by implementations of `materialize`.

    Parameters
    ----------
    constraints : `bool`, optional
        If `True` (`False` is default), define a specification that
        includes actual foreign key constraints for logical foreign keys.
        Some database engines do not permit temporary tables to reference
        normal tables, so this should be `False` when generating a spec
        for a temporary table unless the database engine is known to
        support them.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for a table that could hold this query's result rows.
    """
    is_unique = self.isUnique()
    table_spec = ddl.TableSpec(fields=())
    # Dimension key columns; they are flagged as primary key when the rows
    # are unique.
    for dimension in self.graph:
        addDimensionForeignKey(table_spec, dimension, primaryKey=is_unique, constraint=constraints)
    # One nullable "<element>_region" field set per spatial element.
    for element in self.spatial:
        region_fields = SpatialRegionDatabaseRepresentation.makeFieldSpecs(
            nullable=True,
            name=f"{element.name}_region",
        )
        table_spec.fields.update(region_fields)
    if self.getDatasetColumns() is not None:
        managers = self.managers
        managers.datasets.addDatasetForeignKey(table_spec, primaryKey=is_unique, constraint=constraints)
        managers.collections.addRunForeignKey(table_spec, nullable=False, constraint=constraints)

    # ORDER BY may need a few extra columns that are not otherwise selected.
    extra_fields = (entry.field_spec for entry in self._order_by_columns if entry.field_spec is not None)
    table_spec.fields.update(extra_fields)

    return table_spec
@contextmanager
def materialize(self, db: Database) -> Iterator[Query]:
    # Docstring inherited from Query.
    spec = self._makeTableSpec()
    with db.session() as session:
        table = session.makeTemporaryTable(spec)
        # Everything after table creation is guarded so that the temporary
        # table is dropped even when the insert fails or the caller's
        # ``with`` body raises (the exception is re-raised at the ``yield``).
        try:
            if not self._doomed_by:
                # A doomed query is never executed; its table stays empty.
                db.insert(table, select=self.sql, names=spec.fields.names)
            # Re-express the ORDER BY terms against the temporary table.
            order_by_columns = [column.materialized(table) for column in self._order_by_columns]
            yield MaterializedQuery(table=table,
                                    spatial=self.spatial,
                                    datasetType=self.datasetType,
                                    isUnique=self.isUnique(),
                                    graph=self.graph,
                                    whereRegion=self.whereRegion,
                                    managers=self.managers,
                                    doomed_by=self._doomed_by,
                                    order_by_columns=order_by_columns)
        finally:
            session.dropTemporaryTable(table)
def subset(self, *, graph: Optional[DimensionGraph] = None,
           datasets: bool = True,
           unique: bool = False) -> Query:
    # Docstring inherited from Query.
    graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
    if columns is None:
        # No new query is needed; this one is returned unchanged.
        return self
    if columns.isEmpty():
        # Forward the doomed-by messages: an EmptyQuery with none yields a
        # row, which would make a doomed query appear to have results.
        return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
    return DirectQuery(
        simpleQuery=self._simpleQuery.copy(),
        columns=columns,
        uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
        graph=graph,
        # The region constraint is not carried over to a unique subset.
        whereRegion=self.whereRegion if not unique else None,
        managers=self.managers,
        doomed_by=self._doomed_by,
    )
def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
    # Docstring inherited from Query.
    # Imported here rather than at module scope.
    from ._builder import QueryBuilder

    if summary is None:
        # Default to a summary over exactly this query's dimensions and
        # region constraint.
        summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
    withinGraph = summary.requested.issubset(self.graph)
    if not withinGraph:
        raise NotImplementedError(
            f"Query.makeBuilder does not yet support augmenting dimensions "
            f"({summary.requested.dimensions}) beyond those originally included in the query "
            f"({self.graph.dimensions})."
        )
    result = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
    # This query's SELECT is joined into the new builder as an aliased
    # subquery.
    result.joinTable(
        self.sql.alias(),
        dimensions=self.graph.dimensions,
        datasets=self.getDatasetColumns(),
    )
    return result
class MaterializedQuery(Query):
    """A `Query` implementation that represents query results saved in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType` or `None`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid (which
        may reject some rows that are in the table).
    managers : `RegistryManagers`
        A struct containing `Registry` manager helper objects, forwarded to
        the `Query` constructor.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    order_by_columns : `Iterable` [ `OrderByColumn` ], optional
        Columns used to build the ORDER BY clause when selecting from the
        temporary table.
    """
    def __init__(self, *,
                 table: sqlalchemy.schema.Table,
                 spatial: Iterable[DimensionElement],
                 datasetType: Optional[DatasetType],
                 isUnique: bool,
                 graph: DimensionGraph,
                 whereRegion: Optional[Region],
                 managers: RegistryManagers,
                 doomed_by: Iterable[str] = (),
                 order_by_columns: Iterable[OrderByColumn] = ()):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers,
                         doomed_by=doomed_by)
        self._table = table
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique
        # Stored as a tuple because `sql` truth-tests and re-iterates these
        # columns on every access; a one-shot iterator would always be
        # truthy and would be exhausted after the first use.
        self._order_by_columns = tuple(order_by_columns)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def order_by(self, *args: str) -> Query:
        # Docstring inherited from Query.
        raise NotImplementedError("MaterializedQuery.order_by should not be called directly")

    def limit(self, limit: int, offset: Optional[int] = None) -> Query:
        # Docstring inherited from Query.

        # Calling limit on materialized data is likely an error, limit should
        # be set before materializing.
        raise NotImplementedError("MaterializedQuery.limit should not be called directly")

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is None:
            return None
        # No ingest date column is reported for materialized results.
        return DatasetQueryColumns(
            datasetType=self._datasetType,
            id=self._table.columns["dataset_id"],
            runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()],
            ingestDate=None,
        )

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        select = self._table.select()
        if self._order_by_columns:
            order_by_columns = [column.column_order for column in self._order_by_columns]
            select = select.order_by(*order_by_columns)
        return select

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already materialized; no new table is created.
        yield self

    def subset(self, *, graph: Optional[DimensionGraph] = None,
               datasets: bool = True,
               unique: bool = False) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            return self
        if columns.isEmpty():
            # Forward the doomed-by messages: an EmptyQuery with none yields
            # a row, which would make a doomed query appear to have results.
            return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
        # The subset is a new direct query that selects from the temporary
        # table.
        simpleQuery = SimpleQuery()
        simpleQuery.join(self._table)
        return DirectQuery(
            simpleQuery=simpleQuery,
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            whereRegion=self.whereRegion if not unique else None,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder
        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # The temporary table itself is joined; no subquery is needed.
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder
class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the
    query would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the null set is extracted.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """
    def __init__(
        self,
        universe: DimensionUniverse,
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        # The query is always over the universe's empty dimension graph and
        # has no region constraint.
        super().__init__(
            graph=universe.empty,
            whereRegion=None,
            managers=managers,
            doomed_by=doomed_by,
        )

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No dimension {name} in query (no dimensions at all, actually).")

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(())

    def order_by(self, *args: str) -> Query:
        # Docstring inherited from Query.
        # The arguments are ignored and the query is returned unchanged.
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> Query:
        # Docstring inherited from Query.
        # The arguments are ignored and the query is returned unchanged.
        return self

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No region for {name} in query (no regions at all, actually).")

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(self, db: Database, *, region: Optional[Region] = None
             ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # Yields nothing when the query is doomed, and a single `None`
        # otherwise; ``db`` and ``region`` are not used.
        if self._doomed_by:
            return
        yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # No table is created; the query itself is handed back.
        yield self

    def subset(self, *, graph: Optional[DimensionGraph] = None,
               datasets: bool = True,
               unique: bool = False) -> Query:
        # Docstring inherited from Query.
        assert graph is None or graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph)
        withinGraph = summary.requested.issubset(self.graph)
        if not withinGraph:
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        # Nothing is joined into the builder: this query has no SQL form.
        return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)