Coverage for python/lsst/daf/butler/registry/queries/_query.py: 22%
365 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-02 18:18 -0700
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-02 18:18 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("Query",)
25import dataclasses
26import enum
27import itertools
28from abc import ABC, abstractmethod
29from contextlib import contextmanager
30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple
32import sqlalchemy
33from lsst.sphgeom import Region
35from ...core import (
36 DataCoordinate,
37 DatasetRef,
38 DatasetType,
39 Dimension,
40 DimensionElement,
41 DimensionGraph,
42 DimensionRecord,
43 DimensionUniverse,
44 SimpleQuery,
45 SpatialRegionDatabaseRepresentation,
46 addDimensionForeignKey,
47 ddl,
48)
49from ..interfaces import Database
50from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers
52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true
53 from ._builder import QueryBuilder
@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """A single term of an ORDER BY clause: a column plus a direction."""

    column: sqlalchemy.sql.ColumnElement
    """Column expression to sort on (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """`True` to sort ascending, `False` to sort descending (`bool`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """The column wrapped in its sort direction, ready to be passed to
        ORDER BY (`sqlalchemy.sql.ColumnElement`).
        """
        if self.ordering:
            return self.column.asc()
        return self.column.desc()
class Query(ABC):
    """An abstract base class for queries that return some combination of
    `DatasetRef` and `DataCoordinate` objects.

    Parameters
    ----------
    graph : `DimensionGraph`
        Object describing the dimensions included in the query.
    whereRegion : `lsst.sphgeom.Region`, optional
        Region that all region columns in all returned rows must overlap.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.

    Notes
    -----
    The `Query` hierarchy abstracts over the database/SQL representation of a
    particular set of data IDs or datasets.  It is expected to be used as a
    backend for other objects that provide more natural interfaces for one or
    both of these, not as part of a public interface to query results.
    """

    def __init__(
        self,
        *,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        self.graph = graph
        self.whereRegion = whereRegion
        self.managers = managers
        # Freeze the messages so they can be tested for truthiness and
        # iterated any number of times.
        self._doomed_by = tuple(doomed_by)
        # Counters of rows dropped by client-side spatial filtering; `None`
        # until `rows` has actually run.
        self._filtered_by_join: Optional[int] = None
        self._filtered_by_where: Optional[int] = None

    @abstractmethod
    def isUnique(self) -> bool:
        """Return `True` if this query's rows are guaranteed to be unique, and
        `False` otherwise.

        If this query has dataset results (`datasetType` is not `None`),
        uniqueness applies to the `DatasetRef` instances returned by
        `extractDatasetRef` from the result of `rows`.  If it does not have
        dataset results, uniqueness applies to the `DataCoordinate` instances
        returned by `extractDataId`.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return the query column that contains the primary key value for
        the dimension with the given name.

        Parameters
        ----------
        name : `str`
            Name of the dimension.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy object representing a column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def spatial(self) -> Iterator[DimensionElement]:
        """An iterator over the dimension element columns used in post-query
        filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]).

        Notes
        -----
        This property is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to access this property.
        """
        raise NotImplementedError()

    @abstractmethod
    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return a region column for one of the dimension elements iterated
        over by `spatial`.

        Parameters
        ----------
        name : `str`
            Name of the element.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy object representing a result column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    def datasetType(self) -> Optional[DatasetType]:
        """The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results (`DatasetType` or `None`).
        """
        cols = self.getDatasetColumns()
        if cols is None:
            return None
        return cols.datasetType

    def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            counted.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        if self._doomed_by:
            return 0
        sql = self.sql
        if sql is None:
            # No columns at all: exactly one logical (empty) row.
            return 1
        # NOTE(review): `spatial` is declared to return an iterator, and a
        # plain iterator object is truthy even when it has no items, so this
        # branch may also be taken for queries without region columns --
        # confirm whether that is intended before relying on the SQL COUNT
        # path for ``exact=True``.
        if exact and self.spatial:
            return sum(1 for _ in self.rows(db, region=region))
        # Columns are passed positionally: the list form of
        # ``with_only_columns`` is the deprecated legacy calling style.
        count_sql = sql.with_only_columns(sqlalchemy.sql.func.count()).order_by(None)
        with db.query(count_sql) as sql_result:
            return sql_result.scalar()

    def any(
        self,
        db: Database,
        *,
        region: Optional[Region] = None,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            considered.  If not provided, this will be ``self.whereRegion``,
            if that exists.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.

        Raises
        ------
        TypeError
            Raised if ``exact=True`` is combined with ``execute=False`` for a
            query that is neither doomed nor column-less.
        """
        if self._doomed_by:
            return False
        sql = self.sql
        if sql is None:
            return True
        if exact and not execute:
            raise TypeError("Cannot obtain exact results without executing the query.")
        # NOTE(review): same iterator-truthiness caveat as in `count`.
        if exact and self.spatial:
            for _ in self.rows(db, region=region):
                return True
            return False
        elif execute:
            with db.query(sql.limit(1)) as sql_result:
                return sql_result.one_or_none() is not None
        else:
            return True

    def explain_no_results(
        self,
        db: Database,
        *,
        region: Optional[Region] = None,
        followup: bool = True,
    ) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        followup : `bool`, optional
            If `True` (default) perform inexpensive follow-up queries if no
            diagnostics are available from query generation alone.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if `rows`,
        `any`, or `count` was already called with the same region (with
        ``exact=True`` for the latter two).
        """
        # Imported here rather than at module scope; the module-level import
        # of QueryBuilder is guarded by TYPE_CHECKING.
        from ._builder import QueryBuilder

        if self._doomed_by:
            yield from self._doomed_by
            return
        if self._filtered_by_where:
            yield (
                f"{self._filtered_by_where} result rows were filtered out because "
                "one or more region did not overlap the WHERE-clause region."
            )
        if self._filtered_by_join:
            yield (
                f"{self._filtered_by_join} result rows were filtered out because "
                "one or more regions did not overlap."
            )
        if (not followup) or self._filtered_by_join or self._filtered_by_where:
            return
        # Query didn't return results even before client-side filtering, and
        # caller says we can do follow-up queries to determine why.
        # Start by seeing if there are _any_ dimension records for each element
        # involved.
        for element in self.graph.elements:
            summary = QuerySummary(element.graph)
            builder = QueryBuilder(summary, self.managers)
            followup_query = builder.finish()
            if not followup_query.any(db, exact=False):
                yield f"No dimension records for element '{element.name}' found."
                yield from followup_query.explain_no_results(db, region=region, followup=False)
                return

    @abstractmethod
    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        """Return the columns for the datasets returned by this query.

        Returns
        -------
        columns : `DatasetQueryColumns` or `None`
            Struct containing SQLAlchemy representations of the result columns
            for a dataset.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        """A SQLAlchemy object representing the full query
        (`sqlalchemy.sql.FromClause` or `None`).

        This is `None` in the special case where the query has no columns, and
        only one logical row.
        """
        raise NotImplementedError()

    def rows(
        self, db: Database, *, region: Optional[Region] = None
    ) -> Iterator[Optional[sqlalchemy.engine.Row]]:
        """Execute the query and yield result rows, dropping rows whose
        regions fail the client-side spatial overlap tests.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.

        Yields
        ------
        row : `sqlalchemy.engine.Row` or `None`
            Result row from the query.  `None` may be yielded exactly once
            instead of any real rows to indicate an empty query (see
            `EmptyQuery`).

        Notes
        -----
        Each call resets the filtered-row counters that
        `explain_no_results` reports.
        """
        if self._doomed_by:
            return
        whereRegion = region if region is not None else self.whereRegion
        self._filtered_by_where = 0
        self._filtered_by_join = 0
        # Fetch everything before yielding so the database context is not
        # held open while the caller consumes the generator.
        with db.query(self.sql) as sql_result:
            sql_rows = sql_result.fetchall()
        # The region columns are the same for every row; look them up once.
        regionColumns = [self.getRegionColumn(element.name) for element in self.spatial]
        for row in sql_rows:
            rowRegions = [row._mapping[column] for column in regionColumns]
            if whereRegion and any(r.isDisjointFrom(whereRegion) for r in rowRegions):
                self._filtered_by_where += 1
                continue
            # Every pair of regions in the row must overlap.
            if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
                self._filtered_by_join += 1
                continue
            yield row

    def extractDimensionsTuple(
        self, row: Optional[sqlalchemy.engine.Row], dimensions: Iterable[Dimension]
    ) -> tuple:
        """Extract a tuple of data ID values from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.Row` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        dimensions : `Iterable` [ `Dimension` ]
            The dimensions to include in the returned tuple, in order.

        Returns
        -------
        values : `tuple`
            A tuple of dimension primary key values.
        """
        if row is None:
            assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
            return ()
        return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions)

    def extractDataId(
        self,
        row: Optional[sqlalchemy.engine.Row],
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinate:
        """Extract a data ID from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.Row` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        graph : `DimensionGraph`, optional
            The dimensions the returned data ID should identify.  If not
            provided, ``self.graph`` is used.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Nested mapping containing records to attach to the returned
            `DataCoordinate`, for which `~DataCoordinate.hasRecords` will
            return `True`.  If provided, outer keys must include all dimension
            element names in ``graph``, and inner keys should be tuples of
            dimension primary key values in the same order as
            ``element.graph.required``.  If not provided,
            `DataCoordinate.hasRecords` will return `False` on the returned
            object.

        Returns
        -------
        dataId : `DataCoordinate`
            A data ID that identifies all required and implied dimensions.  If
            ``records is not None``, this will have
            `~DataCoordinate.hasRecords()` return `True`.
        """
        if graph is None:
            graph = self.graph
        if not graph:
            return DataCoordinate.makeEmpty(self.graph.universe)
        dataId = DataCoordinate.fromFullValues(
            graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied))
        )
        if records is not None:
            recordsForRow = {}
            for element in graph.elements:
                key = tuple(dataId.subset(element.graph).values())
                # A missing record is stored as `None` rather than raising.
                recordsForRow[element.name] = records[element.name].get(key)
            return dataId.expanded(recordsForRow)
        else:
            return dataId

    def extractDatasetRef(
        self,
        row: sqlalchemy.engine.Row,
        dataId: Optional[DataCoordinate] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DatasetRef:
        """Extract a `DatasetRef` from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.Row`
            A result row from a SQLAlchemy SELECT query.
        dataId : `DataCoordinate`
            Data ID to attach to the `DatasetRef`.  If `None`, one is
            constructed from ``row`` via `extractDataId`, using the dataset
            type's dimensions.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Records forwarded to `extractDataId` when ``dataId`` is `None`.
            If provided, outer keys must include all dimension element names
            in ``graph``, and inner keys should be tuples of dimension primary
            key values in the same order as ``element.graph.required``.

        Returns
        -------
        ref : `DatasetRef`
            Reference to the dataset; guaranteed to have `DatasetRef.id` not
            `None`.
        """
        datasetColumns = self.getDatasetColumns()
        assert datasetColumns is not None
        if dataId is None:
            dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records)
        runRecord = self.managers.collections[row._mapping[datasetColumns.runKey]]
        return DatasetRef(
            datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name
        )

    def _makeSubsetQueryColumns(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Tuple[DimensionGraph, Optional[QueryColumns]]:
        """Helper method for subclass implementations of `subset`.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            ``subset`` implementations should generally just forward their
            own ``graph`` argument here.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        graph : `DimensionGraph`
            The dimensions of the new `Query`.  This is exactly the same as
            the argument of the same name, with ``self.graph`` used if that
            argument is `None`.
        columns : `QueryColumns` or `None`
            A struct containing the SQLAlchemy column objects to use in the
            new query, constructed by delegating to other (mostly abstract)
            methods on ``self``.  If `None`, `subset` may return ``self``.
        """
        if graph is None:
            graph = self.graph
        if (
            graph == self.graph
            and (self.getDatasetColumns() is None or datasets)
            and (self.isUnique() or not unique)
        ):
            # Nothing would change; signal that ``self`` can be reused.
            return graph, None
        columns = QueryColumns()
        for dimension in graph.dimensions:
            col = self.getDimensionColumn(dimension.name)
            columns.keys[dimension] = [col]
        if not unique:
            # Region columns are carried over only for non-unique subsets
            # (see the Notes of `subset` for why).
            for element in self.spatial:
                col = self.getRegionColumn(element.name)
                columns.regions[element] = col
        if datasets and self.getDatasetColumns() is not None:
            columns.datasets = self.getDatasetColumns()
        return graph, columns

    @abstractmethod
    def materialize(self, db: Database) -> ContextManager[Query]:
        """Execute this query and insert its results into a temporary table.

        Parameters
        ----------
        db : `Database`
            Database engine to execute the query against.

        Returns
        -------
        context : `typing.ContextManager` [ `MaterializedQuery` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a `MaterializedQuery` object
            backed by that table), and dropped in ``__exit__``.  If ``self``
            is already a `MaterializedQuery`, ``__enter__`` may just return
            ``self`` and ``__exit__`` may do nothing (reflecting the fact that
            an outer context manager should already take care of everything
            else).
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        """Return a new `Query` whose columns and/or rows are (mostly) subset
        of this one's.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            If `None` (default), ``self.graph`` is used.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        query : `Query`
            A query object corresponding to the given inputs.  May be ``self``
            if no changes were requested.

        Notes
        -----
        The way spatial overlaps are handled at present makes it impossible to
        fully guarantee in general that the new query's rows are a subset of
        this one's while also returning unique rows.  That's because the
        database is only capable of performing approximate, conservative
        overlaps via the common skypix system; we defer actual region overlap
        operations to per-result-row Python logic.  But including the region
        columns necessary to do that postprocessing in the query makes it
        impossible to do a SELECT DISTINCT on the user-visible dimensions of
        the query.  For example, consider starting with a query with dimensions
        (instrument, skymap, visit, tract).  That involves a spatial join
        between visit and tract, and we include the region columns from both
        tables in the results in order to only actually yield result rows
        (see `rows`) where the regions in those two columns overlap.  If the
        user then wants to subset to just (skymap, tract) with unique results,
        we have two unpalatable options:

        - we can do a SELECT DISTINCT with just the skymap and tract columns
          in the SELECT clause, dropping all detailed overlap information and
          including some tracts that did not actually overlap any of the
          visits in the original query (but were regarded as _possibly_
          overlapping via the coarser, common-skypix relationships);

        - we can include the tract and visit region columns in the query, and
          continue to filter out the non-overlapping pairs, but completely
          disregard the user's request for unique tracts.

        This interface specifies that implementations must do the former, as
        that's what makes things efficient in our most important use case
        (``QuantumGraph`` generation in ``pipe_base``).  We may be able to
        improve this situation in the future by putting exact overlap
        information in the database, either by using built-in (but
        engine-specific) spatial database functionality or (more likely)
        switching to a scheme in which pairwise dimension spatial relationships
        are explicitly precomputed (for e.g. combinations of instruments and
        skymaps).
        """
        raise NotImplementedError()

    @abstractmethod
    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        """Return a `QueryBuilder` that can be used to construct a new `Query`
        that is joined to (and hence constrained by) this one.

        Parameters
        ----------
        summary : `QuerySummary`, optional
            A `QuerySummary` instance that specifies the dimensions and any
            additional constraints to include in the new query being
            constructed, or `None` to use the dimensions of ``self`` with no
            additional constraints.
        """
        raise NotImplementedError()

    graph: DimensionGraph
    """The dimensions identified by this query and included in any data IDs
    created from its result rows (`DimensionGraph`).
    """

    whereRegion: Optional[Region]
    """A spatial region that all regions in all rows returned by this query
    must overlap (`lsst.sphgeom.Region` or `None`).
    """

    managers: RegistryManagers
    """A struct containing `Registry` helper object (`RegistryManagers`).
    """
@enum.unique
class DirectQueryUniqueness(enum.Enum):
    """Enumeration of how (and whether) a query's result rows are unique."""

    NOT_UNIQUE = enum.auto()
    """Result rows are not expected to be unique."""

    NATURALLY_UNIQUE = enum.auto()
    """Result rows are unique by construction of the query; no
    SELECT DISTINCT or GROUP BY clause is required.
    """

    NEEDS_DISTINCT = enum.auto()
    """Result rows are expected to be unique, but only if the query uses
    SELECT DISTINCT or an equivalent GROUP BY clause.
    """
722class DirectQuery(Query):
723 """A `Query` implementation that represents a direct SELECT query that
724 usually joins many tables.
726 `DirectQuery` objects should generally only be constructed by
727 `QueryBuilder` or the methods of other `Query` objects.
729 Parameters
730 ----------
731 simpleQuery : `SimpleQuery`
732 Struct representing the actual SELECT, FROM, and WHERE clauses.
733 columns : `QueryColumns`
734 Columns that are referenced in the query in any clause.
735 uniqueness : `DirectQueryUniqueness`
736 Enum value indicating whether the query should yield unique result
737 rows, and if so whether that needs to be explicitly requested of the
738 database.
739 graph : `DimensionGraph`
740 Object describing the dimensions included in the query.
741 whereRegion : `lsst.sphgeom.Region`, optional
742 Region that all region columns in all returned rows must overlap.
743 managers : `RegistryManagers`
744 Struct containing the `Registry` manager helper objects, to be
745 forwarded to the `Query` constructor.
746 doomed_by : `Iterable` [ `str` ], optional
747 A list of messages (appropriate for e.g. logging or exceptions) that
748 explain why the query is known to return no results even before it is
749 executed. Queries with a non-empty list will never be executed.
750 """
def __init__(
    self,
    *,
    simpleQuery: SimpleQuery,
    columns: QueryColumns,
    uniqueness: DirectQueryUniqueness,
    graph: DimensionGraph,
    whereRegion: Optional[Region],
    managers: RegistryManagers,
    order_by_columns: Iterable[OrderByColumn] = (),
    limit: Optional[Tuple[int, Optional[int]]] = None,
    doomed_by: Iterable[str] = (),
):
    """Construct a query from its SELECT/FROM/WHERE struct and columns.

    Parameters
    ----------
    simpleQuery : `SimpleQuery`
        Struct representing the query clauses; must not have any columns
        set yet (they are added to a copy in `sql`).
    columns : `QueryColumns`
        Columns referenced by the query; must not be empty.
    uniqueness : `DirectQueryUniqueness`
        Whether result rows are unique, and whether that must be requested
        from the database.
    graph : `DimensionGraph`
        Dimensions included in the query; forwarded to `Query`.
    whereRegion : `lsst.sphgeom.Region` or `None`
        Forwarded to `Query`.
    managers : `RegistryManagers`
        Forwarded to `Query`.
    order_by_columns : `Iterable` [ `OrderByColumn` ], optional
        Columns to order results by.  Any iterable (including a one-shot
        generator) is accepted; it is stored as a tuple.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Value assigned unchanged to the ``limit`` attribute of the
        `SimpleQuery` copy built in `sql`.
    doomed_by : `Iterable` [ `str` ], optional
        Forwarded to `Query`.
    """
    super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
    assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
    assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
    self._simpleQuery = simpleQuery
    self._columns = columns
    self._uniqueness = uniqueness
    # Stored as a tuple because this attribute is truth-tested and
    # re-iterated on every `sql` access; a generator would always be truthy
    # and would be exhausted after its first use.
    self._order_by_columns: Tuple[OrderByColumn, ...] = tuple(order_by_columns)
    self._limit = limit
    # Lazily-populated caches of labeled columns, filled by the getters.
    self._datasetQueryColumns: Optional[DatasetQueryColumns] = None
    self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
    self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
def isUnique(self) -> bool:
    # Docstring inherited from Query.
    # Both NATURALLY_UNIQUE and NEEDS_DISTINCT count as unique; only
    # NOT_UNIQUE does not.
    if self._uniqueness is DirectQueryUniqueness.NOT_UNIQUE:
        return False
    return True
def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cached = self._dimensionColumns.get(name)
    if cached is not None:
        return cached
    # First request for this dimension: label the key column with the
    # dimension name and remember it so later calls return the same object.
    labeled = self._columns.getKeyColumn(name).label(name)
    self._dimensionColumns[name] = labeled
    return labeled
789 @property
790 def spatial(self) -> Iterator[DimensionElement]:
791 # Docstring inherited from Query.
792 return iter(self._columns.regions)
def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cached = self._regionColumns.get(name)
    if cached is not None:
        return cached
    # First request for this element: label its region column as
    # "<name>_region" and remember it for later calls.
    labeled = self._columns.regions[name].column.label(f"{name}_region")
    self._regionColumns[name] = labeled
    return labeled
def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
    # Docstring inherited from Query.
    cached = self._datasetQueryColumns
    if cached is not None:
        return cached
    source = self._columns.datasets
    if source is None:
        # No dataset results in this query; nothing is cached.
        return None
    # The ingest-date column is optional; label it only when present.
    if source.ingestDate is None:
        labeled_ingest_date = None
    else:
        labeled_ingest_date = source.ingestDate.label("ingest_date")
    self._datasetQueryColumns = DatasetQueryColumns(
        datasetType=source.datasetType,
        id=source.id.label("dataset_id"),
        runKey=source.runKey.label(self.managers.collections.getRunForeignKeyName()),
        ingestDate=labeled_ingest_date,
    )
    return self._datasetQueryColumns
@property
def sql(self) -> sqlalchemy.sql.FromClause:
    # Docstring inherited from Query.
    # Columns are added to a copy so the stored SimpleQuery stays
    # column-free (the constructor asserts that it starts that way).
    query = self._simpleQuery.copy()
    # Column order: dimension keys, region columns, dataset columns, then
    # the optional ordering column.
    # NOTE(review): `materialize` inserts this SELECT using the field names
    # of `_makeTableSpec`, so the two orders presumably must agree --
    # confirm before reordering anything here.
    query.columns.extend(self.getDimensionColumn(dimension.name) for dimension in self.graph)
    query.columns.extend(self.getRegionColumn(element.name) for element in self.spatial)
    dataset_columns = self.getDatasetColumns()
    if dataset_columns is not None:
        query.columns.extend(dataset_columns)

    assert not query.order_by, "Input query cannot have ORDER BY"
    if self._order_by_columns:
        # Expose the requested ordering as a ROW_NUMBER() window column and
        # sort on that column.
        ordering = [item.column_order for item in self._order_by_columns]
        row_number = sqlalchemy.func.row_number().over(order_by=ordering).label("_orderby")
        query.columns.append(row_number)
        query.order_by = [row_number]

    assert query.limit is None, "Input query cannot have LIMIT"
    query.limit = self._limit

    combined = query.combine()
    needs_distinct = self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT
    return combined.distinct() if needs_distinct else combined
def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
    """Build a table specification able to hold this query's result rows.

    Intended as a helper for implementations of `materialize`.

    Parameters
    ----------
    constraints : `bool`, optional
        If `True` (`False` is default), define a specification that
        includes actual foreign key constraints for logical foreign keys.
        Some database engines do not permit temporary tables to reference
        normal tables, so this should be `False` when generating a spec
        for a temporary table unless the database engine is known to
        support them.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for a table that could hold this query's result rows.
    """
    # Key columns are declared as primary keys only when rows are unique.
    is_unique = self.isUnique()
    table_spec = ddl.TableSpec(fields=())
    for dimension in self.graph:
        addDimensionForeignKey(table_spec, dimension, primaryKey=is_unique, constraint=constraints)
    for element in self.spatial:
        region_fields = SpatialRegionDatabaseRepresentation.makeFieldSpecs(
            nullable=True,
            name=f"{element.name}_region",
        )
        table_spec.fields.update(region_fields)
    if self.getDatasetColumns() is not None:
        self.managers.datasets.addDatasetForeignKey(table_spec, primaryKey=is_unique, constraint=constraints)
        self.managers.collections.addRunForeignKey(table_spec, nullable=False, constraint=constraints)

    # Need a column for ORDER BY if ordering is requested
    if self._order_by_columns:
        orderby_field = ddl.FieldSpec(
            name="_orderby",
            dtype=sqlalchemy.BigInteger,
            nullable=False,
            doc="Column to use with ORDER BY",
        )
        table_spec.fields.add(orderby_field)

    return table_spec
@contextmanager
def materialize(self, db: Database) -> Iterator[Query]:
    # Docstring inherited from Query.
    table_spec = self._makeTableSpec()
    with db.temporary_table(table_spec) as temp_table:
        # When the query is already known to be doomed, nothing is
        # inserted and the temporary table stays empty.
        if not self._doomed_by:
            db.insert(temp_table, select=self.sql, names=table_spec.fields.names)
        materialized = MaterializedQuery(
            table=temp_table,
            spatial=self.spatial,
            datasetType=self.datasetType,
            isUnique=self.isUnique(),
            graph=self.graph,
            whereRegion=self.whereRegion,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )
        # The temporary table only lives for the duration of the ``with``
        # block, so the materialized query is yielded from inside it.
        yield materialized
def subset(
    self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
) -> Query:
    # Docstring inherited from Query.
    graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
    if columns is None:
        # No new column set was produced; hand back this query unchanged.
        return self
    if columns.isEmpty():
        # Forward ``doomed_by`` so that a query already known to return no
        # rows does not turn into an EmptyQuery that yields a row.
        return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
    return DirectQuery(
        simpleQuery=self._simpleQuery.copy(),
        columns=columns,
        uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
        graph=graph,
        # The region constraint is not carried over to unique subsets.
        whereRegion=self.whereRegion if not unique else None,
        managers=self.managers,
        doomed_by=self._doomed_by,
    )
def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
    # Docstring inherited from Query.
    # Imported locally rather than at module scope.
    from ._builder import QueryBuilder

    # Default summary: this query's own dimensions and region constraint.
    summary = QuerySummary(self.graph, whereRegion=self.whereRegion) if summary is None else summary

    requested = summary.requested
    if not requested.issubset(self.graph):
        raise NotImplementedError(
            "Query.makeBuilder does not yet support augmenting dimensions "
            f"({requested.dimensions}) beyond those originally included in the query "
            f"({self.graph.dimensions})."
        )

    new_builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
    # This query's SELECT is joined into the builder as an aliased subquery.
    subquery = self.sql.alias()
    new_builder.joinTable(subquery, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
    return new_builder
class MaterializedQuery(Query):
    """A `Query` implementation that represents query results saved in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType` or `None`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid (which
        may reject some rows that are in the table).
    managers : `RegistryManagers`
        A struct containing `Registry` manager helper objects, forwarded to
        the `Query` constructor.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        *,
        table: sqlalchemy.schema.Table,
        spatial: Iterable[DimensionElement],
        datasetType: Optional[DatasetType],
        isUnique: bool,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
        self._table = table
        # Stored as a tuple so the `spatial` property can hand out a fresh
        # iterator on every access, even if a one-shot iterator was given.
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is not None:
            return DatasetQueryColumns(
                datasetType=self._datasetType,
                id=self._table.columns["dataset_id"],
                runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()],
                # No ingest-date column is read from the table.
                ingestDate=None,
            )
        else:
            return None

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        select = self._table.select()
        # Order by the "_orderby" column whenever the table has one.
        if "_orderby" in self._table.columns:
            select = select.order_by(self._table.columns["_orderby"])
        return select

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already backed by a table; no new table is created.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            # No new column set was produced; hand back this query unchanged.
            return self
        if columns.isEmpty():
            # Forward ``doomed_by`` so that a query already known to return
            # no rows does not turn into an EmptyQuery that yields a row.
            return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
        simpleQuery = SimpleQuery()
        simpleQuery.join(self._table)
        return DirectQuery(
            simpleQuery=simpleQuery,
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            # The region constraint is not carried over to unique subsets.
            whereRegion=self.whereRegion if not unique else None,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # The table itself is joined in directly (no subquery alias).
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder
class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the query
    would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the null set is extracted.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        universe: DimensionUniverse,
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        # The query always spans the universe's empty dimension graph and
        # has no region constraint.
        super().__init__(graph=universe.empty, whereRegion=None, managers=managers, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No dimension {name} in query (no dimensions at all, actually)."
        raise KeyError(message)

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        no_elements: Tuple[DimensionElement, ...] = ()
        return iter(no_elements)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        message = f"No region for {name} in query (no regions at all, actually)."
        raise KeyError(message)

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(
        self, db: Database, *, region: Optional[Region] = None
    ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # A doomed query yields nothing; otherwise exactly one `None` row is
        # produced.  Neither ``db`` nor ``region`` is consulted.
        if self._doomed_by:
            return
        yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # No table is created; the query itself is yielded.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        assert graph is None or graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        # Default summary: just this query's (empty) dimension graph.
        summary = QuerySummary(self.graph) if summary is None else summary

        requested = summary.requested
        if not requested.issubset(self.graph):
            raise NotImplementedError(
                "Query.makeBuilder does not yet support augmenting dimensions "
                f"({requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)