Coverage for python/lsst/daf/butler/registry/queries/_query.py: 24%
363 statements
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-15 02:06 -0700
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-15 02:06 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("Query",)
25import dataclasses
26import enum
27import itertools
28from abc import ABC, abstractmethod
29from contextlib import contextmanager
30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple
32import sqlalchemy
33from lsst.sphgeom import Region
35from ...core import (
36 DataCoordinate,
37 DatasetRef,
38 DatasetType,
39 Dimension,
40 DimensionElement,
41 DimensionGraph,
42 DimensionRecord,
43 DimensionUniverse,
44 SimpleQuery,
45 SpatialRegionDatabaseRepresentation,
46 addDimensionForeignKey,
47 ddl,
48)
49from ..interfaces import Database
50from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers
52if TYPE_CHECKING:  # 52 ↛ 53: line 52 didn't jump to line 53, because the condition on line 52 was never true
53 from ._builder import QueryBuilder
@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """Information about a single column in an ORDER BY clause."""

    column: sqlalchemy.sql.ColumnElement
    """Column expression to sort by (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """True for ascending order, False for descending (`bool`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """Column element for use in ORDER BY clause
        (`sqlalchemy.sql.ColumnElement`).

        This is ``column.asc()`` when `ordering` is true and
        ``column.desc()`` otherwise.
        """
        return self.column.asc() if self.ordering else self.column.desc()
74class Query(ABC):
75 """An abstract base class for queries that return some combination of
76 `DatasetRef` and `DataCoordinate` objects.
78 Parameters
79 ----------
80 graph : `DimensionGraph`
81 Object describing the dimensions included in the query.
82 whereRegion : `lsst.sphgeom.Region`, optional
83 Region that all region columns in all returned rows must overlap.
84 managers : `RegistryManagers`
85 A struct containing the registry manager instances used by the query
86 system.
87 doomed_by : `Iterable` [ `str` ], optional
88 A list of messages (appropriate for e.g. logging or exceptions) that
89 explain why the query is known to return no results even before it is
90 executed. Queries with a non-empty list will never be executed.
92 Notes
93 -----
94 The `Query` hierarchy abstracts over the database/SQL representation of a
95 particular set of data IDs or datasets. It is expected to be used as a
96 backend for other objects that provide more natural interfaces for one or
97 both of these, not as part of a public interface to query results.
98 """
100 def __init__(
101 self,
102 *,
103 graph: DimensionGraph,
104 whereRegion: Optional[Region],
105 managers: RegistryManagers,
106 doomed_by: Iterable[str] = (),
107 ):
108 self.graph = graph
109 self.whereRegion = whereRegion
110 self.managers = managers
111 self._doomed_by = tuple(doomed_by)
112 self._filtered_by_join: Optional[int] = None
113 self._filtered_by_where: Optional[int] = None
115 @abstractmethod
116 def isUnique(self) -> bool:
117 """Return `True` if this query's rows are guaranteed to be unique, and
118 `False` otherwise.
120 If this query has dataset results (`datasetType` is not `None`),
121 uniqueness applies to the `DatasetRef` instances returned by
122 `extractDatasetRef` from the result of `rows`. If it does not have
123 dataset results, uniqueness applies to the `DataCoordinate` instances
124 returned by `extractDataId`.
125 """
126 raise NotImplementedError()
128 @abstractmethod
129 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
130 """Return the query column that contains the primary key value for
131 the dimension with the given name.
133 Parameters
134 ----------
135 name : `str`
136 Name of the dimension.
138 Returns
139 -------
140 column : `sqlalchemy.sql.ColumnElement`.
141 SQLAlchemy object representing a column in the query.
143 Notes
144 -----
145 This method is intended primarily as a hook for subclasses to implement
146 and the ABC to call in order to provide higher-level functionality;
147 code that uses `Query` objects (but does not implement one) should
148 usually not have to call this method.
149 """
150 raise NotImplementedError()
152 @property
153 @abstractmethod
154 def spatial(self) -> Iterator[DimensionElement]:
155 """An iterator over the dimension element columns used in post-query
156 filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]).
158 Notes
159 -----
160 This property is intended primarily as a hook for subclasses to
161 implement and the ABC to call in order to provide higher-level
162 functionality; code that uses `Query` objects (but does not implement
163 one) should usually not have to access this property.
164 """
165 raise NotImplementedError()
167 @abstractmethod
168 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
169 """Return a region column for one of the dimension elements iterated
170 over by `spatial`.
172 Parameters
173 ----------
174 name : `str`
175 Name of the element.
177 Returns
178 -------
179 column : `sqlalchemy.sql.ColumnElement`
180 SQLAlchemy representing a result column in the query.
182 Notes
183 -----
184 This method is intended primarily as a hook for subclasses to implement
185 and the ABC to call in order to provide higher-level functionality;
186 code that uses `Query` objects (but does not implement one) should
187 usually not have to call this method.
188 """
189 raise NotImplementedError()
191 @property
192 def datasetType(self) -> Optional[DatasetType]:
193 """The `DatasetType` of datasets returned by this query, or `None`
194 if there are no dataset results (`DatasetType` or `None`).
195 """
196 cols = self.getDatasetColumns()
197 if cols is None:
198 return None
199 return cols.datasetType
201 def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int:
202 """Count the number of rows this query would return.
204 Parameters
205 ----------
206 db : `Database`
207 Object managing the database connection.
208 region : `sphgeom.Region`, optional
209 A region that any result-row regions must overlap in order to be
210 yielded. If not provided, this will be ``self.whereRegion``, if
211 that exists.
212 exact : `bool`, optional
213 If `True`, run the full query and perform post-query filtering if
214 needed to account for that filtering in the count. If `False`, the
215 result may be an upper bound.
217 Returns
218 -------
219 count : `int`
220 The number of rows the query would return, or an upper bound if
221 ``exact=False``.
223 Notes
224 -----
225 This counts the number of rows returned, not the number of unique rows
226 returned, so even with ``exact=True`` it may provide only an upper
227 bound on the number of *deduplicated* result rows.
228 """
229 if self._doomed_by:
230 return 0
231 sql = self.sql
232 if sql is None:
233 return 1
234 if exact and self.spatial:
235 filtered_count = 0
236 for _ in self.rows(db, region=region):
237 filtered_count += 1
238 return filtered_count
239 else:
240 return db.query(sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)).scalar()
242 def any(
243 self,
244 db: Database,
245 *,
246 region: Optional[Region] = None,
247 execute: bool = True,
248 exact: bool = True,
249 ) -> bool:
250 """Test whether this query returns any results.
252 Parameters
253 ----------
254 db : `Database`
255 Object managing the database connection.
256 region : `sphgeom.Region`, optional
257 A region that any result-row regions must overlap in order to be
258 yielded. If not provided, this will be ``self.whereRegion``, if
259 that exists.
260 execute : `bool`, optional
261 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
262 determined prior to execution that the query would return no rows.
263 exact : `bool`, optional
264 If `True`, run the full query and perform post-query filtering if
265 needed, until at least one result row is found. If `False`, the
266 returned result does not account for post-query filtering, and
267 hence may be `True` even when all result rows would be filtered
268 out.
270 Returns
271 -------
272 any : `bool`
273 `True` if the query would (or might, depending on arguments) yield
274 result rows. `False` if it definitely would not.
275 """
276 if self._doomed_by:
277 return False
278 sql = self.sql
279 if sql is None:
280 return True
281 if exact and not execute:
282 raise TypeError("Cannot obtain exact results without executing the query.")
283 if exact and self.spatial:
284 for _ in self.rows(db, region=region):
285 return True
286 return False
287 elif execute:
288 return db.query(sql.limit(1)).one_or_none() is not None
289 else:
290 return True
292 def explain_no_results(
293 self,
294 db: Database,
295 *,
296 region: Optional[Region] = None,
297 followup: bool = True,
298 ) -> Iterator[str]:
299 """Return human-readable messages that may help explain why the query
300 yields no results.
302 Parameters
303 ----------
304 db : `Database`
305 Object managing the database connection.
306 region : `sphgeom.Region`, optional
307 A region that any result-row regions must overlap in order to be
308 yielded. If not provided, this will be ``self.whereRegion``, if
309 that exists.
310 followup : `bool`, optional
311 If `True` (default) perform inexpensive follow-up queries if no
312 diagnostics are available from query generation alone.
314 Returns
315 -------
316 messages : `Iterator` [ `str` ]
317 String messages that describe reasons the query might not yield any
318 results.
320 Notes
321 -----
322 Messages related to post-query filtering are only available if `rows`,
323 `any`, or `count` was already called with the same region (with
324 ``exact=True`` for the latter two).
325 """
326 from ._builder import QueryBuilder
328 if self._doomed_by:
329 yield from self._doomed_by
330 return
331 if self._filtered_by_where:
332 yield (
333 f"{self._filtered_by_where} result rows were filtered out because "
334 "one or more region did not overlap the WHERE-clause region."
335 )
336 if self._filtered_by_join:
337 yield (
338 f"{self._filtered_by_join} result rows were filtered out because "
339 "one or more regions did not overlap."
340 )
341 if (not followup) or self._filtered_by_join or self._filtered_by_where:
342 return
343 # Query didn't return results even before client-side filtering, and
344 # caller says we can do follow-up queries to determine why.
345 # Start by seeing if there are _any_ dimension records for each element
346 # involved.
347 for element in self.graph.elements:
348 summary = QuerySummary(element.graph)
349 builder = QueryBuilder(summary, self.managers)
350 followup_query = builder.finish()
351 if not followup_query.any(db, exact=False):
352 yield f"No dimension records for element '{element.name}' found."
353 yield from followup_query.explain_no_results(db, region=region, followup=False)
354 return
356 @abstractmethod
357 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
358 """Return the columns for the datasets returned by this query.
360 Returns
361 -------
362 columns : `DatasetQueryColumns` or `None`
363 Struct containing SQLAlchemy representations of the result columns
364 for a dataset.
366 Notes
367 -----
368 This method is intended primarily as a hook for subclasses to implement
369 and the ABC to call in order to provide higher-level functionality;
370 code that uses `Query` objects (but does not implement one) should
371 usually not have to call this method.
372 """
373 raise NotImplementedError()
375 @property
376 @abstractmethod
377 def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
378 """A SQLAlchemy object representing the full query
379 (`sqlalchemy.sql.FromClause` or `None`).
381 This is `None` in the special case where the query has no columns, and
382 only one logical row.
383 """
384 raise NotImplementedError()
386 def rows(
387 self, db: Database, *, region: Optional[Region] = None
388 ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
389 """Execute the query and yield result rows, applying `predicate`.
391 Parameters
392 ----------
393 db : `Database`
394 Object managing the database connection.
395 region : `sphgeom.Region`, optional
396 A region that any result-row regions must overlap in order to be
397 yielded. If not provided, this will be ``self.whereRegion``, if
398 that exists.
400 Yields
401 ------
402 row : `sqlalchemy.engine.RowProxy` or `None`
403 Result row from the query. `None` may yielded exactly once instead
404 of any real rows to indicate an empty query (see `EmptyQuery`).
405 """
406 if self._doomed_by:
407 return
408 whereRegion = region if region is not None else self.whereRegion
409 self._filtered_by_where = 0
410 self._filtered_by_join = 0
411 for row in db.query(self.sql):
412 rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial]
413 if whereRegion and any(r.isDisjointFrom(whereRegion) for r in rowRegions):
414 self._filtered_by_where += 1
415 continue
416 if not not any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
417 self._filtered_by_join += 1
418 continue
419 yield row
421 def extractDimensionsTuple(
422 self, row: Optional[sqlalchemy.engine.RowProxy], dimensions: Iterable[Dimension]
423 ) -> tuple:
424 """Extract a tuple of data ID values from a result row.
426 Parameters
427 ----------
428 row : `sqlalchemy.engine.RowProxy` or `None`
429 A result row from a SQLAlchemy SELECT query, or `None` to indicate
430 the row from an `EmptyQuery`.
431 dimensions : `Iterable` [ `Dimension` ]
432 The dimensions to include in the returned tuple, in order.
434 Returns
435 -------
436 values : `tuple`
437 A tuple of dimension primary key values.
438 """
439 if row is None:
440 assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
441 return ()
442 return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions)
444 def extractDataId(
445 self,
446 row: Optional[sqlalchemy.engine.RowProxy],
447 *,
448 graph: Optional[DimensionGraph] = None,
449 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
450 ) -> DataCoordinate:
451 """Extract a data ID from a result row.
453 Parameters
454 ----------
455 row : `sqlalchemy.engine.RowProxy` or `None`
456 A result row from a SQLAlchemy SELECT query, or `None` to indicate
457 the row from an `EmptyQuery`.
458 graph : `DimensionGraph`, optional
459 The dimensions the returned data ID should identify. If not
460 provided, this will be all dimensions in `QuerySummary.requested`.
461 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
462 Nested mapping containing records to attach to the returned
463 `DataCoordinate`, for which `~DataCoordinate.hasRecords` will
464 return `True`. If provided, outer keys must include all dimension
465 element names in ``graph``, and inner keys should be tuples of
466 dimension primary key values in the same order as
467 ``element.graph.required``. If not provided,
468 `DataCoordinate.hasRecords` will return `False` on the returned
469 object.
471 Returns
472 -------
473 dataId : `DataCoordinate`
474 A data ID that identifies all required and implied dimensions. If
475 ``records is not None``, this is have
476 `~DataCoordinate.hasRecords()` return `True`.
477 """
478 if graph is None:
479 graph = self.graph
480 if not graph:
481 return DataCoordinate.makeEmpty(self.graph.universe)
482 dataId = DataCoordinate.fromFullValues(
483 graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied))
484 )
485 if records is not None:
486 recordsForRow = {}
487 for element in graph.elements:
488 key = tuple(dataId.subset(element.graph).values())
489 recordsForRow[element.name] = records[element.name].get(key)
490 return dataId.expanded(recordsForRow)
491 else:
492 return dataId
494 def extractDatasetRef(
495 self,
496 row: sqlalchemy.engine.RowProxy,
497 dataId: Optional[DataCoordinate] = None,
498 records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
499 ) -> DatasetRef:
500 """Extract a `DatasetRef` from a result row.
502 Parameters
503 ----------
504 row : `sqlalchemy.engine.RowProxy`
505 A result row from a SQLAlchemy SELECT query.
506 dataId : `DataCoordinate`
507 Data ID to attach to the `DatasetRef`. A minimal (i.e. base class)
508 `DataCoordinate` is constructed from ``row`` if `None`.
509 records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
510 Records to use to return an `ExpandedDataCoordinate`. If provided,
511 outer keys must include all dimension element names in ``graph``,
512 and inner keys should be tuples of dimension primary key values
513 in the same order as ``element.graph.required``.
515 Returns
516 -------
517 ref : `DatasetRef`
518 Reference to the dataset; guaranteed to have `DatasetRef.id` not
519 `None`.
520 """
521 datasetColumns = self.getDatasetColumns()
522 assert datasetColumns is not None
523 if dataId is None:
524 dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records)
525 runRecord = self.managers.collections[row._mapping[datasetColumns.runKey]]
526 return DatasetRef(
527 datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name
528 )
530 def _makeSubsetQueryColumns(
531 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
532 ) -> Tuple[DimensionGraph, Optional[QueryColumns]]:
533 """Helper method for subclass implementations of `subset`.
535 Parameters
536 ----------
537 graph : `DimensionGraph`, optional
538 Dimensions to include in the new `Query` being constructed.
539 ``subset`` implementations should generally just forward their
540 own ``graph`` argument here.
541 datasets : `bool`, optional
542 Whether the new `Query` should include dataset results. Defaults
543 to `True`, but is ignored if ``self`` does not include dataset
544 results.
545 unique : `bool`, optional
546 Whether the new `Query` should guarantee unique results (this may
547 come with a performance penalty).
549 Returns
550 -------
551 graph : `DimensionGraph`
552 The dimensions of the new `Query`. This is exactly the same as
553 the argument of the same name, with ``self.graph`` used if that
554 argument is `None`.
555 columns : `QueryColumns` or `None`
556 A struct containing the SQLAlchemy column objects to use in the
557 new query, constructed by delegating to other (mostly abstract)
558 methods on ``self``. If `None`, `subset` may return ``self``.
559 """
560 if graph is None:
561 graph = self.graph
562 if (
563 graph == self.graph
564 and (self.getDatasetColumns() is None or datasets)
565 and (self.isUnique() or not unique)
566 ):
567 return graph, None
568 columns = QueryColumns()
569 for dimension in graph.dimensions:
570 col = self.getDimensionColumn(dimension.name)
571 columns.keys[dimension] = [col]
572 if not unique:
573 for element in self.spatial:
574 col = self.getRegionColumn(element.name)
575 columns.regions[element] = col
576 if datasets and self.getDatasetColumns() is not None:
577 columns.datasets = self.getDatasetColumns()
578 return graph, columns
580 @abstractmethod
581 def materialize(self, db: Database) -> ContextManager[Query]:
582 """Execute this query and insert its results into a temporary table.
584 Parameters
585 ----------
586 db : `Database`
587 Database engine to execute the query against.
589 Returns
590 -------
591 context : `typing.ContextManager` [ `MaterializedQuery` ]
592 A context manager that ensures the temporary table is created and
593 populated in ``__enter__`` (returning a `MaterializedQuery` object
594 backed by that table), and dropped in ``__exit__``. If ``self``
595 is already a `MaterializedQuery`, ``__enter__`` may just return
596 ``self`` and ``__exit__`` may do nothing (reflecting the fact that
597 an outer context manager should already take care of everything
598 else).
599 """
600 raise NotImplementedError()
602 @abstractmethod
603 def subset(
604 self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
605 ) -> Query:
606 """Return a new `Query` whose columns and/or rows are (mostly) subset
607 of this one's.
609 Parameters
610 ----------
611 graph : `DimensionGraph`, optional
612 Dimensions to include in the new `Query` being constructed.
613 If `None` (default), ``self.graph`` is used.
614 datasets : `bool`, optional
615 Whether the new `Query` should include dataset results. Defaults
616 to `True`, but is ignored if ``self`` does not include dataset
617 results.
618 unique : `bool`, optional
619 Whether the new `Query` should guarantee unique results (this may
620 come with a performance penalty).
622 Returns
623 -------
624 query : `Query`
625 A query object corresponding to the given inputs. May be ``self``
626 if no changes were requested.
628 Notes
629 -----
630 The way spatial overlaps are handled at present makes it impossible to
631 fully guarantee in general that the new query's rows are a subset of
632 this one's while also returning unique rows. That's because the
633 database is only capable of performing approximate, conservative
634 overlaps via the common skypix system; we defer actual region overlap
635 operations to per-result-row Python logic. But including the region
636 columns necessary to do that postprocessing in the query makes it
637 impossible to do a SELECT DISTINCT on the user-visible dimensions of
638 the query. For example, consider starting with a query with dimensions
639 (instrument, skymap, visit, tract). That involves a spatial join
640 between visit and tract, and we include the region columns from both
641 tables in the results in order to only actually yield result rows
642 (see `predicate` and `rows`) where the regions in those two columns
643 overlap. If the user then wants to subset to just (skymap, tract) with
644 unique results, we have two unpalatable options:
646 - we can do a SELECT DISTINCT with just the skymap and tract columns
647 in the SELECT clause, dropping all detailed overlap information and
648 including some tracts that did not actually overlap any of the
649 visits in the original query (but were regarded as _possibly_
650 overlapping via the coarser, common-skypix relationships);
652 - we can include the tract and visit region columns in the query, and
653 continue to filter out the non-overlapping pairs, but completely
654 disregard the user's request for unique tracts.
656 This interface specifies that implementations must do the former, as
657 that's what makes things efficient in our most important use case
658 (``QuantumGraph`` generation in ``pipe_base``). We may be able to
659 improve this situation in the future by putting exact overlap
660 information in the database, either by using built-in (but
661 engine-specific) spatial database functionality or (more likely)
662 switching to a scheme in which pairwise dimension spatial relationships
663 are explicitly precomputed (for e.g. combinations of instruments and
664 skymaps).
665 """
666 raise NotImplementedError()
668 @abstractmethod
669 def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
670 """Return a `QueryBuilder` that can be used to construct a new `Query`
671 that is joined to (and hence constrained by) this one.
673 Parameters
674 ----------
675 summary : `QuerySummary`, optional
676 A `QuerySummary` instance that specifies the dimensions and any
677 additional constraints to include in the new query being
678 constructed, or `None` to use the dimensions of ``self`` with no
679 additional constraints.
680 """
681 raise NotImplementedError()
683 graph: DimensionGraph
684 """The dimensions identified by this query and included in any data IDs
685 created from its result rows (`DimensionGraph`).
686 """
688 whereRegion: Optional[Region]
689 """A spatial region that all regions in all rows returned by this query
690 must overlap (`lsst.sphgeom.Region` or `None`).
691 """
693 managers: RegistryManagers
694 """A struct containing `Registry` helper object (`RegistryManagers`).
695 """
@enum.unique
class DirectQueryUniqueness(enum.Enum):
    """Enumeration of how (and whether) a direct query's result rows are
    unique.
    """

    NOT_UNIQUE = enum.auto()
    """No uniqueness is expected of the result rows."""

    NATURALLY_UNIQUE = enum.auto()
    """Result rows are unique by construction of the query; neither SELECT
    DISTINCT nor GROUP BY is required.
    """

    NEEDS_DISTINCT = enum.auto()
    """Result rows are expected to be unique, and SELECT DISTINCT (or an
    equivalent GROUP BY clause) must be used to make them so.
    """
718class DirectQuery(Query):
719 """A `Query` implementation that represents a direct SELECT query that
720 usually joins many tables.
722 `DirectQuery` objects should generally only be constructed by
723 `QueryBuilder` or the methods of other `Query` objects.
725 Parameters
726 ----------
727 simpleQuery : `SimpleQuery`
728 Struct representing the actual SELECT, FROM, and WHERE clauses.
729 columns : `QueryColumns`
730 Columns that are referenced in the query in any clause.
731 uniqueness : `DirectQueryUniqueness`
732 Enum value indicating whether the query should yield unique result
733 rows, and if so whether that needs to be explicitly requested of the
734 database.
735 graph : `DimensionGraph`
736 Object describing the dimensions included in the query.
737 whereRegion : `lsst.sphgeom.Region`, optional
738 Region that all region columns in all returned rows must overlap.
739 managers : `RegistryManagers`
740 Struct containing the `Registry` manager helper objects, to be
741 forwarded to the `Query` constructor.
742 doomed_by : `Iterable` [ `str` ], optional
743 A list of messages (appropriate for e.g. logging or exceptions) that
744 explain why the query is known to return no results even before it is
745 executed. Queries with a non-empty list will never be executed.
746 """
748 def __init__(
749 self,
750 *,
751 simpleQuery: SimpleQuery,
752 columns: QueryColumns,
753 uniqueness: DirectQueryUniqueness,
754 graph: DimensionGraph,
755 whereRegion: Optional[Region],
756 managers: RegistryManagers,
757 order_by_columns: Iterable[OrderByColumn] = (),
758 limit: Optional[Tuple[int, Optional[int]]] = None,
759 doomed_by: Iterable[str] = (),
760 ):
761 super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
762 assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
763 assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
764 self._simpleQuery = simpleQuery
765 self._columns = columns
766 self._uniqueness = uniqueness
767 self._order_by_columns = order_by_columns
768 self._limit = limit
769 self._datasetQueryColumns: Optional[DatasetQueryColumns] = None
770 self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
771 self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
773 def isUnique(self) -> bool:
774 # Docstring inherited from Query.
775 return self._uniqueness is not DirectQueryUniqueness.NOT_UNIQUE
777 def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
778 # Docstring inherited from Query.
779 column = self._dimensionColumns.get(name)
780 if column is None:
781 column = self._columns.getKeyColumn(name).label(name)
782 self._dimensionColumns[name] = column
783 return column
785 @property
786 def spatial(self) -> Iterator[DimensionElement]:
787 # Docstring inherited from Query.
788 return iter(self._columns.regions)
790 def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
791 # Docstring inherited from Query.
792 column = self._regionColumns.get(name)
793 if column is None:
794 column = self._columns.regions[name].column.label(f"{name}_region")
795 self._regionColumns[name] = column
796 return column
798 def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
799 # Docstring inherited from Query.
800 if self._datasetQueryColumns is None:
801 base = self._columns.datasets
802 if base is None:
803 return None
804 ingestDate = base.ingestDate
805 if ingestDate is not None:
806 ingestDate = ingestDate.label("ingest_date")
807 self._datasetQueryColumns = DatasetQueryColumns(
808 datasetType=base.datasetType,
809 id=base.id.label("dataset_id"),
810 runKey=base.runKey.label(self.managers.collections.getRunForeignKeyName()),
811 ingestDate=ingestDate,
812 )
813 return self._datasetQueryColumns
815 @property
816 def sql(self) -> sqlalchemy.sql.FromClause:
817 # Docstring inherited from Query.
818 simpleQuery = self._simpleQuery.copy()
819 for dimension in self.graph:
820 simpleQuery.columns.append(self.getDimensionColumn(dimension.name))
821 for element in self.spatial:
822 simpleQuery.columns.append(self.getRegionColumn(element.name))
823 datasetColumns = self.getDatasetColumns()
824 if datasetColumns is not None:
825 simpleQuery.columns.extend(datasetColumns)
827 assert not simpleQuery.order_by, "Input query cannot have ORDER BY"
828 if self._order_by_columns:
829 # add ORDER BY column
830 order_by_columns = [column.column_order for column in self._order_by_columns]
831 order_by_column = sqlalchemy.func.row_number().over(order_by=order_by_columns).label("_orderby")
832 simpleQuery.columns.append(order_by_column)
833 simpleQuery.order_by = [order_by_column]
835 assert simpleQuery.limit is None, "Input query cannot have LIMIT"
836 simpleQuery.limit = self._limit
838 sql = simpleQuery.combine()
840 if self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT:
841 return sql.distinct()
842 else:
843 return sql
def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
    """Build the specification of a table able to store this query's rows.

    This is a helper for subclass implementations of `materialize`.

    Parameters
    ----------
    constraints : `bool`, optional
        Whether logical foreign keys should be declared as actual foreign
        key constraints (`False` is the default).  Some database engines
        do not permit temporary tables to reference normal tables, so
        leave this `False` when the spec is for a temporary table unless
        the engine is known to support such references.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for a table that could hold this query's result rows.
    """
    rowsAreUnique = self.isUnique()
    tableSpec = ddl.TableSpec(fields=())

    # Key column(s) for every dimension in the query; they are flagged as
    # primary-key columns only when the rows are known to be unique.
    for dim in self.graph:
        addDimensionForeignKey(tableSpec, dim, primaryKey=rowsAreUnique, constraint=constraints)

    # Nullable region field(s) for each spatial element, named
    # "<element>_region".
    for spatialElement in self.spatial:
        regionFields = SpatialRegionDatabaseRepresentation.makeFieldSpecs(
            nullable=True,
            name=f"{spatialElement.name}_region",
        )
        tableSpec.fields.update(regionFields)

    # Dataset and run key columns are only added when the query actually
    # has dataset columns.
    if self.getDatasetColumns() is not None:
        self.managers.datasets.addDatasetForeignKey(
            tableSpec, primaryKey=rowsAreUnique, constraint=constraints
        )
        self.managers.collections.addRunForeignKey(tableSpec, nullable=False, constraint=constraints)

    # Need a column for ORDER BY if ordering is requested.
    if self._order_by_columns:
        orderField = ddl.FieldSpec(
            name="_orderby",
            dtype=sqlalchemy.BigInteger,
            nullable=False,
            doc="Column to use with ORDER BY",
        )
        tableSpec.fields.add(orderField)

    return tableSpec
@contextmanager
def materialize(self, db: Database) -> Iterator[Query]:
    # Docstring inherited from Query.
    spec = self._makeTableSpec()
    with db.session() as session:
        table = session.makeTemporaryTable(spec)
        # Everything after the table exists is guarded by try/finally so
        # the temporary table is dropped even when the INSERT fails or the
        # caller's ``with`` body raises (the exception is re-raised into
        # this generator at the ``yield``).
        try:
            # A query already known to return no rows is never executed;
            # its temporary table is simply left empty.
            if not self._doomed_by:
                db.insert(table, select=self.sql, names=spec.fields.names)
            yield MaterializedQuery(
                table=table,
                spatial=self.spatial,
                datasetType=self.datasetType,
                isUnique=self.isUnique(),
                graph=self.graph,
                whereRegion=self.whereRegion,
                managers=self.managers,
                doomed_by=self._doomed_by,
            )
        finally:
            session.dropTemporaryTable(table)
def subset(
    self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
) -> Query:
    # Docstring inherited from Query.
    graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
    if columns is None:
        # NOTE(review): `None` from the helper presumably means the
        # requested subset is identical to this query - confirm against
        # `_makeSubsetQueryColumns`.
        return self
    if columns.isEmpty():
        # No columns remain.  Forward the "doomed" diagnostics so that a
        # query already known to return nothing does not turn into an
        # `EmptyQuery` that reports a row, and so the messages stay
        # available, as they do for the `DirectQuery` returned below.
        return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
    return DirectQuery(
        # Work on a copy so the new query never mutates this query's
        # underlying `SimpleQuery`.
        simpleQuery=self._simpleQuery.copy(),
        columns=columns,
        uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
        graph=graph,
        whereRegion=self.whereRegion if not unique else None,
        managers=self.managers,
        doomed_by=self._doomed_by,
    )
def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
    # Docstring inherited from Query.
    from ._builder import QueryBuilder

    # Fall back to a summary covering exactly this query's dimensions and
    # spatial constraint when the caller does not supply one.
    effectiveSummary = (
        summary if summary is not None else QuerySummary(self.graph, whereRegion=self.whereRegion)
    )

    # Only dimensions already present in this query may be requested.
    requestedIsCovered = effectiveSummary.requested.issubset(self.graph)
    if not requestedIsCovered:
        raise NotImplementedError(
            f"Query.makeBuilder does not yet support augmenting dimensions "
            f"({effectiveSummary.requested.dimensions}) beyond those originally included in the query "
            f"({self.graph.dimensions})."
        )

    # Seed the new builder with this query's SELECT, joined in as an
    # aliased subquery.
    subquery = self.sql.alias()
    result = QueryBuilder(effectiveSummary, managers=self.managers, doomed_by=self._doomed_by)
    result.joinTable(subquery, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
    return result
class MaterializedQuery(Query):
    """A `Query` implementation that represents query results saved in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType` or `None`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid (which
        may reject some rows that are in the table).
    managers : `RegistryManagers`
        A struct containing `Registry` manager helper objects, forwarded to
        the `Query` constructor.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        *,
        table: sqlalchemy.schema.Table,
        spatial: Iterable[DimensionElement],
        datasetType: Optional[DatasetType],
        isUnique: bool,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
        self._table = table
        # Snapshot into a tuple so the `spatial` property can hand out a
        # fresh iterator on every access, even if a one-shot iterator was
        # passed in.
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is None:
            return None
        return DatasetQueryColumns(
            datasetType=self._datasetType,
            id=self._table.columns["dataset_id"],
            runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()],
            # No ingest-date column is looked up in the temporary table.
            ingestDate=None,
        )

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        select = self._table.select()
        # When the table carries the "_orderby" column, sort on it.
        if "_orderby" in self._table.columns:
            select = select.order_by(self._table.columns["_orderby"])
        return select

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already backed by a table; nothing to create or drop here.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            # NOTE(review): `None` from the helper presumably means the
            # requested subset is identical to this query - confirm
            # against `_makeSubsetQueryColumns`.
            return self
        if columns.isEmpty():
            # No columns remain.  Forward the "doomed" diagnostics so that
            # a query already known to return nothing does not turn into
            # an `EmptyQuery` that reports a row, and so the messages stay
            # available, as they do for the `DirectQuery` returned below.
            return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
        # Select the subset columns directly from the temporary table.
        simpleQuery = SimpleQuery()
        simpleQuery.join(self._table)
        return DirectQuery(
            simpleQuery=simpleQuery,
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            whereRegion=self.whereRegion if not unique else None,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        # Only dimensions already present in this query may be requested.
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # The temporary table itself is joined in; no subquery is needed.
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder
class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the query
    would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the null set is extracted.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        universe: DimensionUniverse,
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        # The query's graph is always the universe's empty dimension set,
        # and there is never a spatial constraint.
        super().__init__(graph=universe.empty, whereRegion=None, managers=managers, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No dimension {name} in query (no dimensions at all, actually).")

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(())

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No region for {name} in query (no regions at all, actually).")

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(
        self, db: Database, *, region: Optional[Region] = None
    ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # A doomed query produces nothing; otherwise exactly one `None`
        # placeholder row is produced, without touching the database.
        if self._doomed_by:
            return
        yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        # Any explicitly requested graph must fit inside this query's
        # (empty) graph; the subset is then this same query.
        if graph is not None:
            assert graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        effectiveSummary = QuerySummary(self.graph) if summary is None else summary
        # Only dimensions already present in this query may be requested.
        if not effectiveSummary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({effectiveSummary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        # There is no table or subquery to join in; only the summary and
        # the doomed-by messages carry over.
        return QueryBuilder(effectiveSummary, managers=self.managers, doomed_by=self._doomed_by)