Coverage for python/lsst/daf/butler/registry/queries/_query.py: 24%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("Query",)
25import dataclasses
26import enum
27import itertools
28from abc import ABC, abstractmethod
29from contextlib import contextmanager
30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple
32import sqlalchemy
33from lsst.sphgeom import Region
35from ...core import (
36 DataCoordinate,
37 DatasetRef,
38 DatasetType,
39 Dimension,
40 DimensionElement,
41 DimensionGraph,
42 DimensionRecord,
43 DimensionUniverse,
44 SimpleQuery,
45 SpatialRegionDatabaseRepresentation,
46 addDimensionForeignKey,
47 ddl,
48)
49from ..interfaces import Database
50from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers
52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true
53 from ._builder import QueryBuilder
@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """Description of one column that participates in an ORDER BY clause."""

    column: sqlalchemy.sql.ColumnElement
    """SQLAlchemy column expression to sort on
    (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """`True` to sort ascending, `False` to sort descending (`bool`)."""

    add_to_select: bool
    """`True` if the column is a non-key column that has to be added to the
    SELECT columns explicitly (`bool`)."""

    field_spec: Optional[ddl.FieldSpec]
    """Field specification for this column in a materialized table
    (`ddl.FieldSpec` or `None`)."""

    dimension: Optional[Dimension]
    """The dimension this column corresponds to, or `None` if it does not
    correspond to one (`Dimension` or `None`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """Column expression with its sort direction applied, ready to be
        passed to ORDER BY (`sqlalchemy.sql.ColumnElement`).
        """
        if self.ordering:
            return self.column.asc()
        return self.column.desc()

    def materialized(self, table: sqlalchemy.schema.Table) -> OrderByColumn:
        """Build the equivalent ordering column for a materialized table.

        Parameters
        ----------
        table : `sqlalchemy.schema.Table`
            Materialized table; it must already contain every column of the
            SELECT clause.

        Returns
        -------
        column : `OrderByColumn`
            Column definition to use with ORDER BY against ``table``.
        """
        # Dimension columns are looked up by dimension name; anything else is
        # looked up by the name of the original column expression.
        if self.dimension:
            columnName = self.dimension.name
        else:
            columnName = self.column.name
        tableColumn = table.columns[columnName]
        # The table already holds the column, so nothing extra has to be
        # selected and no field spec is needed.
        return OrderByColumn(
            column=tableColumn,
            ordering=self.ordering,
            add_to_select=False,
            field_spec=None,
            dimension=self.dimension,
        )
class Query(ABC):
    """An abstract base class for queries that return some combination of
    `DatasetRef` and `DataCoordinate` objects.

    Parameters
    ----------
    graph : `DimensionGraph`
        Object describing the dimensions included in the query.
    whereRegion : `lsst.sphgeom.Region`, optional
        Region that all region columns in all returned rows must overlap.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.

    Notes
    -----
    The `Query` hierarchy abstracts over the database/SQL representation of a
    particular set of data IDs or datasets.  It is expected to be used as a
    backend for other objects that provide more natural interfaces for one or
    both of these, not as part of a public interface to query results.
    """

    def __init__(
        self,
        *,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        self.graph = graph
        self.whereRegion = whereRegion
        self.managers = managers
        # Snapshot the messages so a one-shot iterable can be reused.
        self._doomed_by = tuple(doomed_by)
        # Number of rows dropped by the client-side spatial filtering done in
        # `rows`; `None` until `rows` has started executing at least once.
        self._filtered_by_join: Optional[int] = None
        self._filtered_by_where: Optional[int] = None

    @abstractmethod
    def isUnique(self) -> bool:
        """Return `True` if this query's rows are guaranteed to be unique, and
        `False` otherwise.

        If this query has dataset results (`datasetType` is not `None`),
        uniqueness applies to the `DatasetRef` instances returned by
        `extractDatasetRef` from the result of `rows`.  If it does not have
        dataset results, uniqueness applies to the `DataCoordinate` instances
        returned by `extractDataId`.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return the query column that contains the primary key value for
        the dimension with the given name.

        Parameters
        ----------
        name : `str`
            Name of the dimension.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`.
            SQLAlchemy object representing a column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def spatial(self) -> Iterator[DimensionElement]:
        """An iterator over the dimension element columns used in post-query
        filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]).

        Notes
        -----
        This property is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to access this property.
        """
        raise NotImplementedError()

    @abstractmethod
    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return a region column for one of the dimension elements iterated
        over by `spatial`.

        Parameters
        ----------
        name : `str`
            Name of the element.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy representing a result column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    def datasetType(self) -> Optional[DatasetType]:
        """The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results (`DatasetType` or `None`).
        """
        cols = self.getDatasetColumns()
        if cols is None:
            return None
        return cols.datasetType

    def count(self, db: Database, *, region: Optional[Region] = None, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        if self._doomed_by:
            return 0
        sql = self.sql
        if sql is None:
            # No columns, exactly one logical row (see `sql`).
            return 1
        # NOTE(review): ``spatial`` is annotated as an iterator; a genuine
        # iterator object is always truthy, in which case this branch is taken
        # whenever ``exact`` is `True` and the SQL COUNT shortcut below only
        # runs for ``exact=False``.  Confirm that is intended before changing.
        if exact and self.spatial:
            return sum(1 for _ in self.rows(db, region=region))
        return db.query(sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)).scalar()

    def any(
        self,
        db: Database,
        *,
        region: Optional[Region] = None,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.

        Raises
        ------
        TypeError
            Raised if ``exact=True`` is combined with ``execute=False`` for a
            query that is not doomed and has SQL to run.
        """
        if self._doomed_by:
            return False
        sql = self.sql
        if sql is None:
            return True
        if exact and not execute:
            raise TypeError("Cannot obtain exact results without executing the query.")
        # NOTE(review): same iterator-truthiness caveat as in `count`.
        if exact and self.spatial:
            # Stop at the first row that survives post-query filtering.
            for _ in self.rows(db, region=region):
                return True
            return False
        elif execute:
            return db.query(sql.limit(1)).one_or_none() is not None
        else:
            return True

    def explain_no_results(
        self,
        db: Database,
        *,
        region: Optional[Region] = None,
        followup: bool = True,
    ) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.
        followup : `bool`, optional
            If `True` (default) perform inexpensive follow-up queries if no
            diagnostics are available from query generation alone.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if `rows`,
        `any`, or `count` was already called with the same region (with
        ``exact=True`` for the latter two).
        """
        if self._doomed_by:
            yield from self._doomed_by
            return
        if self._filtered_by_where:
            yield (
                f"{self._filtered_by_where} result rows were filtered out because "
                "one or more region did not overlap the WHERE-clause region."
            )
        if self._filtered_by_join:
            yield (
                f"{self._filtered_by_join} result rows were filtered out because "
                "one or more regions did not overlap."
            )
        if (not followup) or self._filtered_by_join or self._filtered_by_where:
            return
        # Query didn't return results even before client-side filtering, and
        # caller says we can do follow-up queries to determine why.
        # Start by seeing if there are _any_ dimension records for each element
        # involved.
        # Imported here, right before its only use, rather than at module
        # scope (the module only imports it under TYPE_CHECKING).
        from ._builder import QueryBuilder

        for element in self.graph.elements:
            summary = QuerySummary(element.graph)
            builder = QueryBuilder(summary, self.managers)
            followup_query = builder.finish()
            if not followup_query.any(db, exact=False):
                yield f"No dimension records for element '{element.name}' found."
                yield from followup_query.explain_no_results(db, region=region, followup=False)
                return

    @abstractmethod
    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        """Return the columns for the datasets returned by this query.

        Returns
        -------
        columns : `DatasetQueryColumns` or `None`
            Struct containing SQLAlchemy representations of the result columns
            for a dataset.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        """A SQLAlchemy object representing the full query
        (`sqlalchemy.sql.FromClause` or `None`).

        This is `None` in the special case where the query has no columns, and
        only one logical row.
        """
        raise NotImplementedError()

    def rows(
        self, db: Database, *, region: Optional[Region] = None
    ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        """Execute the query and yield result rows, dropping rows whose
        regions fail the spatial overlap checks.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        region : `sphgeom.Region`, optional
            A region that any result-row regions must overlap in order to be
            yielded.  If not provided, this will be ``self.whereRegion``, if
            that exists.

        Yields
        ------
        row : `sqlalchemy.engine.RowProxy` or `None`
            Result row from the query.  `None` may yielded exactly once instead
            of any real rows to indicate an empty query (see `EmptyQuery`).

        Notes
        -----
        Each execution resets and then updates the counters reported by
        `explain_no_results`.  Nothing is executed if the query is doomed.
        """
        if self._doomed_by:
            return
        whereRegion = region if region is not None else self.whereRegion
        self._filtered_by_where = 0
        self._filtered_by_join = 0
        for row in db.query(self.sql):
            rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial]
            # Drop the row if any of its regions misses the WHERE region.
            if whereRegion is not None and any(r.isDisjointFrom(whereRegion) for r in rowRegions):
                self._filtered_by_where += 1
                continue
            # Drop the row if any pair of its own regions fails to overlap.
            if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
                self._filtered_by_join += 1
                continue
            yield row

    def extractDimensionsTuple(
        self, row: Optional[sqlalchemy.engine.RowProxy], dimensions: Iterable[Dimension]
    ) -> tuple:
        """Extract a tuple of data ID values from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        dimensions : `Iterable` [ `Dimension` ]
            The dimensions to include in the returned tuple, in order.

        Returns
        -------
        values : `tuple`
            A tuple of dimension primary key values.
        """
        if row is None:
            assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
            return ()
        return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions)

    def extractDataId(
        self,
        row: Optional[sqlalchemy.engine.RowProxy],
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinate:
        """Extract a data ID from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        graph : `DimensionGraph`, optional
            The dimensions the returned data ID should identify.  If not
            provided, ``self.graph`` is used.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Nested mapping containing records to attach to the returned
            `DataCoordinate`, for which `~DataCoordinate.hasRecords` will
            return `True`.  If provided, outer keys must include all dimension
            element names in ``graph``, and inner keys should be tuples of
            dimension primary key values in the same order as
            ``element.graph.required``.  If not provided,
            `DataCoordinate.hasRecords` will return `False` on the returned
            object.

        Returns
        -------
        dataId : `DataCoordinate`
            A data ID that identifies all required and implied dimensions.  If
            ``records is not None``, this will have
            `~DataCoordinate.hasRecords()` return `True`.
        """
        if graph is None:
            graph = self.graph
        if not graph:
            # No dimensions to identify: return the empty data ID.
            return DataCoordinate.makeEmpty(self.graph.universe)
        dataId = DataCoordinate.fromFullValues(
            graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied))
        )
        if records is not None:
            recordsForRow = {}
            for element in graph.elements:
                key = tuple(dataId.subset(element.graph).values())
                # A missing record is stored as `None` rather than raising.
                recordsForRow[element.name] = records[element.name].get(key)
            return dataId.expanded(recordsForRow)
        else:
            return dataId

    def extractDatasetRef(
        self,
        row: sqlalchemy.engine.RowProxy,
        dataId: Optional[DataCoordinate] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DatasetRef:
        """Extract a `DatasetRef` from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy`
            A result row from a SQLAlchemy SELECT query.
        dataId : `DataCoordinate`
            Data ID to attach to the `DatasetRef`.  A minimal (i.e. base class)
            `DataCoordinate` is constructed from ``row`` if `None`.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Records to use to return an `ExpandedDataCoordinate`.  If provided,
            outer keys must include all dimension element names in ``graph``,
            and inner keys should be tuples of dimension primary key values
            in the same order as ``element.graph.required``.

        Returns
        -------
        ref : `DatasetRef`
            Reference to the dataset; guaranteed to have `DatasetRef.id` not
            `None`.
        """
        datasetColumns = self.getDatasetColumns()
        assert datasetColumns is not None
        if dataId is None:
            dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records)
        # The row holds the run's collection key; look up the record to get
        # the run name.
        runRecord = self.managers.collections[row._mapping[datasetColumns.runKey]]
        return DatasetRef(
            datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name
        )

    def _makeSubsetQueryColumns(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Tuple[DimensionGraph, Optional[QueryColumns]]:
        """Helper method for subclass implementations of `subset`.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            ``subset`` implementations should generally just forward their
            own ``graph`` argument here.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        graph : `DimensionGraph`
            The dimensions of the new `Query`.  This is exactly the same as
            the argument of the same name, with ``self.graph`` used if that
            argument is `None`.
        columns : `QueryColumns` or `None`
            A struct containing the SQLAlchemy column objects to use in the
            new query, constructed by delegating to other (mostly abstract)
            methods on ``self``.  If `None`, `subset` may return ``self``.
        """
        if graph is None:
            graph = self.graph
        # Nothing would change: same dimensions, dataset results kept (or
        # absent), and uniqueness already satisfied.
        if (
            graph == self.graph
            and (self.getDatasetColumns() is None or datasets)
            and (self.isUnique() or not unique)
        ):
            return graph, None
        columns = QueryColumns()
        for dimension in graph.dimensions:
            col = self.getDimensionColumn(dimension.name)
            columns.keys[dimension] = [col]
        # Region columns are left out when unique rows are requested.
        if not unique:
            for element in self.spatial:
                col = self.getRegionColumn(element.name)
                columns.regions[element] = col
        if datasets:
            datasetColumns = self.getDatasetColumns()
            if datasetColumns is not None:
                columns.datasets = datasetColumns
        return graph, columns

    @abstractmethod
    def materialize(self, db: Database) -> ContextManager[Query]:
        """Execute this query and insert its results into a temporary table.

        Parameters
        ----------
        db : `Database`
            Database engine to execute the query against.

        Returns
        -------
        context : `typing.ContextManager` [ `MaterializedQuery` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a `MaterializedQuery` object
            backed by that table), and dropped in ``__exit__``.  If ``self``
            is already a `MaterializedQuery`, ``__enter__`` may just return
            ``self`` and ``__exit__`` may do nothing (reflecting the fact that
            an outer context manager should already take care of everything
            else).
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        """Return a new `Query` whose columns and/or rows are (mostly) subset
        of this one's.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            If `None` (default), ``self.graph`` is used.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        query : `Query`
            A query object corresponding to the given inputs.  May be ``self``
            if no changes were requested.

        Notes
        -----
        The way spatial overlaps are handled at present makes it impossible to
        fully guarantee in general that the new query's rows are a subset of
        this one's while also returning unique rows.  That's because the
        database is only capable of performing approximate, conservative
        overlaps via the common skypix system; we defer actual region overlap
        operations to per-result-row Python logic.  But including the region
        columns necessary to do that postprocessing in the query makes it
        impossible to do a SELECT DISTINCT on the user-visible dimensions of
        the query.  For example, consider starting with a query with dimensions
        (instrument, skymap, visit, tract).  That involves a spatial join
        between visit and tract, and we include the region columns from both
        tables in the results in order to only actually yield result rows
        (see `rows`) where the regions in those two columns
        overlap.  If the user then wants to subset to just (skymap, tract) with
        unique results, we have two unpalatable options:

        - we can do a SELECT DISTINCT with just the skymap and tract columns
          in the SELECT clause, dropping all detailed overlap information and
          including some tracts that did not actually overlap any of the
          visits in the original query (but were regarded as _possibly_
          overlapping via the coarser, common-skypix relationships);

        - we can include the tract and visit region columns in the query, and
          continue to filter out the non-overlapping pairs, but completely
          disregard the user's request for unique tracts.

        This interface specifies that implementations must do the former, as
        that's what makes things efficient in our most important use case
        (``QuantumGraph`` generation in ``pipe_base``).  We may be able to
        improve this situation in the future by putting exact overlap
        information in the database, either by using built-in (but
        engine-specific) spatial database functionality or (more likely)
        switching to a scheme in which pairwise dimension spatial relationships
        are explicitly precomputed (for e.g. combinations of instruments and
        skymaps).
        """
        raise NotImplementedError()

    @abstractmethod
    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        """Return a `QueryBuilder` that can be used to construct a new `Query`
        that is joined to (and hence constrained by) this one.

        Parameters
        ----------
        summary : `QuerySummary`, optional
            A `QuerySummary` instance that specifies the dimensions and any
            additional constraints to include in the new query being
            constructed, or `None` to use the dimensions of ``self`` with no
            additional constraints.

        Returns
        -------
        builder : `QueryBuilder`
            Builder for the new, joined query.
        """
        raise NotImplementedError()

    graph: DimensionGraph
    """The dimensions identified by this query and included in any data IDs
    created from its result rows (`DimensionGraph`).
    """

    whereRegion: Optional[Region]
    """A spatial region that all regions in all rows returned by this query
    must overlap (`lsst.sphgeom.Region` or `None`).
    """

    managers: RegistryManagers
    """A struct containing `Registry` helper object (`RegistryManagers`).
    """
class DirectQueryUniqueness(enum.Enum):
    """Enumeration of the uniqueness guarantees a direct query can have for
    its result rows.
    """

    NOT_UNIQUE = enum.auto()
    """No uniqueness is expected: result rows may repeat."""

    NATURALLY_UNIQUE = enum.auto()
    """Result rows are unique by construction of the query, with no need for
    SELECT DISTINCT or a GROUP BY clause.
    """

    NEEDS_DISTINCT = enum.auto()
    """Result rows are expected to be unique, and the query has to use SELECT
    DISTINCT (or an equivalent GROUP BY clause) to make them so.
    """
751class DirectQuery(Query):
752 """A `Query` implementation that represents a direct SELECT query that
753 usually joins many tables.
755 `DirectQuery` objects should generally only be constructed by
756 `QueryBuilder` or the methods of other `Query` objects.
758 Parameters
759 ----------
760 simpleQuery : `SimpleQuery`
761 Struct representing the actual SELECT, FROM, and WHERE clauses.
762 columns : `QueryColumns`
763 Columns that are referenced in the query in any clause.
764 uniqueness : `DirectQueryUniqueness`
765 Enum value indicating whether the query should yield unique result
766 rows, and if so whether that needs to be explicitly requested of the
767 database.
768 graph : `DimensionGraph`
769 Object describing the dimensions included in the query.
770 whereRegion : `lsst.sphgeom.Region`, optional
771 Region that all region columns in all returned rows must overlap.
772 managers : `RegistryManagers`
773 Struct containing the `Registry` manager helper objects, to be
774 forwarded to the `Query` constructor.
775 doomed_by : `Iterable` [ `str` ], optional
776 A list of messages (appropriate for e.g. logging or exceptions) that
777 explain why the query is known to return no results even before it is
778 executed. Queries with a non-empty list will never be executed.
779 """
def __init__(
    self,
    *,
    simpleQuery: SimpleQuery,
    columns: QueryColumns,
    uniqueness: DirectQueryUniqueness,
    graph: DimensionGraph,
    whereRegion: Optional[Region],
    managers: RegistryManagers,
    order_by_columns: Iterable[OrderByColumn] = (),
    limit: Optional[Tuple[int, Optional[int]]] = None,
    doomed_by: Iterable[str] = (),
):
    # Parameters not covered by the class docstring:
    #
    # order_by_columns : `Iterable` [ `OrderByColumn` ], optional
    #     Columns to sort the results by, in order.
    # limit : `tuple` [ `int`, `int` or `None` ], optional
    #     ``(limit, offset)`` pair; the first item is applied as LIMIT and
    #     the second, if not `None`, as OFFSET when the SQL is generated.
    super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
    assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
    assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
    self._simpleQuery = simpleQuery
    self._columns = columns
    self._uniqueness = uniqueness
    # Snapshot into a tuple: the ordering columns are truth-tested and
    # iterated more than once (when building the SQL, the table spec and the
    # materialized query), which a one-shot iterator would not survive.
    self._order_by_columns: Tuple[OrderByColumn, ...] = tuple(order_by_columns)
    self._limit = limit
    # Lazily filled caches of labeled columns, keyed by dimension/element
    # name (see getDatasetColumns, getDimensionColumn and getRegionColumn).
    self._datasetQueryColumns: Optional[DatasetQueryColumns] = None
    self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
    self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
def isUnique(self) -> bool:
    # Docstring inherited from Query.
    # Both NATURALLY_UNIQUE and NEEDS_DISTINCT count as unique.
    if self._uniqueness is DirectQueryUniqueness.NOT_UNIQUE:
        return False
    return True
def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cached = self._dimensionColumns.get(name)
    if cached is not None:
        return cached
    # First request for this dimension: label the key column with the
    # dimension name and remember it so later calls return the same object.
    labeled = self._columns.getKeyColumn(name).label(name)
    self._dimensionColumns[name] = labeled
    return labeled
@property
def spatial(self) -> Iterator[DimensionElement]:
    # Docstring inherited from Query.
    # A fresh iterator over the keys of the region-column mapping is handed
    # out on every access.
    regionColumns = self._columns.regions
    return iter(regionColumns)
def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
    # Docstring inherited from Query.
    cached = self._regionColumns.get(name)
    if cached is not None:
        return cached
    # First request for this element: label its region column as
    # "<name>_region" and remember it so later calls return the same object.
    labeled = self._columns.regions[name].column.label(f"{name}_region")
    self._regionColumns[name] = labeled
    return labeled
def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
    # Docstring inherited from Query.
    if self._datasetQueryColumns is not None:
        # Labeled columns were already built by an earlier call.
        return self._datasetQueryColumns
    rawColumns = self._columns.datasets
    if rawColumns is None:
        # This query has no dataset results.
        return None
    # The ingest date column is optional; label it only when present.
    labeledIngestDate = rawColumns.ingestDate
    if labeledIngestDate is not None:
        labeledIngestDate = labeledIngestDate.label("ingest_date")
    self._datasetQueryColumns = DatasetQueryColumns(
        datasetType=rawColumns.datasetType,
        id=rawColumns.id.label("dataset_id"),
        runKey=rawColumns.runKey.label(self.managers.collections.getRunForeignKeyName()),
        ingestDate=labeledIngestDate,
    )
    return self._datasetQueryColumns
@property
def sql(self) -> sqlalchemy.sql.FromClause:
    # Docstring inherited from Query.
    # Work on a copy so the stored SimpleQuery never gains SELECT columns.
    query = self._simpleQuery.copy()
    selected = query.columns
    for dimension in self.graph:
        selected.append(self.getDimensionColumn(dimension.name))
    for element in self.spatial:
        selected.append(self.getRegionColumn(element.name))
    datasetColumns = self.getDatasetColumns()
    if datasetColumns is not None:
        selected.extend(datasetColumns)

    ordering = self._order_by_columns
    if ordering:
        # Non-key ORDER BY columns have to appear in the SELECT list too.
        selected.extend([item.column for item in ordering if item.add_to_select])

    statement = query.combine()

    if ordering:
        statement = statement.order_by(*[item.column_order for item in ordering])

    if self._limit:
        # ``_limit`` is a (limit, offset) pair; the offset is optional.
        rowLimit, rowOffset = self._limit[0], self._limit[1]
        statement = statement.limit(rowLimit)
        if rowOffset is not None:
            statement = statement.offset(rowOffset)

    if self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT:
        statement = statement.distinct()
    return statement
def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
    """Build the specification of a table able to hold this query's rows;
    a helper for implementations of `materialize`.

    Parameters
    ----------
    constraints : `bool`, optional
        If `True` (`False` is default), define a specification that
        includes actual foreign key constraints for logical foreign keys.
        Some database engines do not permit temporary tables to reference
        normal tables, so this should be `False` when generating a spec
        for a temporary table unless the database engine is known to
        support them.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for a table that could hold this query's result rows.
    """
    # Key columns are declared as primary key only when rows are unique.
    rowsAreUnique = self.isUnique()
    tableSpec = ddl.TableSpec(fields=())
    for dimension in self.graph:
        addDimensionForeignKey(tableSpec, dimension, primaryKey=rowsAreUnique, constraint=constraints)
    for element in self.spatial:
        regionFields = SpatialRegionDatabaseRepresentation.makeFieldSpecs(
            nullable=True,
            name=f"{element.name}_region",
        )
        tableSpec.fields.update(regionFields)
    if self.getDatasetColumns() is not None:
        self.managers.datasets.addDatasetForeignKey(
            tableSpec, primaryKey=rowsAreUnique, constraint=constraints
        )
        self.managers.collections.addRunForeignKey(tableSpec, nullable=False, constraint=constraints)

    # ORDER BY may need a few extra columns; only those that carry a field
    # spec are added.
    orderByFields = (item.field_spec for item in self._order_by_columns if item.field_spec is not None)
    tableSpec.fields.update(orderByFields)

    return tableSpec
@contextmanager
def materialize(self, db: Database) -> Iterator[Query]:
    # Docstring inherited from Query.
    spec = self._makeTableSpec()
    with db.session() as session:
        table = session.makeTemporaryTable(spec)
        # Drop the temporary table on every exit path: without the
        # ``finally`` a failure while filling the table, or an exception
        # raised inside the caller's ``with`` body, would skip the drop.
        try:
            # A query that is already known to return no rows is never
            # executed; the temporary table is simply left empty.
            if not self._doomed_by:
                db.insert(table, select=self.sql, names=spec.fields.names)
            # Re-target the ORDER BY terms at the temporary table.
            order_by_columns = [column.materialized(table) for column in self._order_by_columns]
            yield MaterializedQuery(
                table=table,
                spatial=self.spatial,
                datasetType=self.datasetType,
                isUnique=self.isUnique(),
                graph=self.graph,
                whereRegion=self.whereRegion,
                managers=self.managers,
                doomed_by=self._doomed_by,
                order_by_columns=order_by_columns,
            )
        finally:
            session.dropTemporaryTable(table)
def subset(
    self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
) -> Query:
    # Docstring inherited from Query.
    graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
    if columns is None:
        # The helper signals with `None` that this query can be returned
        # unchanged.
        return self
    if columns.isEmpty():
        # Forward the doomed-by messages so that a query already known to
        # return nothing does not turn into an `EmptyQuery` that yields a
        # row; this matches what the `DirectQuery` branch below does.
        return EmptyQuery(self.graph.universe, self.managers, doomed_by=self._doomed_by)
    return DirectQuery(
        simpleQuery=self._simpleQuery.copy(),
        columns=columns,
        uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
        graph=graph,
        # The region constraint is not carried over to a unique subset.
        whereRegion=self.whereRegion if not unique else None,
        managers=self.managers,
        doomed_by=self._doomed_by,
    )
def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
    # Docstring inherited from Query.
    from ._builder import QueryBuilder

    # With no explicit summary, describe exactly the dimensions and region
    # this query already has.
    effectiveSummary = (
        QuerySummary(self.graph, whereRegion=self.whereRegion) if summary is None else summary
    )
    # Only dimensions this query already includes can be requested.
    if not effectiveSummary.requested.issubset(self.graph):
        raise NotImplementedError(
            f"Query.makeBuilder does not yet support augmenting dimensions "
            f"({effectiveSummary.requested.dimensions}) beyond those originally included in the query "
            f"({self.graph.dimensions})."
        )

    # Seed the new builder with this query, joined in as an aliased
    # subquery.
    newBuilder = QueryBuilder(effectiveSummary, managers=self.managers, doomed_by=self._doomed_by)
    subquery = self.sql.alias()
    newBuilder.joinTable(subquery, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
    return newBuilder
class MaterializedQuery(Query):
    """A `Query` implementation that represents query results saved in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType` or `None`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid (which
        may reject some rows that are in the table).
    managers : `RegistryManagers`
        A struct containing `Registry` manager helper objects, forwarded to
        the `Query` constructor.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    order_by_columns : `Iterable` [ `OrderByColumn` ], optional
        Columns used to build the ORDER BY clause when selecting from the
        table.  The iterable is copied into a tuple on construction.
    """

    def __init__(
        self,
        *,
        table: sqlalchemy.schema.Table,
        spatial: Iterable[DimensionElement],
        datasetType: Optional[DatasetType],
        isUnique: bool,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
        order_by_columns: Iterable[OrderByColumn] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, managers=managers, doomed_by=doomed_by)
        self._table = table
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique
        # Copy into a tuple (as is done for ``spatial``): ``sql`` iterates
        # these columns on every access, which a one-shot iterator would
        # not survive.
        self._order_by_columns = tuple(order_by_columns)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def order_by(self, *args: str) -> Query:
        # Docstring inherited from Query.
        raise NotImplementedError("MaterializedQuery.order_by should not be called directly")

    def limit(self, limit: int, offset: Optional[int] = None) -> Query:
        # Docstring inherited from Query.

        # Calling limit on materialized data is likely an error, limit should
        # be set before materializing.
        raise NotImplementedError("MaterializedQuery.limit should not be called directly")

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is not None:
            # No ingest date column is exposed for materialized results.
            return DatasetQueryColumns(
                datasetType=self._datasetType,
                id=self._table.columns["dataset_id"],
                runKey=self._table.columns[self.managers.collections.getRunForeignKeyName()],
                ingestDate=None,
            )
        else:
            return None

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        select = self._table.select()
        if self._order_by_columns:
            order_by_columns = [column.column_order for column in self._order_by_columns]
            select = select.order_by(*order_by_columns)
        return select

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.

        # The results already live in a temporary table, so there is
        # nothing further to do.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            return self
        if columns.isEmpty():
            # Forward the doomed-by messages so that a query known to return
            # nothing does not become an `EmptyQuery` that yields a row;
            # the `DirectQuery` branch below forwards them as well.
            return EmptyQuery(self.graph.universe, managers=self.managers, doomed_by=self._doomed_by)
        simpleQuery = SimpleQuery()
        simpleQuery.join(self._table)
        return DirectQuery(
            simpleQuery=simpleQuery,
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            whereRegion=self.whereRegion if not unique else None,
            managers=self.managers,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        # Unlike a direct query, the temporary table can be joined in as-is.
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder
class EmptyQuery(Query):
    """A `Query` implementation that handles the special case of a query
    with no columns at all.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions; its empty graph is used as this query's
        dimensions.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        universe: DimensionUniverse,
        managers: RegistryManagers,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=universe.empty, whereRegion=None, managers=managers, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No dimension {name} in query (no dimensions at all, actually).")

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(())

    def order_by(self, *args: str) -> Query:
        # Docstring inherited from Query.

        # The query is returned unchanged; no ordering is recorded.
        return self

    def limit(self, limit: int, offset: Optional[int] = None) -> Query:
        # Docstring inherited from Query.

        # The query is returned unchanged; no limit is recorded.
        return self

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No region for {name} in query (no regions at all, actually).")

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(
        self, db: Database, *, region: Optional[Region] = None
    ) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # A doomed query produces no rows at all; otherwise a single `None`
        # placeholder row is yielded.  Neither ``db`` nor ``region`` is used.
        if self._doomed_by:
            return
        yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.

        # No temporary table is created; the query itself is handed back.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        if graph is not None:
            assert graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph)
        # Nothing is joined into the builder here; it is only constructed
        # from the summary.
        if summary.requested.issubset(self.graph):
            return QueryBuilder(summary, managers=self.managers, doomed_by=self._doomed_by)
        raise NotImplementedError(
            f"Query.makeBuilder does not yet support augmenting dimensions "
            f"({summary.requested.dimensions}) beyond those originally included in the query "
            f"({self.graph.dimensions})."
        )