Coverage for python/lsst/daf/butler/registry/queries/_query.py: 23%
365 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-15 00:10 +0000
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-15 00:10 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("Query",)
25import dataclasses
26import enum
27import itertools
28from abc import ABC, abstractmethod
29from contextlib import contextmanager
30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple
32import sqlalchemy
33from lsst.sphgeom import Region
35from ...core import (
36 DataCoordinate,
37 DatasetRef,
38 DatasetType,
39 Dimension,
40 DimensionElement,
41 DimensionGraph,
42 DimensionRecord,
43 DimensionUniverse,
44 SimpleQuery,
45 addDimensionForeignKey,
46 ddl,
47)
48from ..interfaces import Database
49from ._query_backend import QueryBackend
50from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary
52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true
53 from ._builder import QueryBuilder
@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """Information about a single column in an ORDER BY clause."""

    column: sqlalchemy.sql.ColumnElement
    """Column expression to sort by (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """True for ascending order, False for descending (`bool`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """Column element with its sort direction applied, for use in an
        ORDER BY clause (`sqlalchemy.sql.ColumnElement`).
        """
        return self.column.asc() if self.ordering else self.column.desc()
class Query(ABC):
    """An abstract base class for queries that return some combination of
    `DatasetRef` and `DataCoordinate` objects.

    Parameters
    ----------
    graph : `DimensionGraph`
        Object describing the dimensions included in the query.
    whereRegion : `lsst.sphgeom.Region`, optional
        Region that all region columns in all returned rows must overlap.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.

    Notes
    -----
    The `Query` hierarchy abstracts over the database/SQL representation of a
    particular set of data IDs or datasets.  It is expected to be used as a
    backend for other objects that provide more natural interfaces for one or
    both of these, not as part of a public interface to query results.
    """

    def __init__(
        self,
        *,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        backend: QueryBackend,
        doomed_by: Iterable[str] = (),
    ):
        self.graph = graph
        self.whereRegion = whereRegion
        self.backend = backend
        # Freeze into a tuple so later mutation of the caller's iterable
        # cannot change these diagnostics.
        self._doomed_by = tuple(doomed_by)
        # Counters for rows rejected by post-query region filtering; populated
        # by `rows` and reported by `explain_no_results`.
        self._filtered_by_join: Optional[int] = None
        self._filtered_by_where: Optional[int] = None

    @abstractmethod
    def isUnique(self) -> bool:
        """Return `True` if this query's rows are guaranteed to be unique, and
        `False` otherwise.

        If this query has dataset results (`datasetType` is not `None`),
        uniqueness applies to the `DatasetRef` instances returned by
        `extractDatasetRef` from the result of `rows`.  If it does not have
        dataset results, uniqueness applies to the `DataCoordinate` instances
        returned by `extractDataId`.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return the query column that contains the primary key value for
        the dimension with the given name.

        Parameters
        ----------
        name : `str`
            Name of the dimension.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`.
            SQLAlchemy object representing a column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def spatial(self) -> Iterator[DimensionElement]:
        """An iterator over the dimension element columns used in post-query
        filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]).

        Notes
        -----
        This property is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to access this property.
        """
        raise NotImplementedError()

    @abstractmethod
    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return a region column for one of the dimension elements iterated
        over by `spatial`.

        Parameters
        ----------
        name : `str`
            Name of the element.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy object representing a result column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    def datasetType(self) -> Optional[DatasetType]:
        """The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results (`DatasetType` or `None`).
        """
        cols = self.getDatasetColumns()
        if cols is None:
            return None
        return cols.datasetType

    def count(self, db: Database, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`, the
            result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        if self._doomed_by:
            return 0
        sql = self.sql
        if sql is None:
            # Special case: a query with no columns has exactly one logical
            # row (see the `sql` property).
            return 1
        # NOTE(review): `spatial` is documented to return an iterator, and an
        # iterator object is always truthy even when exhausted/empty — confirm
        # whether implementations are expected to return a sized collection
        # here; otherwise ``exact=True`` always takes the slow path.
        if exact and self.spatial:
            filtered_count = 0
            for _ in self.rows(db):
                filtered_count += 1
            return filtered_count
        else:
            with db.query(sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)) as sql_result:
                return sql_result.scalar()

    def any(
        self,
        db: Database,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows.  `False` if it definitely would not.
        """
        if self._doomed_by:
            return False
        sql = self.sql
        if sql is None:
            # The no-column query always has its single logical row.
            return True
        if exact and not execute:
            raise TypeError("Cannot obtain exact results without executing the query.")
        if exact and self.spatial:
            # Post-query filtering applies; the first surviving row proves
            # non-emptiness.
            for _ in self.rows(db):
                return True
            return False
        elif execute:
            with db.query(sql.limit(1)) as sql_result:
                return sql_result.one_or_none() is not None
        else:
            return True

    def explain_no_results(
        self,
        db: Database,
        *,
        followup: bool = True,
    ) -> Iterator[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        followup : `bool`, optional
            If `True` (default) perform inexpensive follow-up queries if no
            diagnostics are available from query generation alone.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.

        Notes
        -----
        Messages related to post-query filtering are only available if `rows`,
        `any`, or `count` was already called with the same region (with
        ``exact=True`` for the latter two).
        """
        # Local import to avoid a circular dependency at module load time.
        from ._builder import QueryBuilder

        if self._doomed_by:
            yield from self._doomed_by
            return
        if self._filtered_by_where:
            yield (
                f"{self._filtered_by_where} result rows were filtered out because "
                "one or more regions did not overlap the WHERE-clause region."
            )
        if self._filtered_by_join:
            yield (
                f"{self._filtered_by_join} result rows were filtered out because "
                "one or more regions did not overlap."
            )
        if (not followup) or self._filtered_by_join or self._filtered_by_where:
            return
        # Query didn't return results even before client-side filtering, and
        # caller says we can do follow-up queries to determine why.
        # Start by seeing if there are _any_ dimension records for each element
        # involved.
        for element in self.graph.elements:
            summary = QuerySummary(element.graph)
            builder = QueryBuilder(summary, self.backend)
            followup_query = builder.finish()
            if not followup_query.any(db, exact=False):
                yield f"No dimension records for element '{element.name}' found."
                yield from followup_query.explain_no_results(db, followup=False)
                return

    @abstractmethod
    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        """Return the columns for the datasets returned by this query.

        Returns
        -------
        columns : `DatasetQueryColumns` or `None`
            Struct containing SQLAlchemy representations of the result columns
            for a dataset.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to implement
        and the ABC to call in order to provide higher-level functionality;
        code that uses `Query` objects (but does not implement one) should
        usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        """A SQLAlchemy object representing the full query
        (`sqlalchemy.sql.FromClause` or `None`).

        This is `None` in the special case where the query has no columns, and
        only one logical row.
        """
        raise NotImplementedError()

    def rows(self, db: Database) -> Iterator[Optional[sqlalchemy.engine.Row]]:
        """Execute the query and yield result rows, applying `predicate`.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.

        Yields
        ------
        row : `sqlalchemy.engine.RowProxy` or `None`
            Result row from the query.  `None` may be yielded exactly once
            instead of any real rows to indicate an empty query (see
            `EmptyQuery`).
        """
        if self._doomed_by:
            return
        self._filtered_by_where = 0
        self._filtered_by_join = 0
        with db.query(self.sql) as sql_result:
            sql_rows = sql_result.fetchall()
        for row in sql_rows:
            rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial]
            # Reject rows whose regions do not overlap the WHERE-clause
            # region (conservative database-side overlap is only approximate).
            if self.whereRegion and any(r.isDisjointFrom(self.whereRegion) for r in rowRegions):
                self._filtered_by_where += 1
                continue
            # Reject rows where any pair of spatial-join regions is disjoint.
            if any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
                self._filtered_by_join += 1
                continue
            yield row

    def extractDimensionsTuple(
        self, row: Optional[sqlalchemy.engine.RowProxy], dimensions: Iterable[Dimension]
    ) -> tuple:
        """Extract a tuple of data ID values from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        dimensions : `Iterable` [ `Dimension` ]
            The dimensions to include in the returned tuple, in order.

        Returns
        -------
        values : `tuple`
            A tuple of dimension primary key values.
        """
        if row is None:
            assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
            return ()
        return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions)

    def extractDataId(
        self,
        row: Optional[sqlalchemy.engine.RowProxy],
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinate:
        """Extract a data ID from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to indicate
            the row from an `EmptyQuery`.
        graph : `DimensionGraph`, optional
            The dimensions the returned data ID should identify.  If not
            provided, this will be all dimensions in `QuerySummary.requested`.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Nested mapping containing records to attach to the returned
            `DataCoordinate`, for which `~DataCoordinate.hasRecords` will
            return `True`.  If provided, outer keys must include all dimension
            element names in ``graph``, and inner keys should be tuples of
            dimension primary key values in the same order as
            ``element.graph.required``.  If not provided,
            `DataCoordinate.hasRecords` will return `False` on the returned
            object.

        Returns
        -------
        dataId : `DataCoordinate`
            A data ID that identifies all required and implied dimensions.  If
            ``records is not None``, this will have
            `~DataCoordinate.hasRecords()` return `True`.
        """
        if graph is None:
            graph = self.graph
        if not graph:
            return DataCoordinate.makeEmpty(self.graph.universe)
        dataId = DataCoordinate.fromFullValues(
            graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied))
        )
        if records is not None:
            recordsForRow = {}
            for element in graph.elements:
                key = tuple(dataId.subset(element.graph).values())
                recordsForRow[element.name] = records[element.name].get(key)
            return dataId.expanded(recordsForRow)
        else:
            return dataId

    def extractDatasetRef(
        self,
        row: sqlalchemy.engine.RowProxy,
        dataId: Optional[DataCoordinate] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DatasetRef:
        """Extract a `DatasetRef` from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy`
            A result row from a SQLAlchemy SELECT query.
        dataId : `DataCoordinate`
            Data ID to attach to the `DatasetRef`.  A minimal (i.e. base
            class) `DataCoordinate` is constructed from ``row`` if `None`.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Records to use to return an `ExpandedDataCoordinate`.  If
            provided, outer keys must include all dimension element names in
            ``graph``, and inner keys should be tuples of dimension primary
            key values in the same order as ``element.graph.required``.

        Returns
        -------
        ref : `DatasetRef`
            Reference to the dataset; guaranteed to have `DatasetRef.id` not
            `None`.
        """
        datasetColumns = self.getDatasetColumns()
        assert datasetColumns is not None
        if dataId is None:
            dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records)
        runRecord = self.backend.managers.collections[row._mapping[datasetColumns.runKey]]
        return DatasetRef(
            datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name
        )

    def _makeSubsetQueryColumns(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Tuple[DimensionGraph, Optional[QueryColumns]]:
        """Helper method for subclass implementations of `subset`.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            ``subset`` implementations should generally just forward their
            own ``graph`` argument here.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        graph : `DimensionGraph`
            The dimensions of the new `Query`.  This is exactly the same as
            the argument of the same name, with ``self.graph`` used if that
            argument is `None`.
        columns : `QueryColumns` or `None`
            A struct containing the SQLAlchemy column objects to use in the
            new query, constructed by delegating to other (mostly abstract)
            methods on ``self``.  If `None`, `subset` may return ``self``.
        """
        if graph is None:
            graph = self.graph
        if (
            graph == self.graph
            and (self.getDatasetColumns() is None or datasets)
            and (self.isUnique() or not unique)
        ):
            # Nothing would change; signal that `subset` may return `self`.
            return graph, None
        columns = QueryColumns()
        for dimension in graph.dimensions:
            col = self.getDimensionColumn(dimension.name)
            columns.keys[dimension] = [col]
        if not unique:
            # Region columns are only propagated when uniqueness was not
            # requested; see the `subset` docstring for why.
            for element in self.spatial:
                col = self.getRegionColumn(element.name)
                columns.regions[element] = col
        if datasets and self.getDatasetColumns() is not None:
            columns.datasets = self.getDatasetColumns()
        return graph, columns

    @abstractmethod
    def materialize(self, db: Database) -> ContextManager[Query]:
        """Execute this query and insert its results into a temporary table.

        Parameters
        ----------
        db : `Database`
            Database engine to execute the query against.

        Returns
        -------
        context : `typing.ContextManager` [ `MaterializedQuery` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a `MaterializedQuery` object
            backed by that table), and dropped in ``__exit__``.  If ``self``
            is already a `MaterializedQuery`, ``__enter__`` may just return
            ``self`` and ``__exit__`` may do nothing (reflecting the fact that
            an outer context manager should already take care of everything
            else).
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        """Return a new `Query` whose columns and/or rows are (mostly) subset
        of this one's.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            If `None` (default), ``self.graph`` is used.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this may
            come with a performance penalty).

        Returns
        -------
        query : `Query`
            A query object corresponding to the given inputs.  May be ``self``
            if no changes were requested.

        Notes
        -----
        The way spatial overlaps are handled at present makes it impossible to
        fully guarantee in general that the new query's rows are a subset of
        this one's while also returning unique rows.  That's because the
        database is only capable of performing approximate, conservative
        overlaps via the common skypix system; we defer actual region overlap
        operations to per-result-row Python logic.  But including the region
        columns necessary to do that postprocessing in the query makes it
        impossible to do a SELECT DISTINCT on the user-visible dimensions of
        the query.  For example, consider starting with a query with dimensions
        (instrument, skymap, visit, tract).  That involves a spatial join
        between visit and tract, and we include the region columns from both
        tables in the results in order to only actually yield result rows
        (see `predicate` and `rows`) where the regions in those two columns
        overlap.  If the user then wants to subset to just (skymap, tract) with
        unique results, we have two unpalatable options:

        - we can do a SELECT DISTINCT with just the skymap and tract columns
          in the SELECT clause, dropping all detailed overlap information and
          including some tracts that did not actually overlap any of the
          visits in the original query (but were regarded as _possibly_
          overlapping via the coarser, common-skypix relationships);

        - we can include the tract and visit region columns in the query, and
          continue to filter out the non-overlapping pairs, but completely
          disregard the user's request for unique tracts.

        This interface specifies that implementations must do the former, as
        that's what makes things efficient in our most important use case
        (``QuantumGraph`` generation in ``pipe_base``).  We may be able to
        improve this situation in the future by putting exact overlap
        information in the database, either by using built-in (but
        engine-specific) spatial database functionality or (more likely)
        switching to a scheme in which pairwise dimension spatial relationships
        are explicitly precomputed (for e.g. combinations of instruments and
        skymaps).
        """
        raise NotImplementedError()

    @abstractmethod
    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        """Return a `QueryBuilder` that can be used to construct a new `Query`
        that is joined to (and hence constrained by) this one.

        Parameters
        ----------
        summary : `QuerySummary`, optional
            A `QuerySummary` instance that specifies the dimensions and any
            additional constraints to include in the new query being
            constructed, or `None` to use the dimensions of ``self`` with no
            additional constraints.
        """
        raise NotImplementedError()

    graph: DimensionGraph
    """The dimensions identified by this query and included in any data IDs
    created from its result rows (`DimensionGraph`).
    """

    whereRegion: Optional[Region]
    """A spatial region that all regions in all rows returned by this query
    must overlap (`lsst.sphgeom.Region` or `None`).
    """

    backend: QueryBackend
    """Backend object that represents the `Registry` implementation.
    """
class DirectQueryUniqueness(enum.Enum):
    """Ways in which a query's result rows may (or may not) be unique."""

    # Rows may contain duplicates; no deduplication is attempted.
    NOT_UNIQUE = enum.auto()

    # Row uniqueness is guaranteed by how the query was constructed, with no
    # explicit SELECT DISTINCT or GROUP BY clause required.
    NATURALLY_UNIQUE = enum.auto()

    # Unique rows are desired but require an explicit SELECT DISTINCT (or an
    # equivalent GROUP BY clause) in the emitted SQL to achieve.
    NEEDS_DISTINCT = enum.auto()
class DirectQuery(Query):
    """A `Query` implementation that represents a direct SELECT query that
    usually joins many tables.

    `DirectQuery` objects should generally only be constructed by
    `QueryBuilder` or the methods of other `Query` objects.

    Parameters
    ----------
    simpleQuery : `SimpleQuery`
        Struct representing the actual SELECT, FROM, and WHERE clauses.
    columns : `QueryColumns`
        Columns that are referenced in the query in any clause.
    uniqueness : `DirectQueryUniqueness`
        Enum value indicating whether the query should yield unique result
        rows, and if so whether that needs to be explicitly requested of the
        database.
    graph : `DimensionGraph`
        Object describing the dimensions included in the query.
    whereRegion : `lsst.sphgeom.Region`, optional
        Region that all region columns in all returned rows must overlap.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    order_by_columns : `Iterable` [ `OrderByColumn` ], optional
        Columns to sort by, in order; used to add an ORDER BY clause to the
        generated SQL.
    limit : `Tuple` [ `int`, `Optional` [ `int` ] ], optional
        Limit (and optional offset) to apply to the generated SQL, or `None`
        for no limit.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        *,
        simpleQuery: SimpleQuery,
        columns: QueryColumns,
        uniqueness: DirectQueryUniqueness,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        backend: QueryBackend,
        order_by_columns: Iterable[OrderByColumn] = (),
        limit: Optional[Tuple[int, Optional[int]]] = None,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, backend=backend, doomed_by=doomed_by)
        assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
        assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
        self._simpleQuery = simpleQuery
        self._columns = columns
        self._uniqueness = uniqueness
        # Materialize to a tuple: both the `sql` property and `_makeTableSpec`
        # truth-test and iterate this attribute, possibly more than once, so a
        # one-shot iterator argument would otherwise silently misbehave.
        self._order_by_columns = tuple(order_by_columns)
        self._limit = limit
        # Lazily-populated caches of labeled column objects, so repeated
        # accesses return the same SQLAlchemy objects.
        self._datasetQueryColumns: Optional[DatasetQueryColumns] = None
        self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
        self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._uniqueness is not DirectQueryUniqueness.NOT_UNIQUE

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        column = self._dimensionColumns.get(name)
        if column is None:
            column = self._columns.getKeyColumn(name).label(name)
            self._dimensionColumns[name] = column
        return column

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._columns.regions)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        column = self._regionColumns.get(name)
        if column is None:
            column = self._columns.regions[name].label(f"{name}_region")
            self._regionColumns[name] = column
        return column

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetQueryColumns is None:
            base = self._columns.datasets
            if base is None:
                return None
            ingestDate = base.ingestDate
            if ingestDate is not None:
                ingestDate = ingestDate.label("ingest_date")
            self._datasetQueryColumns = DatasetQueryColumns(
                datasetType=base.datasetType,
                id=base.id.label("dataset_id"),
                runKey=base.runKey.label(self.backend.managers.collections.getRunForeignKeyName()),
                ingestDate=ingestDate,
            )
        return self._datasetQueryColumns

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        # Work on a copy so repeated accesses don't accumulate columns.
        simpleQuery = self._simpleQuery.copy()
        for dimension in self.graph:
            simpleQuery.columns.append(self.getDimensionColumn(dimension.name))
        for element in self.spatial:
            simpleQuery.columns.append(self.getRegionColumn(element.name))
        datasetColumns = self.getDatasetColumns()
        if datasetColumns is not None:
            simpleQuery.columns.extend(datasetColumns)

        assert not simpleQuery.order_by, "Input query cannot have ORDER BY"
        if self._order_by_columns:
            # Add a window-function rank column and ORDER BY it, so the sort
            # survives materialization into a table.
            order_by_columns = [column.column_order for column in self._order_by_columns]
            order_by_column = sqlalchemy.func.row_number().over(order_by=order_by_columns).label("_orderby")
            simpleQuery.columns.append(order_by_column)
            simpleQuery.order_by = [order_by_column]

        assert simpleQuery.limit is None, "Input query cannot have LIMIT"
        simpleQuery.limit = self._limit

        sql = simpleQuery.combine()

        if self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT:
            return sql.distinct()
        else:
            return sql

    def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
        """Helper method for subclass implementations of `materialize`.

        Parameters
        ----------
        constraints : `bool`, optional
            If `True` (`False` is default), define a specification that
            includes actual foreign key constraints for logical foreign keys.
            Some database engines do not permit temporary tables to reference
            normal tables, so this should be `False` when generating a spec
            for a temporary table unless the database engine is known to
            support them.

        Returns
        -------
        spec : `ddl.TableSpec`
            Specification for a table that could hold this query's result rows.
        """
        unique = self.isUnique()
        spec = ddl.TableSpec(fields=())
        for dimension in self.graph:
            addDimensionForeignKey(spec, dimension, primaryKey=unique, constraint=constraints)
        for element in self.spatial:
            spec.fields.add(ddl.FieldSpec.for_region(f"{element.name}_region"))
        datasetColumns = self.getDatasetColumns()
        if datasetColumns is not None:
            self.backend.managers.datasets.addDatasetForeignKey(
                spec, primaryKey=unique, constraint=constraints
            )
            self.backend.managers.collections.addRunForeignKey(spec, nullable=False, constraint=constraints)

        # Need a column for ORDER BY if ordering is requested.
        if self._order_by_columns:
            spec.fields.add(
                ddl.FieldSpec(
                    name="_orderby",
                    dtype=sqlalchemy.BigInteger,
                    nullable=False,
                    doc="Column to use with ORDER BY",
                )
            )

        return spec

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        spec = self._makeTableSpec()
        with db.temporary_table(spec) as table:
            if not self._doomed_by:
                db.insert(table, select=self.sql, names=spec.fields.names)
            yield MaterializedQuery(
                table=table,
                spatial=self.spatial,
                datasetType=self.datasetType,
                isUnique=self.isUnique(),
                graph=self.graph,
                whereRegion=self.whereRegion,
                backend=self.backend,
                doomed_by=self._doomed_by,
            )

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            return self
        if columns.isEmpty():
            return EmptyQuery(self.graph.universe, self.backend)
        return DirectQuery(
            simpleQuery=self._simpleQuery.copy(),
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            whereRegion=self.whereRegion if not unique else None,
            backend=self.backend,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Local import to avoid a circular dependency at module load time.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, backend=self.backend, doomed_by=self._doomed_by)
        builder.joinTable(
            self.sql.alias(), dimensions=self.graph.dimensions, datasets=self.getDatasetColumns()
        )
        return builder
class MaterializedQuery(Query):
    """A `Query` implementation backed by results already stored in a
    temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for a result
        row to be valid; rows present in the table may still be rejected by
        this test.
    datasetType : `DatasetType`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are already unique, so results need no
        ``SELECT DISTINCT``.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid;
        rows present in the table may still be rejected by this test.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    doomed_by : `Iterable` [ `str` ], optional
        Messages (appropriate for e.g. logging or exceptions) explaining why
        the query is known to return no results even before it is executed.
        Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        *,
        table: sqlalchemy.schema.Table,
        spatial: Iterable[DimensionElement],
        datasetType: Optional[DatasetType],
        isUnique: bool,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        backend: QueryBackend,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, backend=backend, doomed_by=doomed_by)
        self._table = table
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Region columns follow the "<element>_region" naming convention
        # established when the table spec was built.
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is None:
            return None
        run_key_name = self.backend.managers.collections.getRunForeignKeyName()
        return DatasetQueryColumns(
            datasetType=self._datasetType,
            id=self._table.columns["dataset_id"],
            runKey=self._table.columns[run_key_name],
            # Ingest dates are not carried into materialized tables.
            ingestDate=None,
        )

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        stmt = self._table.select()
        # Preserve any requested ordering via the helper "_orderby" column
        # added when the table spec was built.
        if "_orderby" in self._table.columns:
            stmt = stmt.order_by(self._table.columns["_orderby"])
        return stmt

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already materialized; yield this same query.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            # Subsetting would change nothing; reuse this query unchanged.
            return self
        if columns.isEmpty():
            return EmptyQuery(self.graph.universe, self.backend)
        base_query = SimpleQuery()
        base_query.join(self._table)
        return DirectQuery(
            simpleQuery=base_query,
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            whereRegion=None if unique else self.whereRegion,
            backend=self.backend,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Local import avoids a circular dependency with the builder module.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, backend=self.backend, doomed_by=self._doomed_by)
        builder.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return builder
class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the
    query would have no columns at all.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the empty (null) set is extracted.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    doomed_by : `Iterable` [ `str` ], optional
        Messages (appropriate for e.g. logging or exceptions) explaining why
        the query is known to return no results even before it is executed.
        Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        universe: DimensionUniverse,
        backend: QueryBackend,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=universe.empty, whereRegion=None, backend=backend, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        # With no columns there is at most one logical row, trivially unique.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No dimension {name} in query (no dimensions at all, actually).")

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        # No dimensions, hence no spatial elements.
        return iter(())

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No region for {name} in query (no regions at all, actually).")

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(self, db: Database) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # A non-doomed empty query yields exactly one (empty) logical row;
        # a doomed one yields none.
        if not self._doomed_by:
            yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        # There is no SQL to execute for a query with no columns.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Nothing to store; yield this same query.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        # Any subset of the empty dimension set is the empty set itself.
        assert graph is None or graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Local import avoids a circular dependency with the builder module.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        # No table to join; the builder starts from an empty query.
        return QueryBuilder(summary, backend=self.backend, doomed_by=self._doomed_by)