Coverage for python/lsst/daf/butler/registry/queries/_query.py: 25%
363 statements
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-27 08:58 +0000
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-27 08:58 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("Query",)
25import dataclasses
26import enum
27import itertools
28from abc import ABC, abstractmethod
29from contextlib import contextmanager
30from typing import TYPE_CHECKING, ContextManager, Dict, Iterable, Iterator, Mapping, Optional, Tuple
32import sqlalchemy
33from lsst.sphgeom import Region
35from ...core import (
36 DataCoordinate,
37 DatasetRef,
38 DatasetType,
39 Dimension,
40 DimensionElement,
41 DimensionGraph,
42 DimensionRecord,
43 DimensionUniverse,
44 SimpleQuery,
45 addDimensionForeignKey,
46 ddl,
47)
48from ..interfaces import Database
49from ._query_backend import QueryBackend
50from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary
52if TYPE_CHECKING: 52 ↛ 53line 52 didn't jump to line 53, because the condition on line 52 was never true
53 from ._builder import QueryBuilder
@dataclasses.dataclass(frozen=True)
class OrderByColumn:
    """Description of a single column participating in an ORDER BY clause."""

    column: sqlalchemy.sql.ColumnElement
    """Column element to sort on (`sqlalchemy.sql.ColumnElement`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""

    @property
    def column_order(self) -> sqlalchemy.sql.ColumnElement:
        """Column element decorated with its sort direction, ready for use
        in an ORDER BY clause (`sqlalchemy.sql.ColumnElement`).
        """
        if self.ordering:
            return self.column.asc()
        return self.column.desc()
class Query(ABC):
    """An abstract base class for queries that return some combination of
    `DatasetRef` and `DataCoordinate` objects.

    Parameters
    ----------
    graph : `DimensionGraph`
        Object describing the dimensions included in the query.
    whereRegion : `lsst.sphgeom.Region`, optional
        Region that all region columns in all returned rows must overlap.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.

    Notes
    -----
    The `Query` hierarchy abstracts over the database/SQL representation of a
    particular set of data IDs or datasets.  It is expected to be used as a
    backend for other objects that provide more natural interfaces for one or
    both of these, not as part of a public interface to query results.
    """

    def __init__(
        self,
        *,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        backend: QueryBackend,
        doomed_by: Iterable[str] = (),
    ):
        self.graph = graph
        self.whereRegion = whereRegion
        self.backend = backend
        self._doomed_by = tuple(doomed_by)
        # Counts of result rows rejected by Python-side region filtering.
        # These remain `None` until `rows` is actually iterated, and are
        # reported by `explain_no_results`.
        self._filtered_by_join: Optional[int] = None
        self._filtered_by_where: Optional[int] = None

    @abstractmethod
    def isUnique(self) -> bool:
        """Return `True` if this query's rows are guaranteed to be unique,
        and `False` otherwise.

        If this query has dataset results (`datasetType` is not `None`),
        uniqueness applies to the `DatasetRef` instances returned by
        `extractDatasetRef` from the result of `rows`.  If it does not have
        dataset results, uniqueness applies to the `DataCoordinate` instances
        returned by `extractDataId`.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return the query column that contains the primary key value for
        the dimension with the given name.

        Parameters
        ----------
        name : `str`
            Name of the dimension.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`.
            SQLAlchemy object representing a column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def spatial(self) -> Iterator[DimensionElement]:
        """An iterator over the dimension element columns used in post-query
        filtering of spatial overlaps (`Iterator` [ `DimensionElement` ]).

        Notes
        -----
        This property is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to access this property.
        """
        raise NotImplementedError()

    @abstractmethod
    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        """Return a region column for one of the dimension elements iterated
        over by `spatial`.

        Parameters
        ----------
        name : `str`
            Name of the element.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy representing a result column in the query.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    def datasetType(self) -> Optional[DatasetType]:
        """The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results (`DatasetType` or `None`).
        """
        cols = self.getDatasetColumns()
        if cols is None:
            return None
        return cols.datasetType

    def count(self, db: Database, *, exact: bool = True) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count.  If `False`,
            the result may be an upper bound.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique
        rows returned, so even with ``exact=True`` it may provide only an
        upper bound on the number of *deduplicated* result rows.
        """
        if self._doomed_by:
            return 0
        sql = self.sql
        if sql is None:
            # Special case (see `sql`): a query with no columns has exactly
            # one logical row.
            return 1
        if exact and self.spatial:
            # NOTE(review): `self.spatial` is an iterator, and iterator
            # objects are always truthy, so this branch is taken whenever
            # ``exact`` is set — confirm whether an emptiness test on the
            # spatial elements was intended here.
            filtered_count = 0
            for _ in self.rows(db):
                filtered_count += 1
            return filtered_count
        else:
            return db.query(sql.with_only_columns([sqlalchemy.sql.func.count()]).order_by(None)).scalar()

    def any(
        self,
        db: Database,
        *,
        execute: bool = True,
        exact: bool = True,
    ) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no
            rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found.  If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments)
            yield result rows.  `False` if it definitely would not.
        """
        if self._doomed_by:
            return False
        sql = self.sql
        if sql is None:
            # No-column query always has one logical row.
            return True
        if exact and not execute:
            raise TypeError("Cannot obtain exact results without executing the query.")
        if exact and self.spatial:
            # NOTE(review): same truthiness caveat as in `count` — an
            # iterator is always truthy, so ``exact`` alone selects this
            # branch.
            for _ in self.rows(db):
                return True
            return False
        elif execute:
            return db.query(sql.limit(1)).one_or_none() is not None
        else:
            return True

    def explain_no_results(
        self,
        db: Database,
        *,
        followup: bool = True,
    ) -> Iterator[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.
        followup : `bool`, optional
            If `True` (default) perform inexpensive follow-up queries if no
            diagnostics are available from query generation alone.

        Returns
        -------
        messages : `Iterator` [ `str` ]
            String messages that describe reasons the query might not yield
            any results.

        Notes
        -----
        Messages related to post-query filtering are only available if
        `rows`, `any`, or `count` was already called with the same region
        (with ``exact=True`` for the latter two).
        """
        # Imported here (not at module scope) to avoid a circular import
        # with ._builder.
        from ._builder import QueryBuilder

        if self._doomed_by:
            yield from self._doomed_by
            return
        if self._filtered_by_where:
            yield (
                f"{self._filtered_by_where} result rows were filtered out because "
                "one or more region did not overlap the WHERE-clause region."
            )
        if self._filtered_by_join:
            yield (
                f"{self._filtered_by_join} result rows were filtered out because "
                "one or more regions did not overlap."
            )
        if (not followup) or self._filtered_by_join or self._filtered_by_where:
            return
        # Query didn't return results even before client-side filtering, and
        # caller says we can do follow-up queries to determine why.
        # Start by seeing if there are _any_ dimension records for each
        # element involved.
        for element in self.graph.elements:
            summary = QuerySummary(element.graph)
            builder = QueryBuilder(summary, self.backend)
            followup_query = builder.finish()
            if not followup_query.any(db, exact=False):
                yield f"No dimension records for element '{element.name}' found."
                yield from followup_query.explain_no_results(db, followup=False)
                return

    @abstractmethod
    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        """Return the columns for the datasets returned by this query.

        Returns
        -------
        columns : `DatasetQueryColumns` or `None`
            Struct containing SQLAlchemy representations of the result
            columns for a dataset.

        Notes
        -----
        This method is intended primarily as a hook for subclasses to
        implement and the ABC to call in order to provide higher-level
        functionality; code that uses `Query` objects (but does not implement
        one) should usually not have to call this method.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        """A SQLAlchemy object representing the full query
        (`sqlalchemy.sql.FromClause` or `None`).

        This is `None` in the special case where the query has no columns,
        and only one logical row.
        """
        raise NotImplementedError()

    def rows(self, db: Database) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        """Execute the query and yield result rows, applying `predicate`.

        Parameters
        ----------
        db : `Database`
            Object managing the database connection.

        Yields
        ------
        row : `sqlalchemy.engine.RowProxy` or `None`
            Result row from the query.  `None` may be yielded exactly once
            instead of any real rows to indicate an empty query (see
            `EmptyQuery`).
        """
        if self._doomed_by:
            return
        # Reset the post-filter counters so `explain_no_results` reflects
        # this execution only.
        self._filtered_by_where = 0
        self._filtered_by_join = 0
        for row in db.query(self.sql):
            rowRegions = [row._mapping[self.getRegionColumn(element.name)] for element in self.spatial]
            # Reject rows whose regions miss the global WHERE-clause region.
            if self.whereRegion and any(r.isDisjointFrom(self.whereRegion) for r in rowRegions):
                self._filtered_by_where += 1
                continue
            # Reject rows where any pair of per-row regions is disjoint.
            # NOTE(review): ``not not any(...)`` is equivalent to plain
            # ``any(...)``; the double negation is redundant but harmless.
            if not not any(a.isDisjointFrom(b) for a, b in itertools.combinations(rowRegions, 2)):
                self._filtered_by_join += 1
                continue
            yield row

    def extractDimensionsTuple(
        self, row: Optional[sqlalchemy.engine.RowProxy], dimensions: Iterable[Dimension]
    ) -> tuple:
        """Extract a tuple of data ID values from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to
            indicate the row from an `EmptyQuery`.
        dimensions : `Iterable` [ `Dimension` ]
            The dimensions to include in the returned tuple, in order.

        Returns
        -------
        values : `tuple`
            A tuple of dimension primary key values.
        """
        if row is None:
            assert not tuple(dimensions), "Can only utilize empty query row when there are no dimensions."
            return ()
        return tuple(row._mapping[self.getDimensionColumn(dimension.name)] for dimension in dimensions)

    def extractDataId(
        self,
        row: Optional[sqlalchemy.engine.RowProxy],
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DataCoordinate:
        """Extract a data ID from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy` or `None`
            A result row from a SQLAlchemy SELECT query, or `None` to
            indicate the row from an `EmptyQuery`.
        graph : `DimensionGraph`, optional
            The dimensions the returned data ID should identify.  If not
            provided, this will be all dimensions in
            `QuerySummary.requested`.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Nested mapping containing records to attach to the returned
            `DataCoordinate`, for which `~DataCoordinate.hasRecords` will
            return `True`.  If provided, outer keys must include all
            dimension element names in ``graph``, and inner keys should be
            tuples of dimension primary key values in the same order as
            ``element.graph.required``.  If not provided,
            `DataCoordinate.hasRecords` will return `False` on the returned
            object.

        Returns
        -------
        dataId : `DataCoordinate`
            A data ID that identifies all required and implied dimensions.
            If ``records is not None``, this will have
            `~DataCoordinate.hasRecords()` return `True`.
        """
        if graph is None:
            graph = self.graph
        if not graph:
            return DataCoordinate.makeEmpty(self.graph.universe)
        dataId = DataCoordinate.fromFullValues(
            graph, self.extractDimensionsTuple(row, itertools.chain(graph.required, graph.implied))
        )
        if records is not None:
            recordsForRow = {}
            for element in graph.elements:
                # Inner keys of ``records`` are tuples of required dimension
                # values, in ``element.graph.required`` order.
                key = tuple(dataId.subset(element.graph).values())
                recordsForRow[element.name] = records[element.name].get(key)
            return dataId.expanded(recordsForRow)
        else:
            return dataId

    def extractDatasetRef(
        self,
        row: sqlalchemy.engine.RowProxy,
        dataId: Optional[DataCoordinate] = None,
        records: Optional[Mapping[str, Mapping[tuple, DimensionRecord]]] = None,
    ) -> DatasetRef:
        """Extract a `DatasetRef` from a result row.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy`
            A result row from a SQLAlchemy SELECT query.
        dataId : `DataCoordinate`
            Data ID to attach to the `DatasetRef`.  A minimal (i.e. base
            class) `DataCoordinate` is constructed from ``row`` if `None`.
        records : `Mapping` [ `str`, `Mapping` [ `tuple`, `DimensionRecord` ] ]
            Records to use to return an `ExpandedDataCoordinate`.  If
            provided, outer keys must include all dimension element names in
            ``graph``, and inner keys should be tuples of dimension primary
            key values in the same order as ``element.graph.required``.

        Returns
        -------
        ref : `DatasetRef`
            Reference to the dataset; guaranteed to have `DatasetRef.id` not
            `None`.
        """
        datasetColumns = self.getDatasetColumns()
        assert datasetColumns is not None
        if dataId is None:
            dataId = self.extractDataId(row, graph=datasetColumns.datasetType.dimensions, records=records)
        # Resolve the run collection record from its foreign key value.
        runRecord = self.backend.managers.collections[row._mapping[datasetColumns.runKey]]
        return DatasetRef(
            datasetColumns.datasetType, dataId, id=row._mapping[datasetColumns.id], run=runRecord.name
        )

    def _makeSubsetQueryColumns(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Tuple[DimensionGraph, Optional[QueryColumns]]:
        """Helper method for subclass implementations of `subset`.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            ``subset`` implementations should generally just forward their
            own ``graph`` argument here.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this
            may come with a performance penalty).

        Returns
        -------
        graph : `DimensionGraph`
            The dimensions of the new `Query`.  This is exactly the same as
            the argument of the same name, with ``self.graph`` used if that
            argument is `None`.
        columns : `QueryColumns` or `None`
            A struct containing the SQLAlchemy column objects to use in the
            new query, constructed by delegating to other (mostly abstract)
            methods on ``self``.  If `None`, `subset` may return ``self``.
        """
        if graph is None:
            graph = self.graph
        # If nothing actually changes, signal (via columns=None) that the
        # caller may reuse ``self``.
        if (
            graph == self.graph
            and (self.getDatasetColumns() is None or datasets)
            and (self.isUnique() or not unique)
        ):
            return graph, None
        columns = QueryColumns()
        for dimension in graph.dimensions:
            col = self.getDimensionColumn(dimension.name)
            columns.keys[dimension] = [col]
        if not unique:
            # Region columns are only carried along when uniqueness is not
            # requested; see the `subset` docstring for why these are
            # mutually exclusive.
            for element in self.spatial:
                col = self.getRegionColumn(element.name)
                columns.regions[element] = col
        if datasets and self.getDatasetColumns() is not None:
            columns.datasets = self.getDatasetColumns()
        return graph, columns

    @abstractmethod
    def materialize(self, db: Database) -> ContextManager[Query]:
        """Execute this query and insert its results into a temporary table.

        Parameters
        ----------
        db : `Database`
            Database engine to execute the query against.

        Returns
        -------
        context : `typing.ContextManager` [ `MaterializedQuery` ]
            A context manager that ensures the temporary table is created
            and populated in ``__enter__`` (returning a `MaterializedQuery`
            object backed by that table), and dropped in ``__exit__``.  If
            ``self`` is already a `MaterializedQuery`, ``__enter__`` may
            just return ``self`` and ``__exit__`` may do nothing (reflecting
            the fact that an outer context manager should already take care
            of everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        """Return a new `Query` whose columns and/or rows are (mostly)
        subset of this one's.

        Parameters
        ----------
        graph : `DimensionGraph`, optional
            Dimensions to include in the new `Query` being constructed.
            If `None` (default), ``self.graph`` is used.
        datasets : `bool`, optional
            Whether the new `Query` should include dataset results.  Defaults
            to `True`, but is ignored if ``self`` does not include dataset
            results.
        unique : `bool`, optional
            Whether the new `Query` should guarantee unique results (this
            may come with a performance penalty).

        Returns
        -------
        query : `Query`
            A query object corresponding to the given inputs.  May be
            ``self`` if no changes were requested.

        Notes
        -----
        The way spatial overlaps are handled at present makes it impossible
        to fully guarantee in general that the new query's rows are a subset
        of this one's while also returning unique rows.  That's because the
        database is only capable of performing approximate, conservative
        overlaps via the common skypix system; we defer actual region
        overlap operations to per-result-row Python logic.  But including
        the region columns necessary to do that postprocessing in the query
        makes it impossible to do a SELECT DISTINCT on the user-visible
        dimensions of the query.  For example, consider starting with a
        query with dimensions (instrument, skymap, visit, tract).  That
        involves a spatial join between visit and tract, and we include the
        region columns from both tables in the results in order to only
        actually yield result rows (see `predicate` and `rows`) where the
        regions in those two columns overlap.  If the user then wants to
        subset to just (skymap, tract) with unique results, we have two
        unpalatable options:

        - we can do a SELECT DISTINCT with just the skymap and tract columns
          in the SELECT clause, dropping all detailed overlap information
          and including some tracts that did not actually overlap any of the
          visits in the original query (but were regarded as _possibly_
          overlapping via the coarser, common-skypix relationships);

        - we can include the tract and visit region columns in the query,
          and continue to filter out the non-overlapping pairs, but
          completely disregard the user's request for unique tracts.

        This interface specifies that implementations must do the former, as
        that's what makes things efficient in our most important use case
        (``QuantumGraph`` generation in ``pipe_base``).  We may be able to
        improve this situation in the future by putting exact overlap
        information in the database, either by using built-in (but
        engine-specific) spatial database functionality or (more likely)
        switching to a scheme in which pairwise dimension spatial
        relationships are explicitly precomputed (for e.g. combinations of
        instruments and skymaps).
        """
        raise NotImplementedError()

    @abstractmethod
    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        """Return a `QueryBuilder` that can be used to construct a new
        `Query` that is joined to (and hence constrained by) this one.

        Parameters
        ----------
        summary : `QuerySummary`, optional
            A `QuerySummary` instance that specifies the dimensions and any
            additional constraints to include in the new query being
            constructed, or `None` to use the dimensions of ``self`` with no
            additional constraints.
        """
        raise NotImplementedError()

    graph: DimensionGraph
    """The dimensions identified by this query and included in any data IDs
    created from its result rows (`DimensionGraph`).
    """

    whereRegion: Optional[Region]
    """A spatial region that all regions in all rows returned by this query
    must overlap (`lsst.sphgeom.Region` or `None`).
    """

    backend: QueryBackend
    """Backend object that represents the `Registry` implementation.
    """
class DirectQueryUniqueness(enum.Enum):
    """Enumeration of the ways in which a query's result rows can be unique
    (or not).
    """

    NOT_UNIQUE = enum.auto()
    """Result rows are not expected to be unique."""

    NATURALLY_UNIQUE = enum.auto()
    """Row uniqueness is guaranteed by the construction of the query itself,
    with no need for a SELECT DISTINCT or GROUP BY clause.
    """

    NEEDS_DISTINCT = enum.auto()
    """Unique rows are desired, and the query must explicitly request them
    from the database via SELECT DISTINCT or an equivalent GROUP BY clause.
    """
class DirectQuery(Query):
    """A `Query` implementation that represents a direct SELECT query that
    usually joins many tables.

    `DirectQuery` objects should generally only be constructed by
    `QueryBuilder` or the methods of other `Query` objects.

    Parameters
    ----------
    simpleQuery : `SimpleQuery`
        Struct representing the actual SELECT, FROM, and WHERE clauses.
    columns : `QueryColumns`
        Columns that are referenced in the query in any clause.
    uniqueness : `DirectQueryUniqueness`
        Enum value indicating whether the query should yield unique result
        rows, and if so whether that needs to be explicitly requested of the
        database.
    graph : `DimensionGraph`
        Object describing the dimensions included in the query.
    whereRegion : `lsst.sphgeom.Region`, optional
        Region that all region columns in all returned rows must overlap.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    order_by_columns : `Iterable` [ `OrderByColumn` ], optional
        Columns used to build the ORDER BY clause (see `sql`).
    limit : `Tuple` [ `int`, `Optional` [ `int` ] ], optional
        Row limit applied to the query; presumably ``(limit, offset)`` as
        understood by `SimpleQuery` — TODO confirm against `SimpleQuery`.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        *,
        simpleQuery: SimpleQuery,
        columns: QueryColumns,
        uniqueness: DirectQueryUniqueness,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        backend: QueryBackend,
        order_by_columns: Iterable[OrderByColumn] = (),
        limit: Optional[Tuple[int, Optional[int]]] = None,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, backend=backend, doomed_by=doomed_by)
        assert not simpleQuery.columns, "Columns should always be set on a copy in .sql"
        assert not columns.isEmpty(), "EmptyQuery must be used when a query would have no columns."
        self._simpleQuery = simpleQuery
        self._columns = columns
        self._uniqueness = uniqueness
        # NOTE(review): ``order_by_columns`` is iterated more than once (in
        # both `sql` and `_makeTableSpec`), so callers presumably pass a
        # sequence rather than a one-shot generator — confirm.
        self._order_by_columns = order_by_columns
        self._limit = limit
        # Lazily-built, cached label()-ed column objects; see the getters
        # below.
        self._datasetQueryColumns: Optional[DatasetQueryColumns] = None
        self._dimensionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}
        self._regionColumns: Dict[str, sqlalchemy.sql.ColumnElement] = {}

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._uniqueness is not DirectQueryUniqueness.NOT_UNIQUE

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Cache the labeled column so repeated calls return the same object.
        column = self._dimensionColumns.get(name)
        if column is None:
            column = self._columns.getKeyColumn(name).label(name)
            self._dimensionColumns[name] = column
        return column

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._columns.regions)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        # Cache the labeled region column, labeled "<element>_region".
        column = self._regionColumns.get(name)
        if column is None:
            column = self._columns.regions[name].label(f"{name}_region")
            self._regionColumns[name] = column
        return column

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetQueryColumns is None:
            base = self._columns.datasets
            if base is None:
                return None
            ingestDate = base.ingestDate
            if ingestDate is not None:
                ingestDate = ingestDate.label("ingest_date")
            # Build (and cache) the labeled variants of the dataset columns.
            self._datasetQueryColumns = DatasetQueryColumns(
                datasetType=base.datasetType,
                id=base.id.label("dataset_id"),
                runKey=base.runKey.label(self.backend.managers.collections.getRunForeignKeyName()),
                ingestDate=ingestDate,
            )
        return self._datasetQueryColumns

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        # Work on a copy so ``self._simpleQuery`` never accumulates columns
        # (see the assertion in __init__).
        simpleQuery = self._simpleQuery.copy()
        for dimension in self.graph:
            simpleQuery.columns.append(self.getDimensionColumn(dimension.name))
        for element in self.spatial:
            simpleQuery.columns.append(self.getRegionColumn(element.name))
        datasetColumns = self.getDatasetColumns()
        if datasetColumns is not None:
            simpleQuery.columns.extend(datasetColumns)

        assert not simpleQuery.order_by, "Input query cannot have ORDER BY"
        if self._order_by_columns:
            # add ORDER BY column
            # Ordering is expressed as a row_number() window column labeled
            # "_orderby", so the ordering survives materialization into a
            # temporary table (see _makeTableSpec).
            order_by_columns = [column.column_order for column in self._order_by_columns]
            order_by_column = sqlalchemy.func.row_number().over(order_by=order_by_columns).label("_orderby")
            simpleQuery.columns.append(order_by_column)
            simpleQuery.order_by = [order_by_column]

        assert simpleQuery.limit is None, "Input query cannot have LIMIT"
        simpleQuery.limit = self._limit

        sql = simpleQuery.combine()

        if self._uniqueness is DirectQueryUniqueness.NEEDS_DISTINCT:
            return sql.distinct()
        else:
            return sql

    def _makeTableSpec(self, constraints: bool = False) -> ddl.TableSpec:
        """Helper method for subclass implementations of `materialize`.

        Parameters
        ----------
        constraints : `bool`, optional
            If `True` (`False` is default), define a specification that
            includes actual foreign key constraints for logical foreign
            keys.  Some database engines do not permit temporary tables to
            reference normal tables, so this should be `False` when
            generating a spec for a temporary table unless the database
            engine is known to support them.

        Returns
        -------
        spec : `ddl.TableSpec`
            Specification for a table that could hold this query's result
            rows.
        """
        unique = self.isUnique()
        spec = ddl.TableSpec(fields=())
        for dimension in self.graph:
            addDimensionForeignKey(spec, dimension, primaryKey=unique, constraint=constraints)
        for element in self.spatial:
            spec.fields.add(ddl.FieldSpec.for_region(f"{element.name}_region"))
        datasetColumns = self.getDatasetColumns()
        if datasetColumns is not None:
            self.backend.managers.datasets.addDatasetForeignKey(
                spec, primaryKey=unique, constraint=constraints
            )
            self.backend.managers.collections.addRunForeignKey(spec, nullable=False, constraint=constraints)

        # Need a column for ORDER BY if ordering is requested
        if self._order_by_columns:
            spec.fields.add(
                ddl.FieldSpec(
                    name="_orderby",
                    dtype=sqlalchemy.BigInteger,
                    nullable=False,
                    doc="Column to use with ORDER BY",
                )
            )

        return spec

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        spec = self._makeTableSpec()
        with db.session() as session:
            table = session.makeTemporaryTable(spec)
            if not self._doomed_by:
                # A doomed query is never executed; the temporary table is
                # simply left empty.
                db.insert(table, select=self.sql, names=spec.fields.names)
            yield MaterializedQuery(
                table=table,
                spatial=self.spatial,
                datasetType=self.datasetType,
                isUnique=self.isUnique(),
                graph=self.graph,
                whereRegion=self.whereRegion,
                backend=self.backend,
                doomed_by=self._doomed_by,
            )
            # Dropped on normal exit from the with-block that entered this
            # context manager.
            session.dropTemporaryTable(table)

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            # Nothing would change; reuse self.
            return self
        if columns.isEmpty():
            return EmptyQuery(self.graph.universe, self.backend)
        return DirectQuery(
            simpleQuery=self._simpleQuery.copy(),
            columns=columns,
            uniqueness=DirectQueryUniqueness.NEEDS_DISTINCT if unique else DirectQueryUniqueness.NOT_UNIQUE,
            graph=graph,
            # Post-query region filtering is dropped when uniqueness is
            # requested; see Query.subset for the rationale.
            whereRegion=self.whereRegion if not unique else None,
            backend=self.backend,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        # Imported here (not at module scope) to avoid a circular import
        # with ._builder.
        from ._builder import QueryBuilder

        if summary is None:
            summary = QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not summary.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({summary.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        builder = QueryBuilder(summary, backend=self.backend, doomed_by=self._doomed_by)
        builder.joinTable(
            self.sql.alias(), dimensions=self.graph.dimensions, datasets=self.getDatasetColumns()
        )
        return builder
class MaterializedQuery(Query):
    """A `Query` implementation backed by results that have already been
    saved to a temporary table.

    `MaterializedQuery` instances should not be constructed directly; use
    `Query.materialize()` instead.

    Parameters
    ----------
    table : `sqlalchemy.schema.Table`
        SQLAlchemy object representing the temporary table.
    spatial : `Iterable` [ `DimensionElement` ]
        Spatial dimension elements whose regions must overlap for each valid
        result row (which may reject some rows that are in the table).
    datasetType : `DatasetType`
        The `DatasetType` of datasets returned by this query, or `None`
        if there are no dataset results.
    isUnique : `bool`
        If `True`, the table's rows are unique, and there is no need to
        add ``SELECT DISTINCT`` to guarantee this in results.
    graph : `DimensionGraph`
        Dimensions included in the columns of this table.
    whereRegion : `Region` or `None`
        A spatial region all result-row regions must overlap to be valid
        (which may reject some rows that are in the table).
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        *,
        table: sqlalchemy.schema.Table,
        spatial: Iterable[DimensionElement],
        datasetType: Optional[DatasetType],
        isUnique: bool,
        graph: DimensionGraph,
        whereRegion: Optional[Region],
        backend: QueryBackend,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=graph, whereRegion=whereRegion, backend=backend, doomed_by=doomed_by)
        # Freeze the spatial elements so repeated iteration is safe.
        self._table = table
        self._spatial = tuple(spatial)
        self._datasetType = datasetType
        self._isUnique = isUnique

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        return self._isUnique

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[name]

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(self._spatial)

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        return self._table.columns[f"{name}_region"]

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        if self._datasetType is None:
            # No dataset results were materialized into this table.
            return None
        runKeyName = self.backend.managers.collections.getRunForeignKeyName()
        return DatasetQueryColumns(
            datasetType=self._datasetType,
            id=self._table.columns["dataset_id"],
            runKey=self._table.columns[runKeyName],
            ingestDate=None,
        )

    @property
    def sql(self) -> sqlalchemy.sql.FromClause:
        # Docstring inherited from Query.
        query = self._table.select()
        if "_orderby" in self._table.columns:
            # The original ordering was materialized into this helper column;
            # reapply it when selecting from the table.
            query = query.order_by(self._table.columns["_orderby"])
        return query

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Already materialized; nothing more to do.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        graph, columns = self._makeSubsetQueryColumns(graph=graph, datasets=datasets, unique=unique)
        if columns is None:
            # The requested subset is identical to this query; reuse it.
            return self
        if columns.isEmpty():
            # No columns survive the subset; degenerate to the no-column query.
            return EmptyQuery(self.graph.universe, self.backend)
        # Build a new query that selects the subset columns directly from the
        # temporary table.
        subsetQuery = SimpleQuery()
        subsetQuery.join(self._table)
        if unique:
            uniqueness = DirectQueryUniqueness.NEEDS_DISTINCT
            whereRegion = None
        else:
            uniqueness = DirectQueryUniqueness.NOT_UNIQUE
            whereRegion = self.whereRegion
        return DirectQuery(
            simpleQuery=subsetQuery,
            columns=columns,
            uniqueness=uniqueness,
            graph=graph,
            whereRegion=whereRegion,
            backend=self.backend,
            doomed_by=self._doomed_by,
        )

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        # Default to a summary covering exactly this query's dimensions.
        effective = summary if summary is not None else QuerySummary(self.graph, whereRegion=self.whereRegion)
        if not effective.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({effective.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        result = QueryBuilder(effective, backend=self.backend, doomed_by=self._doomed_by)
        result.joinTable(self._table, dimensions=self.graph.dimensions, datasets=self.getDatasetColumns())
        return result
class EmptyQuery(Query):
    """A `Query` implementation that handles the special case where the query
    would have no columns.

    Parameters
    ----------
    universe : `DimensionUniverse`
        Set of all dimensions from which the null set is extracted.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(
        self,
        universe: DimensionUniverse,
        backend: QueryBackend,
        doomed_by: Iterable[str] = (),
    ):
        super().__init__(graph=universe.empty, whereRegion=None, backend=backend, doomed_by=doomed_by)

    def isUnique(self) -> bool:
        # Docstring inherited from Query.
        # A zero-column result has at most one (empty) row, so it is
        # trivially unique.
        return True

    def getDimensionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No dimension {name} in query (no dimensions at all, actually).")

    @property
    def spatial(self) -> Iterator[DimensionElement]:
        # Docstring inherited from Query.
        return iter(())

    def getRegionColumn(self, name: str) -> sqlalchemy.sql.ColumnElement:
        # Docstring inherited from Query.
        raise KeyError(f"No region for {name} in query (no regions at all, actually).")

    def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
        # Docstring inherited from Query.
        return None

    def rows(self, db: Database) -> Iterator[Optional[sqlalchemy.engine.RowProxy]]:
        # Unless the query is doomed, it conceptually yields exactly one
        # empty row, represented here by `None`.
        if self._doomed_by:
            return
        yield None

    @property
    def sql(self) -> Optional[sqlalchemy.sql.FromClause]:
        # Docstring inherited from Query.
        return None

    @contextmanager
    def materialize(self, db: Database) -> Iterator[Query]:
        # Docstring inherited from Query.
        # Nothing to save to a table; the query is its own materialization.
        yield self

    def subset(
        self, *, graph: Optional[DimensionGraph] = None, datasets: bool = True, unique: bool = False
    ) -> Query:
        # Docstring inherited from Query.
        # Any subset of the empty dimension set is still empty.
        assert graph is None or graph.issubset(self.graph)
        return self

    def makeBuilder(self, summary: Optional[QuerySummary] = None) -> QueryBuilder:
        # Docstring inherited from Query.
        from ._builder import QueryBuilder

        effective = summary if summary is not None else QuerySummary(self.graph)
        if not effective.requested.issubset(self.graph):
            raise NotImplementedError(
                f"Query.makeBuilder does not yet support augmenting dimensions "
                f"({effective.requested.dimensions}) beyond those originally included in the query "
                f"({self.graph.dimensions})."
            )
        return QueryBuilder(effective, backend=self.backend, doomed_by=self._doomed_by)