Coverage for python/lsst/daf/butler/registry/queries/_builder.py : 19%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("QueryBuilder",)
25from typing import Any, List, Iterable, TYPE_CHECKING
27from sqlalchemy.sql import ColumnElement, and_, literal, select, FromClause
28import sqlalchemy.sql
29from sqlalchemy.engine import Connection
31from ...core import (
32 DimensionElement,
33 SkyPixDimension,
34 Dimension,
35 DatasetType,
36)
37from ...core.utils import NamedKeyDict
39from ._structs import QuerySummary, QueryColumns
40from ._datasets import DatasetRegistryStorage
41from .expressions import ClauseVisitor
42from ._query import Query
44if TYPE_CHECKING:  # coverage note: 44 ↛ 45 -- line 44 didn't jump to line 45, because the condition on line 44 was never true
45 from ..interfaces import DimensionRecordStorageManager
class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        SQLAlchemy connection object.  This is only used to pass through
        to the `Query` object returned by `finish`.
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    dimensionStorage : `DimensionRecordStorageManager`
        Manager for storage backend objects that abstract access to dimension
        tables.
    datasetStorage : `DatasetRegistryStorage`
        Storage backend object that abstracts access to dataset tables.
    """

    def __init__(self, connection: Connection, summary: QuerySummary,
                 dimensionStorage: DimensionRecordStorageManager,
                 datasetStorage: DatasetRegistryStorage):
        self.summary = summary
        self._connection = connection
        self._dimensionStorage = dimensionStorage
        self._datasetStorage = datasetStorage
        # SQL expression under construction: `None` until the first table is
        # added by `finishJoin`, then a FROM clause, and finally (inside
        # `finish`) the complete SELECT.
        self._sql = None
        # Tables joined via `joinDimensionElement`, keyed by element.
        self._elements: NamedKeyDict[DimensionElement, FromClause] = NamedKeyDict()
        # Bookkeeping for the columns that are available in the query.
        self._columns = QueryColumns()

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._dimensionStorage[element]
        # Region/timespan column mappings are only handed to the storage
        # object when the summary says this element participates in a spatial
        # or temporal relationship, respectively.
        regions = self._columns.regions if element in self.summary.spatial else None
        timespans = self._columns.timespans if element in self.summary.temporal else None
        self._elements[element] = storage.join(self, regions=regions, timespans=timespans)

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, addRank: bool = False) -> None:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : sequence of `str` or `Like`, or ``...``
            An expression describing the collections in which to search for
            the datasets.  This may be a single instance of or an iterable of
            any of the following:

             - a `str` collection name;
             - a `Like` pattern to match against collection names;
             - `...`, indicating all collections.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that all entries in ``collections`` be
            regular strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        table = self._datasetStorage.getDatasetSubquery(datasetType, collections=collections,
                                                        isResult=isResult, addRank=addRank)
        self.joinTable(table, datasetType.dimensions)
        if isResult:
            # The second slot is `None` when no rank column was requested.
            rankColumn = table.columns["rank"] if addRank else None
            self._columns.datasets[datasetType] = (table.columns["dataset_id"], rankColumn)

    def joinTable(self, table: FromClause, dimensions: Iterable[Dimension]) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither dataset nor dimension elements (i.e.
        extensions to the standard `Registry` schema).

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.  If the iterable provides a ``names`` attribute it is
            used directly; otherwise the names are read from each dimension's
            ``name`` attribute.
        """
        columnNames = getattr(dimensions, "names", None)
        if columnNames is None:
            # Plain iterable (possibly one-shot): materialize it once so it
            # can be walked both here and again inside `startJoin`.
            dimensions = tuple(dimensions)
            columnNames = [dimension.name for dimension in dimensions]
        joinOn = self.startJoin(table, dimensions, columnNames)
        self.finishJoin(table, joinOn)

    def startJoin(self, table: FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
                  ) -> List[ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            newColumn = table.columns[columnName]
            knownColumns = self._columns.keys.setdefault(dimension, [])
            # Equate the new column with every column already in the query
            # for this dimension, then record the new column so that tables
            # joined later are constrained against it as well.
            joinOn.extend(known == newColumn for known in knownColumns)
            knownColumns.append(newColumn)
        return joinOn

    def finishJoin(self, table: FromClause, joinOn: List[ColumnElement]) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        if joinOn:
            self._sql = self._sql.join(table, and_(*joinOn))
        elif self._sql is None:
            # First table in the query: nothing to join against yet.
            self._sql = table
        else:
            # New table is completely unrelated to all already-included
            # tables.  We need a cross join here but SQLAlchemy does not
            # have a specific method for that.  Using join() without
            # `onclause` will try to join on FK and will raise an exception
            # for unrelated tables, so we have to use `onclause` which is
            # always true.
            self._sql = self._sql.join(table, literal(True) == literal(True))

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        universe = self.summary.universe
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            # Skip anything a caller already joined explicitly through
            # `joinDimensionElement`; joining it again would trip that
            # method's "already included" assertion.
            if element not in self._elements:
                self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        No WHERE clause is added at all when there is nothing to constrain.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        whereTerms = []
        dataId = self.summary.dataId
        if self.summary.expression.tree is not None:
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            whereTerms.append(self.summary.expression.tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in dataId.graph:
                givenKey = dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                whereTerms.extend(column == givenKey for column in columnsInQuery)
            elif dataId.graph.spatial and isinstance(dimension, SkyPixDimension):
                # Dimension is not fully identified, but it is a skypix
                # dimension that's constrained by a given region, which we
                # know now.
                givenSkyPixIds = []
                for begin, end in dimension.pixelization.envelope(dataId.region):
                    givenSkyPixIds.extend(range(begin, end))
                whereTerms.extend(column.in_(givenSkyPixIds) for column in columnsInQuery)
        # If we are given a dataId with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = dataId.timespan
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in dataId.graph.elements
                whereTerms.append(intervalInQuery.overlaps(givenInterval, ops=sqlalchemy.sql))
        # AND-together the full WHERE clause and attach it to the query.
        # Calling and_() with no arguments is deprecated in SQLAlchemy 1.4+,
        # so the query is left untouched when there are no terms.
        if whereTerms:
            self._sql = self._sql.where(and_(*whereTerms))

    def _addSelectClause(self) -> None:
        """Add a SELECT clause to the query under construction containing all
        output columns identified by the `QuerySummary` and requested in calls
        to `joinDataset` with ``isResult=True``.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        columns = [self._columns.getKeyColumn(dimension) for dimension in self.summary.requested]
        for columnPair in self._columns.datasets.values():
            # NOTE(review): the pair's second entry is `None` when no rank
            # column was requested (see `joinDataset`); it is passed through
            # to `select` unchanged -- confirm `Query` does not rely on it
            # before filtering it out.
            columns.extend(columnPair)
        columns.extend(self._columns.regions.values())
        self._sql = select(columns).select_from(self._sql)

    def finish(self) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        This automatically joins any missing dimension element tables
        (according to the categorization of the `QuerySummary` the builder was
        constructed with).

        This consumes the `QueryBuilder`; no other methods should be called
        after this one.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed (possibly multiple times
            with different bind parameter values) and used to interpret result
            rows.
        """
        self._joinMissingDimensionElements()
        # The SELECT is built around the finished FROM clause first; the
        # WHERE clause is then attached to that SELECT.
        self._addSelectClause()
        self._addWhereClause()
        return Query(summary=self.summary, connection=self._connection,
                     sql=self._sql, columns=self._columns)