Coverage for python/lsst/daf/butler/registry/queries/_builder.py : 17%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("QueryBuilder",)
25from typing import List, Iterable, TYPE_CHECKING
27from sqlalchemy.sql import ColumnElement, and_, literal, bindparam, select, FromClause
28import sqlalchemy.sql
29from sqlalchemy.engine import Connection
31from ...core import (
32 DimensionElement,
33 SkyPixDimension,
34 Dimension,
35 DatasetType,
36 Timespan,
37)
38from ...core.utils import NamedValueSet
40from ._structs import QuerySummary, QueryColumns, QueryParameters, GivenTime
41from ._datasets import DatasetRegistryStorage, CollectionsExpression
42from .expressions import ClauseVisitor
43from ._query import Query
45if TYPE_CHECKING: 45 ↛ 46line 45 didn't jump to line 46, because the condition on line 45 was never true
46 from ..interfaces import DimensionRecordStorageManager
class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        SQLAlchemy connection object.  This is only used to pass through
        to the `Query` object returned by `finish`.
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    dimensionStorage : `DimensionRecordStorageManager`
        Manager for storage backend objects that abstract access to dimension
        tables.
    datasetStorage : `DatasetRegistryStorage`
        Storage backend object that abstracts access to dataset tables.
    """

    def __init__(self, connection: Connection, summary: QuerySummary,
                 dimensionStorage: DimensionRecordStorageManager,
                 datasetStorage: DatasetRegistryStorage):
        self.summary = summary
        self._connection = connection
        self._dimensionStorage = dimensionStorage
        self._datasetStorage = datasetStorage
        # FROM clause under construction; stays `None` until the first table
        # is added by `finishJoin`.
        self._sql = None
        # Dimension elements whose tables have already been joined.
        self._elements: NamedValueSet[DimensionElement] = NamedValueSet()
        # Columns (dimension keys, regions, timespans, datasets) available
        # in the query so far.
        self._columns = QueryColumns()

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._dimensionStorage[element]
        # Only hand the storage object the region/timespan columns when the
        # summary says this element takes part in a spatial/temporal join.
        storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements.add(element)

    def joinDataset(self, datasetType: DatasetType, collections: CollectionsExpression, *,
                    isResult: bool = True, addRank: bool = False) -> None:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : sequence of `str` or `Like`, or ``...``
            An expression describing the collections in which to search for
            the datasets.  ``...`` indicates that all collections should be
            searched.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that all entries in ``collections`` be
            regular strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        table = self._datasetStorage.getDatasetSubquery(datasetType, collections=collections,
                                                        dataId=self.summary.dataId,
                                                        isResult=isResult, addRank=addRank)
        self.joinTable(table, datasetType.dimensions)
        if isResult:
            # The second entry is `None` when no rank column was requested.
            self._columns.datasets[datasetType] = (table.columns["dataset_id"],
                                                   table.columns["rank"] if addRank else None)

    def joinTable(self, table: FromClause, dimensions: Iterable[Dimension]) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither dataset nor dimension elements (i.e.
        extensions to the standard `Registry` schema).

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.  If the object has a ``names`` attribute it is used
            for the column names; otherwise each dimension's ``name`` is used.
        """
        names = getattr(dimensions, "names", None)
        if names is None:
            # A plain iterable (list, generator, ...): materialize it once so
            # it can be traversed both for the names and by `startJoin`.
            dimensions = tuple(dimensions)
            names = [dimension.name for dimension in dimensions]
        joinOn = self.startJoin(table, dimensions, names)
        self.finishJoin(table, joinOn)

    def startJoin(self, table: FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
                  ) -> List[ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Equate the new column with every column already in the query
            # for this dimension, then record the new column itself.
            joinOn.extend(columnInQuery == columnInTable for columnInQuery in columnsInQuery)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: FromClause, joinOn: List[ColumnElement]) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        if joinOn:
            self._sql = self._sql.join(table, and_(*joinOn))
        elif self._sql is None:
            # First table in the query; nothing to join against yet.
            self._sql = table
        else:
            # New table is completely unrelated to all already-included
            # tables.  We need a cross join here but SQLAlchemy does not
            # have a specific method for that. Using join() without
            # `onclause` will try to join on FK and will raise an exception
            # for unrelated tables, so we have to use `onclause` which is
            # always true.
            self._sql = self._sql.join(table, literal(True) == literal(True))

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            # Skip tables a caller already joined explicitly via
            # `joinDimensionElement`, which asserts against double joins.
            if element not in self._elements:
                self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> QueryParameters:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and given dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.

        Returns
        -------
        parameters : `QueryParameters`
            Struct holding the bind parameters created for values that will
            only be provided when the query is executed.
        """
        parameters = QueryParameters()
        whereTerms = []
        if self.summary.expression.tree is not None:
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            whereTerms.append(self.summary.expression.tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.given:
                if self.summary.whenIsDimensionGiven(dimension) == GivenTime.AT_EXECUTION:
                    givenKey = bindparam(f"_given_later_{dimension.name}")
                    parameters.keys[dimension] = givenKey
                else:
                    givenKey = self.summary.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    whereTerms.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.given.spatial and isinstance(dimension, SkyPixDimension):
                    if self.summary.whenIsRegionGiven() == GivenTime.AT_CONSTRUCTION:
                        # We know the region now.
                        givenSkyPixIds = []
                        for begin, end in dimension.pixelization.envelope(self.summary.dataId.region):
                            givenSkyPixIds.extend(range(begin, end))
                    else:
                        # We'll know the region later (there might be a region
                        # now, too, but we'll know a more precise one later,
                        # and hence we'll ignore the one we know now).
                        # The parameter must be "expanding" because it is
                        # used as the right-hand side of an IN expression and
                        # will be bound to a list of IDs at execution time.
                        givenSkyPixIds = bindparam(f"_given_later_{dimension.name}", expanding=True)
                        parameters.skypix[dimension] = givenSkyPixIds
                    for columnInQuery in columnsInQuery:
                        whereTerms.append(columnInQuery.in_(givenSkyPixIds))
        # If we are [to be] given a dataId with a timespan, and there are
        # one or more timespans in the query that aren't given, add a WHERE
        # expression for each of them.
        if self.summary.given.temporal and self.summary.temporal:
            if self.summary.whenIsTimespanGiven() == GivenTime.AT_CONSTRUCTION:
                # Timespan is known now.
                givenInterval = self.summary.dataId.timespan
            else:
                # We'll know the timespan later (there might be a timespan now,
                # too, but we'll know a more precise one later, and hence we'll
                # ignore the one we know now).
                givenInterval = Timespan(
                    begin=bindparam("_given_later_timespan_begin"),
                    end=bindparam("_given_later_timespan_end"),
                )
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.given.elements
                whereTerms.append(intervalInQuery.overlaps(givenInterval, ops=sqlalchemy.sql))
        # AND-together the full WHERE clause, and combine it with the FROM
        # clause.  Skip this entirely when there is nothing to constrain, so
        # that we never build an empty AND expression.
        if whereTerms:
            self._sql = self._sql.where(and_(*whereTerms))
        return parameters

    def _addSelectClause(self) -> None:
        """Add a SELECT clause to the query under construction containing all
        output columns identified by the `QuerySummary` and requested in calls
        to `joinDataset` with ``isResult=True``.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        columns = [self._columns.getKeyColumn(dimension) for dimension in self.summary.requested]
        for _, columnPair in self._columns.datasets.items():
            # NOTE(review): the rank entry of the pair is `None` when
            # `joinDataset` was called without ``addRank``; it is passed
            # through to the SELECT list unchanged here.
            columns.extend(columnPair)
        for _, column in self._columns.regions.items():
            columns.append(column)
        self._sql = select(columns).select_from(self._sql)

    def finish(self) -> Query:
        """Finish query constructing, returning a new `Query` instance.

        This automatically joins any missing dimension element tables
        (according to the categorization of the `QuerySummary` the builder was
        constructed with).

        This consumes the `QueryBuilder`; no other methods should be called
        after this one.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed (possibly multiple times
            with different bind parameter values) and used to interpret result
            rows.
        """
        self._joinMissingDimensionElements()
        # The SELECT must be created before the WHERE clause is attached:
        # `_addSelectClause` turns the FROM clause into a SELECT, and
        # `_addWhereClause` then calls ``where`` on it.
        self._addSelectClause()
        parameters = self._addWhereClause()
        return Query(summary=self.summary, connection=self._connection,
                     sql=self._sql, columns=self._columns, parameters=parameters)