Coverage for python/lsst/daf/butler/registry/queries/_builder.py : 18%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("QueryBuilder",)
25from typing import Any, List, Iterable, TYPE_CHECKING
27from sqlalchemy.sql import ColumnElement, and_, literal, select, FromClause
28import sqlalchemy.sql
29from sqlalchemy.engine import Connection
31from ...core import (
32 DimensionElement,
33 SkyPixDimension,
34 Dimension,
35 DatasetType,
36)
37from ...core.utils import NamedKeyDict
39from ._structs import QuerySummary, QueryColumns
40from ._datasets import DatasetRegistryStorage
41from .expressions import ClauseVisitor
42from ._query import Query
44if TYPE_CHECKING:  # coverage: 44 ↛ 45 (line 44 didn't jump to line 45, because the condition on line 44 was never true)
45 from ..interfaces import DimensionRecordStorageManager
class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        SQLAlchemy connection object.  This is only used to pass through
        to the `Query` object returned by `finish`.
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    dimensionStorage : `DimensionRecordStorageManager`
        Manager for storage backend objects that abstract access to dimension
        tables.
    datasetStorage : `DatasetRegistryStorage`
        Storage backend object that abstracts access to dataset tables.
    """

    def __init__(self, connection: Connection, summary: QuerySummary,
                 dimensionStorage: DimensionRecordStorageManager,
                 datasetStorage: DatasetRegistryStorage):
        self.summary = summary
        self._connection = connection
        self._dimensionStorage = dimensionStorage
        self._datasetStorage = datasetStorage
        # The FROM clause under construction (and, once `finish` has run,
        # the full SELECT); `None` until the first table is joined.
        self._sql = None
        # Logical tables joined so far, keyed by dimension element.
        self._elements: NamedKeyDict[DimensionElement, FromClause] = NamedKeyDict()
        # Bookkeeping for the key, region, timespan and dataset columns
        # that have been brought into the query.
        self._columns = QueryColumns()

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`)
            and must not have been joined into this query already.
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._dimensionStorage[element]
        # Region/timespan bookkeeping is only handed to the storage backend
        # when the summary marks this element as participating in a
        # spatial/temporal relationship.
        regions = self._columns.regions if element in self.summary.spatial else None
        timespans = self._columns.timespans if element in self.summary.temporal else None
        self._elements[element] = storage.join(self, regions=regions, timespans=timespans)

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, addRank: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : sequence of `str` or `Like`, or ``...``
            An expression describing the collections in which to search for
            the datasets.  This may be a single instance of or an iterable of
            any of the following:

             - a `str` collection name;
             - a `Like` pattern to match against collection names;
             - `...`, indicating all collections.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that all entries in ``collections`` be
            regular strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        table = self._datasetStorage.getDatasetSubquery(datasetType, collections=collections,
                                                        isResult=isResult, addRank=addRank)
        if table is None:
            # The storage backend reported that nothing can match.
            return False
        self.joinTable(table, datasetType.dimensions)
        if isResult:
            # Stored as an (id, rank) pair; the rank slot is `None` when no
            # rank column was requested.
            rankColumn = table.columns["rank"] if addRank else None
            self._columns.datasets[datasetType] = (table.columns["dataset_id"], rankColumn)
        return True

    def joinTable(self, table: FromClause, dimensions: Iterable[Dimension]) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither dataset nor dimension elements (i.e.
        extensions to the standard `Registry` schema).

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.  Besides being iterable, this object must provide a
            ``names`` attribute giving those column names in iteration order.
        """
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)

    def startJoin(self, table: FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
                  ) -> List[ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Equate the new column with every column already known to hold
            # this dimension's key, then record the new column so later joins
            # are constrained against it as well.
            joinOn.extend(columnInQuery == columnInTable for columnInQuery in columnsInQuery)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: FromClause, joinOn: List[ColumnElement]) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        if joinOn:
            self._sql = self._sql.join(table, and_(*joinOn))
        elif self._sql is None:
            # First table in the query: it simply becomes the FROM clause.
            self._sql = table
        else:
            # New table is completely unrelated to all already-included
            # tables.  We need a cross join here but SQLAlchemy does not
            # have a specific method for that.  Using join() without
            # `onclause` will try to join on FK and will raise an exception
            # for unrelated tables, so we have to use `onclause` which is
            # always true.
            self._sql = self._sql.join(table, literal(True) == literal(True))

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        universe = self.summary.universe
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it.
        for element in universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            # Skip anything a caller already joined explicitly through
            # `joinDimensionElement`; joining it again would trip that
            # method's "already included" assertion.
            if element not in self._elements:
                self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        If no constraints apply, the query is left without a WHERE clause.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        whereTerms = []
        if self.summary.expression.tree is not None:
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            whereTerms.append(self.summary.expression.tree.visit(visitor))
        dataId = self.summary.dataId
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in dataId.graph:
                givenKey = dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                whereTerms.extend(columnInQuery == givenKey for columnInQuery in columnsInQuery)
            elif dataId.graph.spatial and isinstance(dimension, SkyPixDimension):
                # Dimension is not fully identified, but it is a skypix
                # dimension and the data ID carries a region, so constrain it
                # to the pixel ranges enveloping that region.
                givenSkyPixIds = []
                for begin, end in dimension.pixelization.envelope(dataId.region):
                    givenSkyPixIds.extend(range(begin, end))
                whereTerms.extend(columnInQuery.in_(givenSkyPixIds) for columnInQuery in columnsInQuery)
        # If we are given a dataId with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = dataId.timespan
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in dataId.graph.elements
                whereTerms.append(intervalInQuery.overlaps(givenInterval, ops=sqlalchemy.sql))
        # AND-together the full WHERE clause, and combine it with the rest of
        # the query.  Skip this entirely when there is nothing to constrain:
        # calling and_() with no arguments is deprecated in SQLAlchemy.
        if whereTerms:
            self._sql = self._sql.where(and_(*whereTerms))

    def _addSelectClause(self) -> None:
        """Add a SELECT clause to the query under construction containing all
        output columns identified by the `QuerySummary` and requested in calls
        to `joinDataset` with ``isResult=True``.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # One key column per requested dimension...
        columns = [self._columns.getKeyColumn(dimension) for dimension in self.summary.requested]
        # ...then each dataset's (id, rank) pair exactly as recorded by
        # `joinDataset` (the rank slot may be `None`)...
        for columnPair in self._columns.datasets.values():
            columns.extend(columnPair)
        # ...and finally every region column brought in by spatial joins.
        columns.extend(self._columns.regions.values())
        self._sql = select(columns).select_from(self._sql)

    def finish(self) -> Query:
        """Finish query constructing, returning a new `Query` instance.

        This automatically joins any missing dimension element tables
        (according to the categorization of the `QuerySummary` the builder was
        constructed with).

        This consumes the `QueryBuilder`; no other methods should be called
        after this one.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed (possibly multiple times
            with different bind parameter values) and used to interpret result
            rows.
        """
        self._joinMissingDimensionElements()
        # The SELECT must be built first: the WHERE terms are attached to the
        # SELECT object it creates, not to the bare FROM clause.
        self._addSelectClause()
        self._addWhereClause()
        return Query(summary=self.summary, connection=self._connection,
                     sql=self._sql, columns=self._columns)