Coverage for python/lsst/daf/butler/registry/queries/_builder.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("QueryBuilder",)
25from typing import Any, Iterable, List, Optional
27import sqlalchemy.sql
29from ...core import (
30 DimensionElement,
31 SkyPixDimension,
32 Dimension,
33 DatasetType,
34 NamedKeyDict,
35 NamedValueSet,
36 SimpleQuery,
37)
39from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers
40from .expressions import ClauseVisitor
41from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, Query
42from ..wildcards import CollectionSearch, CollectionQuery
class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    """
    def __init__(self, summary: QuerySummary, managers: RegistryManagers):
        # Categorized description of the dimensions/expression this query
        # involves; consulted throughout construction.
        self.summary = summary
        # Accumulates FROM/JOIN/WHERE pieces; turned into SQL by `finish`.
        self._simpleQuery = SimpleQuery()
        # Maps each joined DimensionElement to the FROM clause (table or
        # subquery) that provides its records.
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        # Tracks which dimension-key / region / timespan / dataset columns
        # are available so far, and from which tables.
        self._columns = QueryColumns()
        self._managers = managers

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        # Delegate the actual join to the element's storage implementation;
        # pass the region/timespan column trackers only when this element
        # participates in the query's spatial/temporal relationships.
        storage = self._managers.dimensions[element]
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, deduplicate: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most one time
            on a particular `QueryBuilder` instance.
        deduplicate : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.  Requires
            that all entries in ``collections`` be regular strings, so there is
            a clear search order.  Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        # A deduplicating result search needs an ordered collection list
        # (CollectionSearch); otherwise any collection expression is allowed
        # (CollectionQuery).
        if isResult and deduplicate:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            return False
        # Build one per-collection SELECT ("subsubquery") and UNION ALL them
        # together into the dataset subquery.
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        # The columns every subsubquery must produce; used to sanity-check
        # the storage-layer SELECTs and to relabel the final subquery.
        baseColumnNames = {"id", runKeyName} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        # ``rank`` is the collection's position in the search order; it is
        # only attached (below) when deduplicating.
        for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
                                                                 datasetType=datasetType)):
            ssq = datasetRecordStorage.select(collection=collectionRecord,
                                              dataId=SimpleQuery.Select,
                                              id=SimpleQuery.Select if isResult else None,
                                              run=SimpleQuery.Select if isResult else None)
            if ssq is None:
                # This collection can contain no datasets of this type;
                # skip it rather than union in an empty SELECT.
                continue
            assert {c.name for c in ssq.columns} == baseColumnNames
            if deduplicate:
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
            subsubqueries.append(ssq.combine())
        if not subsubqueries:
            # No collection could contain any matching datasets, so the full
            # query is guaranteed empty.
            return False
        subquery = sqlalchemy.sql.union_all(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if deduplicate:
                # Rewrite the subquery (currently a UNION ALL over
                # per-collection subsubqueries) to select the rows with the
                # lowest rank per data ID.  The block below will set subquery
                # to something like this:
                #
                #     WITH {dst}_search AS (
                #         SELECT {data-id-cols}, id, run_id, 1 AS rank
                #             FROM <collection1>
                #         UNION ALL
                #         SELECT {data-id-cols}, id, run_id, 2 AS rank
                #             FROM <collection2>
                #         UNION ALL
                #         ...
                #     )
                #     SELECT
                #         {dst}_window.{data-id-cols},
                #         {dst}_window.id,
                #         {dst}_window.run_id
                #     FROM (
                #         SELECT
                #             {dst}_search.{data-id-cols},
                #             {dst}_search.id,
                #             {dst}_search.run_id,
                #             ROW_NUMBER() OVER (
                #                 PARTITION BY {dst_search}.{data-id-cols}
                #                 ORDER BY rank
                #             ) AS rownum
                #     ) {dst}_window
                #     WHERE
                #         {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName)
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                # Number rows within each data ID, ordered by collection
                # search rank; rownum == 1 is the first match.
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number().over(
                        partition_by=windowDataIdCols,
                        order_by=search.columns["rank"]
                    ).label("rownum")
                )
                window = sqlalchemy.sql.select(
                    windowSelectCols
                ).select_from(search).alias(
                    f"{datasetType.name}_window"
                )
                subquery = sqlalchemy.sql.select(
                    [window.columns[name].label(name) for name in baseColumnNames]
                ).select_from(
                    window
                ).where(
                    window.columns["rownum"] == 1
                ).alias(datasetType.name)
            else:
                subquery = subquery.alias(datasetType.name)
            # Record the dataset ID and run-key columns so `finish` can expose
            # them in the query results.
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
            )
        else:
            # Existence-only constraint: no result columns needed.
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return True

    def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueSet[Dimension], *,
                  datasets: Optional[DatasetQueryColumns] = None) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query results.

        Raises
        ------
        NotImplementedError
            Raised if ``dimensions`` includes any dimension not in the
            summary's requested dimensions (other than the universe's common
            skypix dimension, which is always permitted).
        """
        unexpectedDimensions = NamedValueSet(dimensions - self.summary.requested.dimensions)
        # The common skypix dimension is always allowed as a join dimension,
        # even if not explicitly requested.
        unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
        if unexpectedDimensions:
            raise NotImplementedError(
                f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
                f"were not provided originally to the QuerySummary object passed at construction."
            )
        # The table's dimension columns share the dimension names, so the same
        # iterable serves as both dimensions and column names here.
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)
        if datasets is not None:
            assert self._columns.datasets is None, \
                "At most one result dataset type can be returned by a query."
            self._columns.datasets = datasets

    def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension],
                  columnNames: Iterable[str]
                  ) -> List[sqlalchemy.sql.ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Equate this table's key column with every column already in the
            # query for the same dimension, then register it so later joins
            # can do the same.
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
                   ) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        onclause: Optional[sqlalchemy.sql.ColumnElement]
        # Collapse the term list to None (cross join), a single term, or an
        # AND of all terms.
        if len(joinOn) == 0:
            onclause = None
        elif len(joinOn) == 1:
            onclause = joinOn[0]
        else:
            onclause = sqlalchemy.sql.and_(*joinOn)
        self._simpleQuery.join(table, onclause=onclause)

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Translate the user's string expression tree (if any) into SQL.
        if self.summary.expression.tree is not None:
            visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
            self._simpleQuery.where.append(self.summary.expression.tree.visit(visitor))
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.dataId.graph:
                givenKey = self.summary.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.whereRegion is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    givenSkyPixIds: List[int] = []
                    # Expand the region's pixel envelope into explicit pixel
                    # IDs for an IN constraint.
                    for begin, end in dimension.pixelization.envelope(self.summary.whereRegion):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a data ID with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.dataId.graph.elements
                self._simpleQuery.where.append(intervalInQuery.overlaps(givenInterval))

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish query construction, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret result
            rows.
        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        # A query with no columns at all (e.g. everything fully constrained by
        # the data ID) degenerates to an EmptyQuery.
        if self._columns.isEmpty():
            return EmptyQuery(self.summary.requested.universe, managers=self._managers)
        return DirectQuery(graph=self.summary.requested,
                           uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
                           whereRegion=self.summary.dataId.region,
                           simpleQuery=self._simpleQuery,
                           columns=self._columns,
                           managers=self._managers)