Coverage for python/lsst/daf/butler/registry/queries/_builder.py : 11%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

from typing import AbstractSet, Any, Iterable, List, Optional

import sqlalchemy.sql

from ...core import (
    DimensionElement,
    SkyPixDimension,
    Dimension,
    DatasetType,
    SimpleQuery,
)
from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet

from .._collectionType import CollectionType
from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers
from .expressions import convertExpressionToSql
from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, Query
from ..wildcards import CollectionSearch, CollectionQuery


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
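
    Notes
    -----
    A minimal sketch of typical use; ``summary`` and ``managers`` are assumed
    to be provided by the registry internals that normally construct this
    builder, and ``datasetType`` and ``collections`` are placeholders::

        builder = QueryBuilder(summary, managers)
        # A `False` return means the query is already known to be empty.
        if builder.joinDataset(datasetType, collections, isResult=True):
            query = builder.finish()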
57 """
58 def __init__(self, summary: QuerySummary, managers: RegistryManagers):
59 self.summary = summary
60 self._simpleQuery = SimpleQuery()
61 self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
62 self._columns = QueryColumns()
63 self._managers = managers
65 def hasDimensionKey(self, dimension: Dimension) -> bool:
66 """Return `True` if the given dimension's primary key column has
67 been included in the query (possibly via a foreign key column on some
68 other table).
69 """
70 return dimension in self._columns.keys
72 def joinDimensionElement(self, element: DimensionElement) -> None:
73 """Add the table for a `DimensionElement` to the query.
75 This automatically joins the element table to all other tables in the
76 query with which it is related, via both dimension keys and spatial
77 and temporal relationships.
79 External calls to this method should rarely be necessary; `finish` will
80 automatically call it if the `DimensionElement` has been identified as
81 one that must be included.
83 Parameters
84 ----------
85 element : `DimensionElement`
86 Element for which a table should be added. The element must be
87 associated with a database table (see `DimensionElement.hasTable`).
88 """
89 assert element not in self._elements, "Element already included in query."
90 storage = self._managers.dimensions[element]
91 fromClause = storage.join(
92 self,
93 regions=self._columns.regions if element in self.summary.spatial else None,
94 timespans=self._columns.timespans if element in self.summary.temporal else None,
95 )
96 self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, findFirst: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets. However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof. `...` can be used to return all collections. See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type. If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most once
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order. Requires
            that all entries in ``collections`` be regular strings, so there is
            a clear search order. Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed. If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.
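
        Notes
        -----
        A brief sketch of a find-first search; ``builder`` is assumed to be a
        `QueryBuilder` whose `QuerySummary` already covers the dimensions of
        ``datasetType``, and the collection names are placeholders::

            found = builder.joinDataset(
                datasetType, ["run/a", "run/b"], isResult=True, findFirst=True
            )
            if not found:
                ...  # the finished query is guaranteed to return no rows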
140 """
141 assert datasetType.dimensions.issubset(self.summary.requested)
142 if isResult and findFirst:
143 collections = CollectionSearch.fromExpression(collections)
144 else:
145 collections = CollectionQuery.fromExpression(collections)
146 # If we are searching all collections with no constraints, loop over
147 # RUN collections only, because that will include all datasets.
148 collectionTypes: AbstractSet[CollectionType]
149 if collections == CollectionQuery():
150 collectionTypes = {CollectionType.RUN}
151 else:
152 collectionTypes = CollectionType.all()
153 datasetRecordStorage = self._managers.datasets.find(datasetType.name)
154 if datasetRecordStorage is None:
155 # Unrecognized dataset type means no results. It might be better
156 # to raise here, but this is consistent with previous behavior,
157 # which is expected by QuantumGraph generation code in pipe_base.
158 return False
159 subsubqueries = []
160 runKeyName = self._managers.collections.getRunForeignKeyName()
161 baseColumnNames = {"id", runKeyName, "ingest_date"} if isResult else set()
162 baseColumnNames.update(datasetType.dimensions.required.names)
163 for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
164 collectionTypes=collectionTypes)):
165 if collectionRecord.type is CollectionType.CALIBRATION:
166 if datasetType.isCalibration():
167 raise NotImplementedError(
168 f"Query for dataset type '{datasetType.name}' in CALIBRATION-type collection "
169 f"'{collectionRecord.name}' is not yet supported."
170 )
171 else:
172 # We can never find a non-calibration dataset in a
173 # CALIBRATION collection.
174 continue
175 ssq = datasetRecordStorage.select(collection=collectionRecord,
176 dataId=SimpleQuery.Select,
177 id=SimpleQuery.Select if isResult else None,
178 run=SimpleQuery.Select if isResult else None,
179 ingestDate=SimpleQuery.Select if isResult else None)
180 if ssq is None:
181 continue
182 assert {c.name for c in ssq.columns} == baseColumnNames
183 if findFirst:
184 ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
185 subsubqueries.append(ssq.combine())
186 if not subsubqueries:
187 return False
188 subquery = sqlalchemy.sql.union_all(*subsubqueries)
189 columns: Optional[DatasetQueryColumns] = None
190 if isResult:
191 if findFirst:
192 # Rewrite the subquery (currently a UNION ALL over
193 # per-collection subsubqueries) to select the rows with the
194 # lowest rank per data ID. The block below will set subquery
195 # to something like this:
196 #
197 # WITH {dst}_search AS (
198 # SELECT {data-id-cols}, id, run_id, 1 AS rank
199 # FROM <collection1>
200 # UNION ALL
201 # SELECT {data-id-cols}, id, run_id, 2 AS rank
202 # FROM <collection2>
203 # UNION ALL
204 # ...
205 # )
206 # SELECT
207 # {dst}_window.{data-id-cols},
208 # {dst}_window.id,
209 # {dst}_window.run_id
210 # FROM (
211 # SELECT
212 # {dst}_search.{data-id-cols},
213 # {dst}_search.id,
214 # {dst}_search.run_id,
215 # ROW_NUMBER() OVER (
216 # PARTITION BY {dst_search}.{data-id-cols}
217 # ORDER BY rank
218 # ) AS rownum
219 # ) {dst}_window
220 # WHERE
221 # {dst}_window.rownum = 1;
222 #
223 search = subquery.cte(f"{datasetType.name}_search")
224 windowDataIdCols = [
225 search.columns[name].label(name) for name in datasetType.dimensions.required.names
226 ]
227 windowSelectCols = [
228 search.columns["id"].label("id"),
229 search.columns[runKeyName].label(runKeyName),
230 search.columns["ingest_date"].label("ingest_date"),
231 ]
232 windowSelectCols += windowDataIdCols
233 assert {c.name for c in windowSelectCols} == baseColumnNames
234 windowSelectCols.append(
235 sqlalchemy.sql.func.row_number().over(
236 partition_by=windowDataIdCols,
237 order_by=search.columns["rank"]
238 ).label("rownum")
239 )
240 window = sqlalchemy.sql.select(
241 windowSelectCols
242 ).select_from(search).alias(
243 f"{datasetType.name}_window"
244 )
245 subquery = sqlalchemy.sql.select(
246 [window.columns[name].label(name) for name in baseColumnNames]
247 ).select_from(
248 window
249 ).where(
250 window.columns["rownum"] == 1
251 ).alias(datasetType.name)
252 else:
253 subquery = subquery.alias(datasetType.name)
254 columns = DatasetQueryColumns(
255 datasetType=datasetType,
256 id=subquery.columns["id"],
257 runKey=subquery.columns[runKeyName],
258 ingestDate=subquery.columns["ingest_date"],
259 )
260 else:
261 subquery = subquery.alias(datasetType.name)
262 self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
263 return True

    def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueAbstractSet[Dimension], *,
                  datasets: Optional[DatasetQueryColumns] = None) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query. The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query results.
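
        Notes
        -----
        A rough sketch; ``myTable`` stands in for any SQLAlchemy table or
        subquery whose columns are named after the dimensions it is keyed on,
        and ``dims`` for the corresponding set of `Dimension` objects (both
        are placeholders, not part of this module)::

            builder.joinTable(myTable, dims)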
283 """
284 unexpectedDimensions = NamedValueSet(dimensions - self.summary.mustHaveKeysJoined.dimensions)
285 unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
286 if unexpectedDimensions:
287 raise NotImplementedError(
288 f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
289 f"were not provided originally to the QuerySummary object passed at construction."
290 )
291 joinOn = self.startJoin(table, dimensions, dimensions.names)
292 self.finishJoin(table, joinOn)
293 if datasets is not None:
294 assert self._columns.datasets is None, \
295 "At most one result dataset type can be returned by a query."
296 self._columns.datasets = datasets
298 def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension],
299 columnNames: Iterable[str]
300 ) -> List[sqlalchemy.sql.ColumnElement]:
301 """Begin a join on dimensions.
303 Must be followed by call to `finishJoin`.
305 Parameters
306 ----------
307 table : `sqlalchemy.sql.FromClause`
308 SQLAlchemy object representing the logical table (which may be a
309 join or subquery expression) to be joined.
310 dimensions : iterable of `Dimension`
311 The dimensions that relate this table to others that may be in the
312 query. The table must have columns with the names of the
313 dimensions.
314 columnNames : iterable of `str`
315 Names of the columns that correspond to dimension key values; must
316 be `zip` iterable with ``dimensions``.
318 Returns
319 -------
320 joinOn : `list` of `sqlalchemy.sql.ColumnElement`
321 Sequence of boolean expressions that should be combined with AND
322 to form (part of) the ON expression for this JOIN.
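
        Notes
        -----
        A short sketch of the two-step join protocol; ``table`` and ``dims``
        are placeholders for a SQLAlchemy table and the `Dimension` objects
        whose key columns it contains::

            joinOn = builder.startJoin(table, dims, [d.name for d in dims])
            # Extra ON-clause terms may be appended to joinOn here.
            builder.finishJoin(table, joinOn)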
323 """
324 joinOn = []
325 for dimension, columnName in zip(dimensions, columnNames):
326 columnInTable = table.columns[columnName]
327 columnsInQuery = self._columns.keys.setdefault(dimension, [])
328 for columnInQuery in columnsInQuery:
329 joinOn.append(columnInQuery == columnInTable)
330 columnsInQuery.append(columnInTable)
331 return joinOn
333 def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
334 ) -> None:
335 """Complete a join on dimensions.
337 Must be preceded by call to `startJoin`.
339 Parameters
340 ----------
341 table : `sqlalchemy.sql.FromClause`
342 SQLAlchemy object representing the logical table (which may be a
343 join or subquery expression) to be joined. Must be the same object
344 passed to `startJoin`.
345 joinOn : `list` of `sqlalchemy.sql.ColumnElement`
346 Sequence of boolean expressions that should be combined with AND
347 to form (part of) the ON expression for this JOIN. Should include
348 at least the elements of the list returned by `startJoin`.
349 """
350 onclause: Optional[sqlalchemy.sql.ColumnElement]
351 if len(joinOn) == 0:
352 onclause = None
353 elif len(joinOn) == 1:
354 onclause = joinOn[0]
355 else:
356 onclause = sqlalchemy.sql.and_(*joinOn)
357 self._simpleQuery.join(table, onclause=onclause)

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined. For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        if self.summary.where.tree is not None:
            self._simpleQuery.where.append(
                convertExpressionToSql(
                    self.summary.where.tree,
                    self.summary.universe,
                    columns=self._columns,
                    elements=self._elements,
                    bind=self.summary.where.bind,
                    TimespanReprClass=self._managers.TimespanReprClass,
                )
            )
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key. This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a data ID with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(
                    intervalInQuery.overlaps(self._managers.TimespanReprClass.fromLiteral(givenInterval))
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with). `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret result
            rows.
        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        if self._columns.isEmpty():
            return EmptyQuery(self.summary.requested.universe, managers=self._managers)
        return DirectQuery(graph=self.summary.requested,
                           uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
                           whereRegion=self.summary.where.dataId.region,
                           simpleQuery=self._simpleQuery,
                           columns=self._columns,
                           managers=self._managers)