Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 9%
Shortcuts on this page:
r m x p : toggle line displays
j k : next/prev highlighted chunk
0 (zero) : top of page
1 (one) : first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("QueryBuilder",)
25from typing import AbstractSet, Any, Iterable, List, Optional
27import sqlalchemy.sql
29from ...core import DatasetType, Dimension, DimensionElement, SimpleQuery, SkyPixDimension
30from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet
31from .._collectionType import CollectionType
32from ..wildcards import CollectionQuery, CollectionSearch
33from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, OrderByColumn, Query
34from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers
35from .expressions import convertExpressionToSql
class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """

    def __init__(self, summary: QuerySummary, managers: RegistryManagers, doomed_by: Iterable[str] = ()):
        self.summary = summary
        # Accumulates the FROM / JOIN / WHERE state as tables are added.
        self._simpleQuery = SimpleQuery()
        # Maps each dimension element already joined to its FROM clause, so
        # an element is never joined twice (asserted in joinDimensionElement).
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        # Logical columns (dimension keys, regions, timespans, dataset
        # columns) gathered so far.
        self._columns = QueryColumns()
        self._managers = managers
        # Copy into a list so later appends cannot mutate the caller's
        # iterable and vice versa.
        self._doomed_by = list(doomed_by)
63 def hasDimensionKey(self, dimension: Dimension) -> bool:
64 """Return `True` if the given dimension's primary key column has
65 been included in the query (possibly via a foreign key column on some
66 other table).
67 """
68 return dimension in self._columns.keys
70 def joinDimensionElement(self, element: DimensionElement) -> None:
71 """Add the table for a `DimensionElement` to the query.
73 This automatically joins the element table to all other tables in the
74 query with which it is related, via both dimension keys and spatial
75 and temporal relationships.
77 External calls to this method should rarely be necessary; `finish` will
78 automatically call it if the `DimensionElement` has been identified as
79 one that must be included.
81 Parameters
82 ----------
83 element : `DimensionElement`
84 Element for which a table should be added. The element must be
85 associated with a database table (see `DimensionElement.hasTable`).
86 """
87 assert element not in self._elements, "Element already included in query."
88 storage = self._managers.dimensions[element]
89 fromClause = storage.join(
90 self,
91 regions=self._columns.regions if element in self.summary.spatial else None,
92 timespans=self._columns.timespans if element in self.summary.temporal else None,
93 )
94 self._elements[element] = fromClause
96 def joinDataset(
97 self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
98 ) -> bool:
99 """Add a dataset search or constraint to the query.
101 Unlike other `QueryBuilder` join methods, this *must* be called
102 directly to search for datasets of a particular type or constrain the
103 query results based on the exists of datasets. However, all dimensions
104 used to identify the dataset type must have already been included in
105 `QuerySummary.requested` when initializing the `QueryBuilder`.
107 Parameters
108 ----------
109 datasetType : `DatasetType`
110 The type of datasets to search for.
111 collections : `Any`
112 An expression that fully or partially identifies the collections
113 to search for datasets, such as a `str`, `re.Pattern`, or iterable
114 thereof. `...` can be used to return all collections. See
115 :ref:`daf_butler_collection_expressions` for more information.
116 isResult : `bool`, optional
117 If `True` (default), include the dataset ID column in the
118 result columns of the query, allowing complete `DatasetRef`
119 instances to be produced from the query results for this dataset
120 type. If `False`, the existence of datasets of this type is used
121 only to constrain the data IDs returned by the query.
122 `joinDataset` may be called with ``isResult=True`` at most one time
123 on a particular `QueryBuilder` instance.
124 findFirst : `bool`, optional
125 If `True` (`False` is default), only include the first match for
126 each data ID, searching the given collections in order. Requires
127 that all entries in ``collections`` be regular strings, so there is
128 a clear search order. Ignored if ``isResult`` is `False`.
130 Returns
131 -------
132 anyRecords : `bool`
133 If `True`, joining the dataset table was successful and the query
134 should proceed. If `False`, we were able to determine (from the
135 combination of ``datasetType`` and ``collections``) that there
136 would be no results joined in from this dataset, and hence (due to
137 the inner join that would normally be present), the full query will
138 return no results.
139 """
140 assert datasetType in self.summary.datasets
141 if isResult and findFirst:
142 collections = CollectionSearch.fromExpression(collections)
143 else:
144 collections = CollectionQuery.fromExpression(collections)
145 explicitCollections = frozenset(collections.explicitNames())
146 # If we are searching all collections with no constraints, loop over
147 # RUN collections only, because that will include all datasets.
148 collectionTypes: AbstractSet[CollectionType]
149 if collections == CollectionQuery():
150 collectionTypes = {CollectionType.RUN}
151 else:
152 collectionTypes = CollectionType.all()
153 datasetRecordStorage = self._managers.datasets.find(datasetType.name)
154 if datasetRecordStorage is None:
155 # Unrecognized dataset type means no results. It might be better
156 # to raise here, but this is consistent with previous behavior,
157 # which is expected by QuantumGraph generation code in pipe_base.
158 self._doomed_by.append(
159 f"Dataset type {datasetType.name!r} is not registered, so no instances of it can exist in "
160 "any collection."
161 )
162 return False
163 subsubqueries = []
164 runKeyName = self._managers.collections.getRunForeignKeyName()
165 baseColumnNames = {"id", runKeyName, "ingest_date"} if isResult else set()
166 baseColumnNames.update(datasetType.dimensions.required.names)
167 if not findFirst:
168 calibration_collections = []
169 other_collections = []
170 rejections: List[str] = []
171 for rank, collectionRecord in enumerate(
172 collections.iter(self._managers.collections, collectionTypes=collectionTypes)
173 ):
174 # Only include collections that (according to collection summaries)
175 # might have datasets of this type and governor dimensions
176 # consistent with the query's WHERE clause.
177 collection_summary = self._managers.datasets.getCollectionSummary(collectionRecord)
178 if not collection_summary.is_compatible_with(
179 datasetType,
180 self.summary.where.restriction,
181 rejections=rejections,
182 name=collectionRecord.name,
183 ):
184 continue
185 if collectionRecord.type is CollectionType.CALIBRATION:
186 # If collection name was provided explicitly then say sorry,
187 # otherwise collection is a part of chained one and we skip it.
188 if datasetType.isCalibration() and collectionRecord.name in explicitCollections:
189 if self.summary.temporal or self.summary.mustHaveKeysJoined.temporal:
190 raise NotImplementedError(
191 f"Temporal query for dataset type '{datasetType.name}' in CALIBRATION-type "
192 f"collection '{collectionRecord.name}' is not yet supported."
193 )
194 elif findFirst:
195 raise NotImplementedError(
196 f"Find-first query for dataset type '{datasetType.name}' in CALIBRATION-type "
197 f"collection '{collectionRecord.name}' is not yet supported."
198 )
199 else:
200 calibration_collections.append(collectionRecord)
201 else:
202 # We can never find a non-calibration dataset in a
203 # CALIBRATION collection.
204 rejections.append(
205 f"Not searching for non-calibration dataset {datasetType.name!r} "
206 f"in CALIBRATION collection {collectionRecord.name!r}."
207 )
208 continue
209 elif findFirst:
210 # If findFirst=True, each collection gets its own subquery so
211 # we can add a literal rank for it.
212 ssq = datasetRecordStorage.select(
213 collectionRecord,
214 dataId=SimpleQuery.Select,
215 id=SimpleQuery.Select if isResult else None,
216 run=SimpleQuery.Select if isResult else None,
217 ingestDate=SimpleQuery.Select if isResult else None,
218 )
219 assert {c.name for c in ssq.columns} == baseColumnNames
220 ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
221 subsubqueries.append(ssq.combine())
222 else:
223 # If findFirst=False, we have one subquery for all CALIBRATION
224 # collections and one subquery for all other collections; we'll
225 # assemble those later after grouping by collection type.
226 other_collections.append(collectionRecord)
227 if not findFirst:
228 if other_collections:
229 ssq = datasetRecordStorage.select(
230 *other_collections,
231 dataId=SimpleQuery.Select,
232 id=SimpleQuery.Select if isResult else None,
233 run=SimpleQuery.Select if isResult else None,
234 ingestDate=SimpleQuery.Select if isResult else None,
235 )
236 subsubqueries.append(ssq.combine())
237 if calibration_collections:
238 ssq = datasetRecordStorage.select(
239 *calibration_collections,
240 dataId=SimpleQuery.Select,
241 id=SimpleQuery.Select if isResult else None,
242 run=SimpleQuery.Select if isResult else None,
243 ingestDate=SimpleQuery.Select if isResult else None,
244 )
245 subsubqueries.append(ssq.combine())
246 if not subsubqueries:
247 if rejections:
248 self._doomed_by.extend(rejections)
249 else:
250 self._doomed_by.append(f"No collections to search matching expression {collections}.")
251 # Make a single subquery with no collections that never yields
252 # results; this should never get executed, but downstream code
253 # still needs to access the SQLAlchemy column objects.
254 ssq = datasetRecordStorage.select(
255 dataId=SimpleQuery.Select,
256 id=SimpleQuery.Select if isResult else None,
257 run=SimpleQuery.Select if isResult else None,
258 ingestDate=SimpleQuery.Select if isResult else None,
259 )
260 if findFirst:
261 ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
262 subsubqueries.append(ssq.combine())
263 # Although one would expect that these subqueries can be
264 # UNION ALL instead of UNION because each subquery is already
265 # distinct, it turns out that with many
266 # subqueries this causes catastrophic performance problems
267 # with both sqlite and postgres. Using UNION may require
268 # more table scans, but a much simpler query plan given our
269 # table structures. See DM-31429.
270 subquery = sqlalchemy.sql.union(*subsubqueries)
271 columns: Optional[DatasetQueryColumns] = None
272 if isResult:
273 if findFirst:
274 # Rewrite the subquery (currently a UNION ALL over
275 # per-collection subsubqueries) to select the rows with the
276 # lowest rank per data ID. The block below will set subquery
277 # to something like this:
278 #
279 # WITH {dst}_search AS (
280 # SELECT {data-id-cols}, id, run_id, 1 AS rank
281 # FROM <collection1>
282 # UNION ALL
283 # SELECT {data-id-cols}, id, run_id, 2 AS rank
284 # FROM <collection2>
285 # UNION ALL
286 # ...
287 # )
288 # SELECT
289 # {dst}_window.{data-id-cols},
290 # {dst}_window.id,
291 # {dst}_window.run_id
292 # FROM (
293 # SELECT
294 # {dst}_search.{data-id-cols},
295 # {dst}_search.id,
296 # {dst}_search.run_id,
297 # ROW_NUMBER() OVER (
298 # PARTITION BY {dst_search}.{data-id-cols}
299 # ORDER BY rank
300 # ) AS rownum
301 # ) {dst}_window
302 # WHERE
303 # {dst}_window.rownum = 1;
304 #
305 search = subquery.cte(f"{datasetType.name}_search")
306 windowDataIdCols = [
307 search.columns[name].label(name) for name in datasetType.dimensions.required.names
308 ]
309 windowSelectCols = [
310 search.columns["id"].label("id"),
311 search.columns[runKeyName].label(runKeyName),
312 search.columns["ingest_date"].label("ingest_date"),
313 ]
314 windowSelectCols += windowDataIdCols
315 assert {c.name for c in windowSelectCols} == baseColumnNames
316 windowSelectCols.append(
317 sqlalchemy.sql.func.row_number()
318 .over(partition_by=windowDataIdCols, order_by=search.columns["rank"])
319 .label("rownum")
320 )
321 window = (
322 sqlalchemy.sql.select(*windowSelectCols)
323 .select_from(search)
324 .alias(f"{datasetType.name}_window")
325 )
326 subquery = (
327 sqlalchemy.sql.select(*[window.columns[name].label(name) for name in baseColumnNames])
328 .select_from(window)
329 .where(window.columns["rownum"] == 1)
330 .alias(datasetType.name)
331 )
332 else:
333 subquery = subquery.alias(datasetType.name)
334 columns = DatasetQueryColumns(
335 datasetType=datasetType,
336 id=subquery.columns["id"],
337 runKey=subquery.columns[runKeyName],
338 ingestDate=subquery.columns["ingest_date"],
339 )
340 else:
341 subquery = subquery.alias(datasetType.name)
342 self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
343 return not self._doomed_by
345 def joinTable(
346 self,
347 table: sqlalchemy.sql.FromClause,
348 dimensions: NamedValueAbstractSet[Dimension],
349 *,
350 datasets: Optional[DatasetQueryColumns] = None,
351 ) -> None:
352 """Join an arbitrary table to the query via dimension relationships.
354 External calls to this method should only be necessary for tables whose
355 records represent neither datasets nor dimension elements.
357 Parameters
358 ----------
359 table : `sqlalchemy.sql.FromClause`
360 SQLAlchemy object representing the logical table (which may be a
361 join or subquery expression) to be joined.
362 dimensions : iterable of `Dimension`
363 The dimensions that relate this table to others that may be in the
364 query. The table must have columns with the names of the
365 dimensions.
366 datasets : `DatasetQueryColumns`, optional
367 Columns that identify a dataset that is part of the query results.
368 """
369 unexpectedDimensions = NamedValueSet(dimensions - self.summary.mustHaveKeysJoined.dimensions)
370 unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
371 if unexpectedDimensions:
372 raise NotImplementedError(
373 f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
374 f"were not provided originally to the QuerySummary object passed at construction."
375 )
376 joinOn = self.startJoin(table, dimensions, dimensions.names)
377 self.finishJoin(table, joinOn)
378 if datasets is not None:
379 assert (
380 self._columns.datasets is None
381 ), "At most one result dataset type can be returned by a query."
382 self._columns.datasets = datasets
384 def startJoin(
385 self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
386 ) -> List[sqlalchemy.sql.ColumnElement]:
387 """Begin a join on dimensions.
389 Must be followed by call to `finishJoin`.
391 Parameters
392 ----------
393 table : `sqlalchemy.sql.FromClause`
394 SQLAlchemy object representing the logical table (which may be a
395 join or subquery expression) to be joined.
396 dimensions : iterable of `Dimension`
397 The dimensions that relate this table to others that may be in the
398 query. The table must have columns with the names of the
399 dimensions.
400 columnNames : iterable of `str`
401 Names of the columns that correspond to dimension key values; must
402 be `zip` iterable with ``dimensions``.
404 Returns
405 -------
406 joinOn : `list` of `sqlalchemy.sql.ColumnElement`
407 Sequence of boolean expressions that should be combined with AND
408 to form (part of) the ON expression for this JOIN.
409 """
410 joinOn = []
411 for dimension, columnName in zip(dimensions, columnNames):
412 columnInTable = table.columns[columnName]
413 columnsInQuery = self._columns.keys.setdefault(dimension, [])
414 for columnInQuery in columnsInQuery:
415 joinOn.append(columnInQuery == columnInTable)
416 columnsInQuery.append(columnInTable)
417 return joinOn
419 def finishJoin(
420 self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
421 ) -> None:
422 """Complete a join on dimensions.
424 Must be preceded by call to `startJoin`.
426 Parameters
427 ----------
428 table : `sqlalchemy.sql.FromClause`
429 SQLAlchemy object representing the logical table (which may be a
430 join or subquery expression) to be joined. Must be the same object
431 passed to `startJoin`.
432 joinOn : `list` of `sqlalchemy.sql.ColumnElement`
433 Sequence of boolean expressions that should be combined with AND
434 to form (part of) the ON expression for this JOIN. Should include
435 at least the elements of the list returned by `startJoin`.
436 """
437 onclause: Optional[sqlalchemy.sql.ColumnElement]
438 if len(joinOn) == 0:
439 onclause = None
440 elif len(joinOn) == 1:
441 onclause = joinOn[0]
442 else:
443 onclause = sqlalchemy.sql.and_(*joinOn)
444 self._simpleQuery.join(table, onclause=onclause)
446 def _joinMissingDimensionElements(self) -> None:
447 """Join all dimension element tables that were identified as necessary
448 by `QuerySummary` and have not yet been joined.
450 For internal use by `QueryBuilder` only; will be called (and should
451 only by called) by `finish`.
452 """
453 # Join all DimensionElement tables that we need for spatial/temporal
454 # joins/filters or a nontrivial WHERE expression.
455 # We iterate over these in *reverse* topological order to minimize the
456 # number of tables joined. For example, the "visit" table provides
457 # the primary key value for the "instrument" table it depends on, so we
458 # don't need to join "instrument" as well unless we had a nontrivial
459 # expression on it (and hence included it already above).
460 for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
461 self.joinDimensionElement(element)
462 # Join in any requested Dimension tables that don't already have their
463 # primary keys identified by the query.
464 for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
465 if dimension not in self._columns.keys:
466 self.joinDimensionElement(dimension)
    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only by called) by `finish`.
        """
        # 1. Translate the user's expression tree (if any) into SQL against
        # the columns and element tables joined so far.
        if self.summary.where.tree is not None:
            self._simpleQuery.where.append(
                convertExpressionToSql(
                    self.summary.where.tree,
                    self.summary.universe,
                    columns=self._columns,
                    elements=self._elements,
                    bind=self.summary.where.bind,
                    TimespanReprClass=self._managers.TimespanReprClass,
                )
            )
        # 2. Constrain dimension key columns from the given data ID.
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now; constrain the skypix ID column
                    # to the pixel-index ranges that envelope that region.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # 3. If we are given a data ID with a timespan, and there are one or
        # more timespans in the query that aren't given, add a WHERE (overlap)
        # expression for each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                # Elements already identified by the data ID should not also
                # appear here; their timespans are fully determined.
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(
                    intervalInQuery.overlaps(self._managers.TimespanReprClass.fromLiteral(givenInterval))
                )
519 def finish(self, joinMissing: bool = True) -> Query:
520 """Finish query constructing, returning a new `Query` instance.
522 Parameters
523 ----------
524 joinMissing : `bool`, optional
525 If `True` (default), automatically join any missing dimension
526 element tables (according to the categorization of the
527 `QuerySummary` the builder was constructed with). `False` should
528 only be passed if the caller can independently guarantee that all
529 dimension relationships are already captured in non-dimension
530 tables that have been manually included in the query.
532 Returns
533 -------
534 query : `Query`
535 A `Query` object that can be executed and used to interpret result
536 rows.
537 """
538 if joinMissing:
539 self._joinMissingDimensionElements()
540 self._addWhereClause()
541 if self._columns.isEmpty():
542 return EmptyQuery(
543 self.summary.requested.universe, managers=self._managers, doomed_by=self._doomed_by
544 )
545 return DirectQuery(
546 graph=self.summary.requested,
547 uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
548 whereRegion=self.summary.where.dataId.region,
549 simpleQuery=self._simpleQuery,
550 columns=self._columns,
551 order_by_columns=self._order_by_columns(),
552 limit=self.summary.limit,
553 managers=self._managers,
554 doomed_by=self._doomed_by,
555 )
557 def _order_by_columns(self) -> Iterable[OrderByColumn]:
558 """Generate columns to be used for ORDER BY clause.
560 Returns
561 -------
562 order_by_columns : `Iterable` [ `ColumnIterable` ]
563 Sequence of columns to appear in ORDER BY clause.
564 """
565 order_by_columns: List[OrderByColumn] = []
566 if not self.summary.order_by:
567 return order_by_columns
569 for order_by_column in self.summary.order_by.order_by_columns:
571 column: sqlalchemy.sql.ColumnElement
572 if order_by_column.column is None:
573 # dimension name, it has to be in SELECT list already, only
574 # add it to ORDER BY
575 assert isinstance(order_by_column.element, Dimension), "expecting full Dimension"
576 column = self._columns.getKeyColumn(order_by_column.element)
577 else:
578 table = self._elements[order_by_column.element]
580 if order_by_column.column in ("timespan.begin", "timespan.end"):
581 TimespanReprClass = self._managers.TimespanReprClass
582 timespan_repr = TimespanReprClass.fromSelectable(table)
583 if order_by_column.column == "timespan.begin":
584 column = timespan_repr.lower()
585 label = f"{order_by_column.element.name}_timespan_begin"
586 else:
587 column = timespan_repr.upper()
588 label = f"{order_by_column.element.name}_timespan_end"
589 else:
590 column = table.columns[order_by_column.column]
591 # make a unique label for it
592 label = f"{order_by_column.element.name}_{order_by_column.column}"
594 column = column.label(label)
596 order_by_columns.append(OrderByColumn(column=column, ordering=order_by_column.ordering))
598 return order_by_columns