Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 9%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

import dataclasses
from typing import AbstractSet, Any, Iterable, List, Optional

import sqlalchemy.sql

from ...core import (
    DimensionElement,
    SkyPixDimension,
    Dimension,
    DatasetType,
    SimpleQuery,
)

from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet
from ...core import ddl

from .._collectionType import CollectionType
from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers
from .expressions import convertExpressionToSql
from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, OrderByColumn, Query
from ..wildcards import CollectionSearch, CollectionQuery


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """
    def __init__(self, summary: QuerySummary, managers: RegistryManagers, doomed_by: Iterable[str] = ()):
        self.summary = summary
        self._simpleQuery = SimpleQuery()
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        self._columns = QueryColumns()
        self._managers = managers
        self._doomed_by = list(doomed_by)
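
    # A minimal usage sketch of the intended lifecycle of a builder (the
    # `QuerySummary` construction details are elided, and the variable names
    # below are hypothetical); construction only records state, while tables
    # are joined by the explicit `joinDataset`/`joinTable` calls and by
    # `finish` itself:
    #
    #     summary = QuerySummary(requested=datasetType.dimensions)
    #     builder = QueryBuilder(summary, managers)
    #     builder.joinDataset(datasetType, collections=..., isResult=True)
    #     query = builder.finish()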

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish`
        will automatically call it if the `DimensionElement` has been
        identified as one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._managers.dimensions[element]
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause
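
    # For example (a hedged sketch; element names depend on the configured
    # dimension universe), joining the "visit" element when the query is
    # spatial attaches the visit table with its region column registered in
    # ``self._columns.regions``, so later spatial-overlap terms can refer to
    # it:
    #
    #     builder.joinDimensionElement(universe["visit"])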

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, findFirst: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  `...` can be used to return all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most once
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.  Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order.  Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query
            will return no results.
        """
        assert datasetType in self.summary.datasets
        if isResult and findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        explicitCollections = frozenset(collections.explicitNames())
        # If we are searching all collections with no constraints, loop over
        # RUN collections only, because that will include all datasets.
        collectionTypes: AbstractSet[CollectionType]
        if collections == CollectionQuery():
            collectionTypes = {CollectionType.RUN}
        else:
            collectionTypes = CollectionType.all()
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            self._doomed_by.append(
                f"Dataset type {datasetType.name!r} is not registered, so no instances of it can exist in "
                "any collection."
            )
            return False
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        baseColumnNames = {"id", runKeyName, "ingest_date"} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        if not findFirst:
            calibration_collections = []
            other_collections = []
        rejections: List[str] = []
        for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
                                                                 collectionTypes=collectionTypes)):
            # Only include collections that (according to collection
            # summaries) might have datasets of this type and governor
            # dimensions consistent with the query's WHERE clause.
            collection_summary = self._managers.datasets.getCollectionSummary(collectionRecord)
            if not collection_summary.is_compatible_with(
                datasetType,
                self.summary.where.restriction,
                rejections=rejections,
                name=collectionRecord.name,
            ):
                continue
            if collectionRecord.type is CollectionType.CALIBRATION:
                # If the collection was named explicitly, complain about the
                # unsupported query types below; if it is only included via a
                # chained collection, skip it.
                if datasetType.isCalibration() and collectionRecord.name in explicitCollections:
                    if self.summary.temporal or self.summary.mustHaveKeysJoined.temporal:
                        raise NotImplementedError(
                            f"Temporal query for dataset type '{datasetType.name}' in CALIBRATION-type "
                            f"collection '{collectionRecord.name}' is not yet supported."
                        )
                    elif findFirst:
                        raise NotImplementedError(
                            f"Find-first query for dataset type '{datasetType.name}' in CALIBRATION-type "
                            f"collection '{collectionRecord.name}' is not yet supported."
                        )
                    else:
                        calibration_collections.append(collectionRecord)
                else:
                    # We can never find a non-calibration dataset in a
                    # CALIBRATION collection.
                    rejections.append(
                        f"Not searching for non-calibration dataset {datasetType.name!r} "
                        f"in CALIBRATION collection {collectionRecord.name!r}."
                    )
                    continue
            elif findFirst:
                # If findFirst=True, each collection gets its own subquery so
                # we can add a literal rank for it.
                ssq = datasetRecordStorage.select(
                    collectionRecord,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                assert {c.name for c in ssq.columns} == baseColumnNames
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
                subsubqueries.append(ssq.combine())
            else:
                # If findFirst=False, we have one subquery for all CALIBRATION
                # collections and one subquery for all other collections;
                # we'll assemble those later after grouping by collection
                # type.
                other_collections.append(collectionRecord)
        if not findFirst:
            if other_collections:
                ssq = datasetRecordStorage.select(
                    *other_collections,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                subsubqueries.append(ssq.combine())
            if calibration_collections:
                ssq = datasetRecordStorage.select(
                    *calibration_collections,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                subsubqueries.append(ssq.combine())
        if not subsubqueries:
            if rejections:
                self._doomed_by.extend(rejections)
            else:
                self._doomed_by.append(f"No collections to search matching expression {collections}.")
            # Make a single subquery with no collections that never yields
            # results; this should never get executed, but downstream code
            # still needs to access the SQLAlchemy column objects.
            ssq = datasetRecordStorage.select(
                dataId=SimpleQuery.Select,
                id=SimpleQuery.Select if isResult else None,
                run=SimpleQuery.Select if isResult else None,
                ingestDate=SimpleQuery.Select if isResult else None,
            )
            if findFirst:
                # Use a constant rank here; the loop above may not have run,
                # so its loop variable may be unbound, and this query never
                # yields rows anyway.
                ssq.columns.append(sqlalchemy.sql.literal(0).label("rank"))
            subsubqueries.append(ssq.combine())
        # Although one would expect that these subqueries could be combined
        # with UNION ALL instead of UNION because each subquery is already
        # distinct, it turns out that with many subqueries UNION ALL causes
        # catastrophic performance problems with both sqlite and postgres.
        # Using UNION may require more table scans, but it yields a much
        # simpler query plan given our table structures.  See DM-31429.
        subquery = sqlalchemy.sql.union(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if findFirst:
                # Rewrite the subquery (currently a UNION over per-collection
                # subsubqueries) to select the rows with the lowest rank per
                # data ID.  The block below will set subquery to something
                # like this:
                #
                # WITH {dst}_search AS (
                #     SELECT {data-id-cols}, id, run_id, 1 AS rank
                #         FROM <collection1>
                #     UNION
                #     SELECT {data-id-cols}, id, run_id, 2 AS rank
                #         FROM <collection2>
                #     UNION
                #     ...
                # )
                # SELECT
                #     {dst}_window.{data-id-cols},
                #     {dst}_window.id,
                #     {dst}_window.run_id
                # FROM (
                #     SELECT
                #         {dst}_search.{data-id-cols},
                #         {dst}_search.id,
                #         {dst}_search.run_id,
                #         ROW_NUMBER() OVER (
                #             PARTITION BY {dst}_search.{data-id-cols}
                #             ORDER BY rank
                #         ) AS rownum
                #     FROM {dst}_search
                # ) {dst}_window
                # WHERE
                #     {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName),
                    search.columns["ingest_date"].label("ingest_date"),
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number().over(
                        partition_by=windowDataIdCols,
                        order_by=search.columns["rank"]
                    ).label("rownum")
                )
                window = sqlalchemy.sql.select(
                    *windowSelectCols
                ).select_from(search).alias(
                    f"{datasetType.name}_window"
                )
                subquery = sqlalchemy.sql.select(
                    *[window.columns[name].label(name) for name in baseColumnNames]
                ).select_from(
                    window
                ).where(
                    window.columns["rownum"] == 1
                ).alias(datasetType.name)
            else:
                subquery = subquery.alias(datasetType.name)
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
                ingestDate=subquery.columns["ingest_date"],
            )
        else:
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return not self._doomed_by
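
    # A hedged example of the two modes (dataset types and collection names
    # are hypothetical): a find-first search that yields result columns, and
    # a pure existence constraint that only restricts the data IDs:
    #
    #     builder.joinDataset(rawType, ["HSC/raw/all"], findFirst=True)
    #     builder.joinDataset(maskType, ..., isResult=False)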

    def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueAbstractSet[Dimension], *,
                  datasets: Optional[DatasetQueryColumns] = None) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables
        whose records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in
            the query.  The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query results.
        """
        unexpectedDimensions = NamedValueSet(dimensions - self.summary.mustHaveKeysJoined.dimensions)
        unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
        if unexpectedDimensions:
            raise NotImplementedError(
                f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
                f"were not provided originally to the QuerySummary object passed at construction."
            )
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)
        if datasets is not None:
            assert self._columns.datasets is None, \
                "At most one result dataset type can be returned by a query."
            self._columns.datasets = datasets

    def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension],
                  columnNames: Iterable[str]
                  ) -> List[sqlalchemy.sql.ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by a call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in
            the query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
                   ) -> None:
        """Complete a join on dimensions.

        Must be preceded by a call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same
            object passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should include
            at least the elements of the list returned by `startJoin`.
        """
        onclause: Optional[sqlalchemy.sql.ColumnElement]
        if len(joinOn) == 0:
            onclause = None
        elif len(joinOn) == 1:
            onclause = joinOn[0]
        else:
            onclause = sqlalchemy.sql.and_(*joinOn)
        self._simpleQuery.join(table, onclause=onclause)
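
    # A hedged sketch of the two-phase protocol for callers that need to add
    # custom ON terms before the join is emitted (`myTable` and the extra
    # "flag" condition are hypothetical):
    #
    #     joinOn = builder.startJoin(myTable, dims, dims.names)
    #     joinOn.append(myTable.columns["flag"].is_(True))
    #     builder.finishJoin(myTable, joinOn)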

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined.  For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so
        # we don't need to join "instrument" as well unless we have a
        # nontrivial expression on it (in which case it was already included
        # above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        if self.summary.where.tree is not None:
            self._simpleQuery.where.append(
                convertExpressionToSql(
                    self.summary.where.tree,
                    self.summary.universe,
                    columns=self._columns,
                    elements=self._elements,
                    bind=self.summary.where.bind,
                    TimespanReprClass=self._managers.TimespanReprClass,
                )
            )
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints give the
                # database's query optimizer a better chance of simplifying
                # the plan.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a data ID with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(
                    intervalInQuery.overlaps(self._managers.TimespanReprClass.fromLiteral(givenInterval))
                )
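
    # A hedged illustration of the skypix branch above: envelope() yields
    # half-open index ranges, which are flattened into an IN list (dimension
    # name and values hypothetical):
    #
    #     envelope(region) -> [(1000, 1003), (1017, 1018)]
    #     WHERE htm7 IN (1000, 1001, 1002, 1017)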

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret
            result rows.
        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        if self._columns.isEmpty():
            return EmptyQuery(self.summary.requested.universe, managers=self._managers,
                              doomed_by=self._doomed_by)
        return DirectQuery(graph=self.summary.requested,
                           uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
                           whereRegion=self.summary.where.dataId.region,
                           simpleQuery=self._simpleQuery,
                           columns=self._columns,
                           order_by_columns=self._order_by_columns(),
                           limit=self.summary.limit,
                           managers=self._managers,
                           doomed_by=self._doomed_by)

    def _order_by_columns(self) -> Iterable[OrderByColumn]:
        """Generate columns to be used for ORDER BY clause.

        Returns
        -------
        order_by_columns : `Iterable` [ `OrderByColumn` ]
            Sequence of columns to appear in ORDER BY clause.
        """
        order_by_columns: List[OrderByColumn] = []
        if not self.summary.order_by:
            return order_by_columns

        for order_by_column in self.summary.order_by.order_by_columns:

            column: sqlalchemy.sql.ColumnElement
            field_spec: Optional[ddl.FieldSpec]
            dimension: Optional[Dimension] = None
            if order_by_column.column is None:
                # A plain dimension name; its key column is already in the
                # SELECT list, so we only need to add it to ORDER BY.
                assert isinstance(order_by_column.element, Dimension), "expecting full Dimension"
                column = self._columns.getKeyColumn(order_by_column.element)
                add_to_select = False
                field_spec = None
                dimension = order_by_column.element
            else:
                table = self._elements[order_by_column.element]

                if order_by_column.column in ("timespan.begin", "timespan.end"):
                    TimespanReprClass = self._managers.TimespanReprClass
                    timespan_repr = TimespanReprClass.fromSelectable(table)
                    if order_by_column.column == "timespan.begin":
                        column = timespan_repr.lower()
                        label = f"{order_by_column.element.name}_timespan_begin"
                    else:
                        column = timespan_repr.upper()
                        label = f"{order_by_column.element.name}_timespan_end"
                    field_spec = ddl.FieldSpec(label, dtype=sqlalchemy.BigInteger, nullable=True)
                else:
                    column = table.columns[order_by_column.column]
                    # Make a unique label for the column.
                    label = f"{order_by_column.element.name}_{order_by_column.column}"
                    field_spec = order_by_column.element.RecordClass.fields.facts[order_by_column.column]
                    field_spec = dataclasses.replace(field_spec, name=label)

                column = column.label(label)
                add_to_select = True

            order_by_columns.append(
                OrderByColumn(column=column, ordering=order_by_column.ordering,
                              add_to_select=add_to_select, field_spec=field_spec,
                              dimension=dimension)
            )

        return order_by_columns
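
    # A hedged example of the labeling scheme above (element and column names
    # hypothetical): ordering on "visit.timespan.begin" sorts on the lower
    # bound of the visit table's timespan under the unique label
    # "visit_timespan_begin", while an ordinary metadata column such as
    # "visit.exposure_time" is labeled "visit_exposure_time" and added to the
    # SELECT list so the ORDER BY clause can reference it.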