# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

import dataclasses
from typing import AbstractSet, Any, Iterable, List, Optional

import sqlalchemy.sql

from ...core import DatasetType, Dimension, DimensionElement, SimpleQuery, SkyPixDimension, ddl
from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet
from .._collectionType import CollectionType
from ..wildcards import CollectionQuery, CollectionSearch
from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, OrderByColumn, Query
from ._structs import DatasetQueryColumns, QueryColumns, QuerySummary, RegistryManagers
from .expressions import convertExpressionToSql


class QueryBuilder:
40 """A builder for potentially complex queries that join tables based
41 on dimension relationships.
43 Parameters
44 ----------
45 summary : `QuerySummary`
46 Struct organizing the dimensions involved in the query.
47 managers : `RegistryManagers`
48 A struct containing the registry manager instances used by the query
49 system.
50 doomed_by : `Iterable` [ `str` ], optional
51 A list of messages (appropriate for e.g. logging or exceptions) that
52 explain why the query is known to return no results even before it is
53 executed. Queries with a non-empty list will never be executed.
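
    Examples
    --------
    A minimal usage sketch; constructing the `QuerySummary` and
    `RegistryManagers` structs is elided here, and ``datasetType`` is a
    hypothetical placeholder::

        builder = QueryBuilder(summary, managers)
        builder.joinDataset(datasetType, collections=..., isResult=True)
        query = builder.finish()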
54 """
56 def __init__(self, summary: QuerySummary, managers: RegistryManagers, doomed_by: Iterable[str] = ()):
57 self.summary = summary
58 self._simpleQuery = SimpleQuery()
59 self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
60 self._columns = QueryColumns()
61 self._managers = managers
62 self._doomed_by = list(doomed_by)
64 def hasDimensionKey(self, dimension: Dimension) -> bool:
65 """Return `True` if the given dimension's primary key column has
66 been included in the query (possibly via a foreign key column on some
67 other table).
68 """
69 return dimension in self._columns.keys
71 def joinDimensionElement(self, element: DimensionElement) -> None:
72 """Add the table for a `DimensionElement` to the query.
74 This automatically joins the element table to all other tables in the
75 query with which it is related, via both dimension keys and spatial
76 and temporal relationships.
78 External calls to this method should rarely be necessary; `finish` will
79 automatically call it if the `DimensionElement` has been identified as
80 one that must be included.
82 Parameters
83 ----------
84 element : `DimensionElement`
85 Element for which a table should be added. The element must be
86 associated with a database table (see `DimensionElement.hasTable`).
87 """
88 assert element not in self._elements, "Element already included in query."
89 storage = self._managers.dimensions[element]
90 fromClause = storage.join(
91 self,
92 regions=self._columns.regions if element in self.summary.spatial else None,
93 timespans=self._columns.timespans if element in self.summary.temporal else None,
94 )
95 self._elements[element] = fromClause
97 def joinDataset(
98 self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
99 ) -> bool:
100 """Add a dataset search or constraint to the query.
102 Unlike other `QueryBuilder` join methods, this *must* be called
103 directly to search for datasets of a particular type or constrain the
104 query results based on the exists of datasets. However, all dimensions
105 used to identify the dataset type must have already been included in
106 `QuerySummary.requested` when initializing the `QueryBuilder`.
108 Parameters
109 ----------
110 datasetType : `DatasetType`
111 The type of datasets to search for.
112 collections : `Any`
113 An expression that fully or partially identifies the collections
114 to search for datasets, such as a `str`, `re.Pattern`, or iterable
115 thereof. `...` can be used to return all collections. See
116 :ref:`daf_butler_collection_expressions` for more information.
117 isResult : `bool`, optional
118 If `True` (default), include the dataset ID column in the
119 result columns of the query, allowing complete `DatasetRef`
120 instances to be produced from the query results for this dataset
121 type. If `False`, the existence of datasets of this type is used
122 only to constrain the data IDs returned by the query.
123 `joinDataset` may be called with ``isResult=True`` at most one time
124 on a particular `QueryBuilder` instance.
125 findFirst : `bool`, optional
126 If `True` (`False` is default), only include the first match for
127 each data ID, searching the given collections in order. Requires
128 that all entries in ``collections`` be regular strings, so there is
129 a clear search order. Ignored if ``isResult`` is `False`.
131 Returns
132 -------
133 anyRecords : `bool`
134 If `True`, joining the dataset table was successful and the query
135 should proceed. If `False`, we were able to determine (from the
136 combination of ``datasetType`` and ``collections``) that there
137 would be no results joined in from this dataset, and hence (due to
138 the inner join that would normally be present), the full query will
139 return no results.
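
        Examples
        --------
        A hedged sketch; ``datasetType`` and the collection names are
        hypothetical placeholders::

            # Constrain the query to data IDs with a dataset in either
            # collection, without returning dataset IDs:
            builder.joinDataset(datasetType, ["run1", "run2"], isResult=False)

            # Search the same collections in order, returning only the
            # first match for each data ID:
            builder.joinDataset(datasetType, ["run1", "run2"], findFirst=True)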
140 """
141 assert datasetType in self.summary.datasets
142 if isResult and findFirst:
143 collections = CollectionSearch.fromExpression(collections)
144 else:
145 collections = CollectionQuery.fromExpression(collections)
146 explicitCollections = frozenset(collections.explicitNames())
147 # If we are searching all collections with no constraints, loop over
148 # RUN collections only, because that will include all datasets.
149 collectionTypes: AbstractSet[CollectionType]
150 if collections == CollectionQuery():
151 collectionTypes = {CollectionType.RUN}
152 else:
153 collectionTypes = CollectionType.all()
154 datasetRecordStorage = self._managers.datasets.find(datasetType.name)
155 if datasetRecordStorage is None:
156 # Unrecognized dataset type means no results. It might be better
157 # to raise here, but this is consistent with previous behavior,
158 # which is expected by QuantumGraph generation code in pipe_base.
159 self._doomed_by.append(
160 f"Dataset type {datasetType.name!r} is not registered, so no instances of it can exist in "
161 "any collection."
162 )
163 return False
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        baseColumnNames = {"id", runKeyName, "ingest_date"} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        if not findFirst:
            calibration_collections = []
            other_collections = []
        rejections: List[str] = []
        for rank, collectionRecord in enumerate(
            collections.iter(self._managers.collections, collectionTypes=collectionTypes)
        ):
            # Only include collections that (according to collection
            # summaries) might have datasets of this type and governor
            # dimensions consistent with the query's WHERE clause.
            collection_summary = self._managers.datasets.getCollectionSummary(collectionRecord)
            if not collection_summary.is_compatible_with(
                datasetType,
                self.summary.where.restriction,
                rejections=rejections,
                name=collectionRecord.name,
            ):
                continue
            if collectionRecord.type is CollectionType.CALIBRATION:
                # If the collection name was provided explicitly, raise to
                # report the unsupported case; otherwise the collection was
                # reached through a CHAINED collection and we just skip it.
                if datasetType.isCalibration() and collectionRecord.name in explicitCollections:
                    if self.summary.temporal or self.summary.mustHaveKeysJoined.temporal:
                        raise NotImplementedError(
                            f"Temporal query for dataset type '{datasetType.name}' in CALIBRATION-type "
                            f"collection '{collectionRecord.name}' is not yet supported."
                        )
                    elif findFirst:
                        raise NotImplementedError(
                            f"Find-first query for dataset type '{datasetType.name}' in CALIBRATION-type "
                            f"collection '{collectionRecord.name}' is not yet supported."
                        )
                    else:
                        calibration_collections.append(collectionRecord)
                else:
                    # We can never find a non-calibration dataset in a
                    # CALIBRATION collection.
                    rejections.append(
                        f"Not searching for non-calibration dataset {datasetType.name!r} "
                        f"in CALIBRATION collection {collectionRecord.name!r}."
                    )
                    continue
            elif findFirst:
                # If findFirst=True, each collection gets its own subquery
                # so we can add a literal rank for it.
                ssq = datasetRecordStorage.select(
                    collectionRecord,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                assert {c.name for c in ssq.columns} == baseColumnNames
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
                subsubqueries.append(ssq.combine())
            else:
                # If findFirst=False, we have one subquery for all
                # CALIBRATION collections and one subquery for all other
                # collections; we'll assemble those later after grouping by
                # collection type.
                other_collections.append(collectionRecord)
        if not findFirst:
            if other_collections:
                ssq = datasetRecordStorage.select(
                    *other_collections,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                subsubqueries.append(ssq.combine())
            if calibration_collections:
                ssq = datasetRecordStorage.select(
                    *calibration_collections,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                subsubqueries.append(ssq.combine())
        if not subsubqueries:
            if rejections:
                self._doomed_by.extend(rejections)
            else:
                self._doomed_by.append(f"No collections to search matching expression {collections}.")
            # Make a single subquery with no collections that never yields
            # results; this should never get executed, but downstream code
            # still needs to access the SQLAlchemy column objects.
            ssq = datasetRecordStorage.select(
                dataId=SimpleQuery.Select,
                id=SimpleQuery.Select if isResult else None,
                run=SimpleQuery.Select if isResult else None,
                ingestDate=SimpleQuery.Select if isResult else None,
            )
            if findFirst:
                # Use a constant rank; the loop variable ``rank`` may never
                # have been bound if no collections matched at all, and its
                # value is irrelevant for a query that yields no results.
                ssq.columns.append(sqlalchemy.sql.literal(0).label("rank"))
            subsubqueries.append(ssq.combine())
        # Although one would expect that these subqueries could be combined
        # with UNION ALL instead of UNION because each subquery is already
        # distinct, it turns out that with many subqueries this causes
        # catastrophic performance problems with both SQLite and PostgreSQL.
        # Using UNION may require more table scans, but it yields a much
        # simpler query plan given our table structures.  See DM-31429.
        subquery = sqlalchemy.sql.union(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if findFirst:
                # Rewrite the subquery (currently a UNION over
                # per-collection subsubqueries) to select the rows with the
                # lowest rank per data ID.  The block below will set subquery
                # to something like this:
                #
                # WITH {dst}_search AS (
                #     SELECT {data-id-cols}, id, run_id, 1 AS rank
                #         FROM <collection1>
                #     UNION
                #     SELECT {data-id-cols}, id, run_id, 2 AS rank
                #         FROM <collection2>
                #     UNION
                #     ...
                # )
                # SELECT
                #     {dst}_window.{data-id-cols},
                #     {dst}_window.id,
                #     {dst}_window.run_id
                # FROM (
                #     SELECT
                #         {dst}_search.{data-id-cols},
                #         {dst}_search.id,
                #         {dst}_search.run_id,
                #         ROW_NUMBER() OVER (
                #             PARTITION BY {dst}_search.{data-id-cols}
                #             ORDER BY rank
                #         ) AS rownum
                #     FROM {dst}_search
                # ) {dst}_window
                # WHERE
                #     {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName),
                    search.columns["ingest_date"].label("ingest_date"),
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number()
                    .over(partition_by=windowDataIdCols, order_by=search.columns["rank"])
                    .label("rownum")
                )
                window = (
                    sqlalchemy.sql.select(*windowSelectCols)
                    .select_from(search)
                    .alias(f"{datasetType.name}_window")
                )
                subquery = (
                    sqlalchemy.sql.select(*[window.columns[name].label(name) for name in baseColumnNames])
                    .select_from(window)
                    .where(window.columns["rownum"] == 1)
                    .alias(datasetType.name)
                )
            else:
                subquery = subquery.alias(datasetType.name)
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
                ingestDate=subquery.columns["ingest_date"],
            )
        else:
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return not self._doomed_by

    def joinTable(
        self,
        table: sqlalchemy.sql.FromClause,
        dimensions: NamedValueAbstractSet[Dimension],
        *,
        datasets: Optional[DatasetQueryColumns] = None,
    ) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables
        whose records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in
            the query.  The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query
            results.
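
        Examples
        --------
        A hedged sketch; ``myTable`` is a hypothetical table (or subquery)
        whose columns include one per dimension name in ``dimensions``::

            builder.joinTable(myTable, dimensions)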
369 """
370 unexpectedDimensions = NamedValueSet(dimensions - self.summary.mustHaveKeysJoined.dimensions)
371 unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
372 if unexpectedDimensions:
373 raise NotImplementedError(
374 f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
375 f"were not provided originally to the QuerySummary object passed at construction."
376 )
377 joinOn = self.startJoin(table, dimensions, dimensions.names)
378 self.finishJoin(table, joinOn)
379 if datasets is not None:
380 assert (
381 self._columns.datasets is None
382 ), "At most one result dataset type can be returned by a query."
383 self._columns.datasets = datasets
385 def startJoin(
386 self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension], columnNames: Iterable[str]
387 ) -> List[sqlalchemy.sql.ColumnElement]:
388 """Begin a join on dimensions.
390 Must be followed by call to `finishJoin`.
392 Parameters
393 ----------
394 table : `sqlalchemy.sql.FromClause`
395 SQLAlchemy object representing the logical table (which may be a
396 join or subquery expression) to be joined.
397 dimensions : iterable of `Dimension`
398 The dimensions that relate this table to others that may be in the
399 query. The table must have columns with the names of the
400 dimensions.
401 columnNames : iterable of `str`
402 Names of the columns that correspond to dimension key values; must
403 be `zip` iterable with ``dimensions``.
405 Returns
406 -------
407 joinOn : `list` of `sqlalchemy.sql.ColumnElement`
408 Sequence of boolean expressions that should be combined with AND
409 to form (part of) the ON expression for this JOIN.
410 """
411 joinOn = []
412 for dimension, columnName in zip(dimensions, columnNames):
413 columnInTable = table.columns[columnName]
414 columnsInQuery = self._columns.keys.setdefault(dimension, [])
415 for columnInQuery in columnsInQuery:
416 joinOn.append(columnInQuery == columnInTable)
417 columnsInQuery.append(columnInTable)
418 return joinOn
420 def finishJoin(
421 self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
422 ) -> None:
423 """Complete a join on dimensions.
425 Must be preceded by call to `startJoin`.
427 Parameters
428 ----------
429 table : `sqlalchemy.sql.FromClause`
430 SQLAlchemy object representing the logical table (which may be a
431 join or subquery expression) to be joined. Must be the same object
432 passed to `startJoin`.
433 joinOn : `list` of `sqlalchemy.sql.ColumnElement`
434 Sequence of boolean expressions that should be combined with AND
435 to form (part of) the ON expression for this JOIN. Should include
436 at least the elements of the list returned by `startJoin`.
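
        Examples
        --------
        A hedged sketch of the intended pairing (mirroring what `joinTable`
        does internally); ``extraCondition`` is a hypothetical additional
        constraint::

            joinOn = builder.startJoin(table, dimensions, dimensions.names)
            joinOn.append(extraCondition)
            builder.finishJoin(table, joinOn)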
437 """
438 onclause: Optional[sqlalchemy.sql.ColumnElement]
439 if len(joinOn) == 0:
440 onclause = None
441 elif len(joinOn) == 1:
442 onclause = joinOn[0]
443 else:
444 onclause = sqlalchemy.sql.and_(*joinOn)
445 self._simpleQuery.join(table, onclause=onclause)
447 def _joinMissingDimensionElements(self) -> None:
448 """Join all dimension element tables that were identified as necessary
449 by `QuerySummary` and have not yet been joined.
451 For internal use by `QueryBuilder` only; will be called (and should
452 only by called) by `finish`.
453 """
454 # Join all DimensionElement tables that we need for spatial/temporal
455 # joins/filters or a nontrivial WHERE expression.
456 # We iterate over these in *reverse* topological order to minimize the
457 # number of tables joined. For example, the "visit" table provides
458 # the primary key value for the "instrument" table it depends on, so we
459 # don't need to join "instrument" as well unless we had a nontrivial
460 # expression on it (and hence included it already above).
461 for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
462 self.joinDimensionElement(element)
463 # Join in any requested Dimension tables that don't already have their
464 # primary keys identified by the query.
465 for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
466 if dimension not in self._columns.keys:
467 self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting
        all joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        if self.summary.where.tree is not None:
            self._simpleQuery.where.append(
                convertExpressionToSql(
                    self.summary.where.tree,
                    self.summary.universe,
                    columns=self._columns,
                    elements=self._elements,
                    bind=self.summary.where.bind,
                    TimespanReprClass=self._managers.TimespanReprClass,
                )
            )
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a
                # chance of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a
                # skypix dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
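                    # ``envelope`` yields half-open ``(begin, end)`` pixel
                    # index ranges; e.g. a range like ``(1000, 1004)`` would
                    # expand to IDs 1000-1003 (values illustrative only).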
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a data ID with a timespan, and there are one or
        # more timespans in the query that aren't given, add a WHERE
        # expression for each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(
                    intervalInQuery.overlaps(self._managers.TimespanReprClass.fromLiteral(givenInterval))
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False`
            should only be passed if the caller can independently guarantee
            that all dimension relationships are already captured in
            non-dimension tables that have been manually included in the
            query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret
            result rows.
        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        if self._columns.isEmpty():
            return EmptyQuery(
                self.summary.requested.universe, managers=self._managers, doomed_by=self._doomed_by
            )
        return DirectQuery(
            graph=self.summary.requested,
            uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
            whereRegion=self.summary.where.dataId.region,
            simpleQuery=self._simpleQuery,
            columns=self._columns,
            order_by_columns=self._order_by_columns(),
            limit=self.summary.limit,
            managers=self._managers,
            doomed_by=self._doomed_by,
        )

    def _order_by_columns(self) -> Iterable[OrderByColumn]:
        """Generate columns to be used for the ORDER BY clause.

        Returns
        -------
        order_by_columns : `Iterable` [ `OrderByColumn` ]
            Sequence of columns to appear in the ORDER BY clause.
        """
        order_by_columns: List[OrderByColumn] = []
        if not self.summary.order_by:
            return order_by_columns

        for order_by_column in self.summary.order_by.order_by_columns:

            column: sqlalchemy.sql.ColumnElement
            field_spec: Optional[ddl.FieldSpec]
            dimension: Optional[Dimension] = None
            if order_by_column.column is None:
                # This is a dimension name; its key column is already in the
                # SELECT list, so we only need to add it to ORDER BY.
                assert isinstance(order_by_column.element, Dimension), "expecting full Dimension"
                column = self._columns.getKeyColumn(order_by_column.element)
                add_to_select = False
                field_spec = None
                dimension = order_by_column.element
            else:
                table = self._elements[order_by_column.element]

                if order_by_column.column in ("timespan.begin", "timespan.end"):
                    TimespanReprClass = self._managers.TimespanReprClass
                    timespan_repr = TimespanReprClass.fromSelectable(table)
                    if order_by_column.column == "timespan.begin":
                        column = timespan_repr.lower()
                        label = f"{order_by_column.element.name}_timespan_begin"
                    else:
                        column = timespan_repr.upper()
                        label = f"{order_by_column.element.name}_timespan_end"
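                    # For example, an element named "visit" (hypothetical)
                    # would yield the labels "visit_timespan_begin" and
                    # "visit_timespan_end".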
                    field_spec = ddl.FieldSpec(label, dtype=sqlalchemy.BigInteger, nullable=True)
                else:
                    column = table.columns[order_by_column.column]
                    # Make a unique label for it.
                    label = f"{order_by_column.element.name}_{order_by_column.column}"
                    field_spec = order_by_column.element.RecordClass.fields.facts[order_by_column.column]
                    field_spec = dataclasses.replace(field_spec, name=label)

                column = column.label(label)
                add_to_select = True

            order_by_columns.append(
                OrderByColumn(
                    column=column,
                    ordering=order_by_column.ordering,
                    add_to_select=add_to_select,
                    field_spec=field_spec,
                    dimension=dimension,
                )
            )

        return order_by_columns