Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 9%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

from typing import AbstractSet, Any, Iterable, List, Optional

import sqlalchemy.sql

from ...core import (
    DimensionElement,
    SkyPixDimension,
    Dimension,
    DatasetType,
    SimpleQuery,
)

from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet

from .._collectionType import CollectionType
from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers
from .expressions import convertExpressionToSql
from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, Query
from ..wildcards import CollectionSearch, CollectionQuery


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    doomed_by : `Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it is
        executed.  Queries with a non-empty list will never be executed.
    """
    def __init__(self, summary: QuerySummary, managers: RegistryManagers, doomed_by: Iterable[str] = ()):
        self.summary = summary
        self._simpleQuery = SimpleQuery()
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        self._columns = QueryColumns()
        self._managers = managers
        self._doomed_by = list(doomed_by)

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on
        some other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in
        the query with which it is related, via both dimension keys and
        spatial and temporal relationships.

        External calls to this method should rarely be necessary; `finish`
        will automatically call it if the `DimensionElement` has been
        identified as one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added.  The element must be
            associated with a database table (see
            `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._managers.dimensions[element]
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause
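
    # Example (illustrative sketch, not executed here; it assumes the
    # dimension universe defines a "visit" element that participates in the
    # query, which is not guaranteed by this module):
    #
    #     builder.joinDimensionElement(builder.summary.universe["visit"])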

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, findFirst: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or to constrain
        the query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or
            iterable thereof.  `...` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is
            used only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most once
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.
            Requires that all entries in ``collections`` be regular strings,
            so there is a clear search order.  Ignored if ``isResult`` is
            `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the
            query should proceed.  If `False`, we were able to determine
            (from the combination of ``datasetType`` and ``collections``)
            that there would be no results joined in from this dataset, and
            hence (due to the inner join that would normally be present) the
            full query will return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        if isResult and findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        explicitCollections = frozenset(collections.explicitNames())
        # If we are searching all collections with no constraints, loop over
        # RUN collections only, because that will include all datasets.
        collectionTypes: AbstractSet[CollectionType]
        if collections == CollectionQuery():
            collectionTypes = {CollectionType.RUN}
        else:
            collectionTypes = CollectionType.all()
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results.  It might be
            # better to raise here, but this is consistent with previous
            # behavior, which is expected by QuantumGraph generation code in
            # pipe_base.
            self._doomed_by.append(
                f"Dataset type {datasetType.name!r} is not registered, so no instances of it can exist in "
                "any collection."
            )
            return False
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        baseColumnNames = {"id", runKeyName, "ingest_date"} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        if not findFirst:
            calibration_collections = []
            other_collections = []
        rejections: List[str] = []
        for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
                                                                 collectionTypes=collectionTypes)):
            # Only include collections that (according to the collection
            # summaries) might have datasets of this type and governor
            # dimensions consistent with the query's WHERE clause.
            collection_summary = self._managers.datasets.getCollectionSummary(collectionRecord)
            if not collection_summary.is_compatible_with(
                datasetType,
                self.summary.where.restriction,
                rejections=rejections,
                name=collectionRecord.name,
            ):
                continue
            if collectionRecord.type is CollectionType.CALIBRATION:
                # If the collection was named explicitly, raise for query
                # types we cannot support; otherwise it is a member of a
                # chained collection, and we just skip it.
                if datasetType.isCalibration() and collectionRecord.name in explicitCollections:
                    if self.summary.temporal or self.summary.mustHaveKeysJoined.temporal:
                        raise NotImplementedError(
                            f"Temporal query for dataset type '{datasetType.name}' in CALIBRATION-type "
                            f"collection '{collectionRecord.name}' is not yet supported."
                        )
                    elif findFirst:
                        raise NotImplementedError(
                            f"Find-first query for dataset type '{datasetType.name}' in CALIBRATION-type "
                            f"collection '{collectionRecord.name}' is not yet supported."
                        )
                    else:
                        calibration_collections.append(collectionRecord)
                else:
                    # We can never find a non-calibration dataset in a
                    # CALIBRATION collection.
                    rejections.append(
                        f"Not searching for non-calibration dataset {datasetType.name!r} "
                        f"in CALIBRATION collection {collectionRecord.name!r}."
                    )
                    continue
            elif findFirst:
                # If findFirst=True, each collection gets its own subquery
                # so we can add a literal rank for it.
                ssq = datasetRecordStorage.select(
                    collectionRecord,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                assert {c.name for c in ssq.columns} == baseColumnNames
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
                subsubqueries.append(ssq.combine())
            else:
                # If findFirst=False, we have one subquery for all
                # CALIBRATION collections and one subquery for all other
                # collections; we'll assemble those later, after grouping by
                # collection type.
                other_collections.append(collectionRecord)
        if not findFirst:
            if other_collections:
                ssq = datasetRecordStorage.select(
                    *other_collections,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                subsubqueries.append(ssq.combine())
            if calibration_collections:
                ssq = datasetRecordStorage.select(
                    *calibration_collections,
                    dataId=SimpleQuery.Select,
                    id=SimpleQuery.Select if isResult else None,
                    run=SimpleQuery.Select if isResult else None,
                    ingestDate=SimpleQuery.Select if isResult else None,
                )
                subsubqueries.append(ssq.combine())
        if not subsubqueries:
            if rejections:
                self._doomed_by.extend(rejections)
            else:
                self._doomed_by.append(f"No collections to search matching expression {collections}.")
            # Make a single subquery with no collections that never yields
            # results; this should never get executed, but downstream code
            # still needs to access the SQLAlchemy column objects.
            ssq = datasetRecordStorage.select(
                dataId=SimpleQuery.Select,
                id=SimpleQuery.Select if isResult else None,
                run=SimpleQuery.Select if isResult else None,
                ingestDate=SimpleQuery.Select if isResult else None,
            )
            if findFirst:
                # The rank value here is arbitrary: this subquery can never
                # yield rows, and ``rank`` from the loop above may not even
                # be bound if no collections matched at all.
                ssq.columns.append(sqlalchemy.sql.literal(0).label("rank"))
            subsubqueries.append(ssq.combine())
        # Although one would expect that these subqueries could be combined
        # with UNION ALL instead of UNION because each subquery is already
        # distinct, it turns out that with many subqueries this causes
        # catastrophic performance problems in both SQLite and PostgreSQL.
        # UNION may require more table scans, but it yields a much simpler
        # query plan given our table structures.  See DM-31429.
        subquery = sqlalchemy.sql.union(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if findFirst:
                # Rewrite the subquery (currently a UNION over
                # per-collection subsubqueries) to select the rows with the
                # lowest rank per data ID.  The block below will set subquery
                # to something like this:
                #
                # WITH {dst}_search AS (
                #     SELECT {data-id-cols}, id, run_id, 1 AS rank
                #         FROM <collection1>
                #     UNION
                #     SELECT {data-id-cols}, id, run_id, 2 AS rank
                #         FROM <collection2>
                #     UNION
                #     ...
                # )
                # SELECT
                #     {dst}_window.{data-id-cols},
                #     {dst}_window.id,
                #     {dst}_window.run_id
                # FROM (
                #     SELECT
                #         {dst}_search.{data-id-cols},
                #         {dst}_search.id,
                #         {dst}_search.run_id,
                #         ROW_NUMBER() OVER (
                #             PARTITION BY {dst}_search.{data-id-cols}
                #             ORDER BY rank
                #         ) AS rownum
                #     FROM {dst}_search
                # ) {dst}_window
                # WHERE
                #     {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName),
                    search.columns["ingest_date"].label("ingest_date"),
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number().over(
                        partition_by=windowDataIdCols,
                        order_by=search.columns["rank"]
                    ).label("rownum")
                )
                window = sqlalchemy.sql.select(
                    *windowSelectCols
                ).select_from(search).alias(
                    f"{datasetType.name}_window"
                )
                subquery = sqlalchemy.sql.select(
                    *[window.columns[name].label(name) for name in baseColumnNames]
                ).select_from(
                    window
                ).where(
                    window.columns["rownum"] == 1
                ).alias(datasetType.name)
            else:
                subquery = subquery.alias(datasetType.name)
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
                ingestDate=subquery.columns["ingest_date"],
            )
        else:
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return not self._doomed_by
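
    # Example (illustrative sketch; ``summary``, ``managers``, and
    # ``datasetType`` are assumed to be constructed elsewhere, and the
    # collection names are hypothetical):
    #
    #     builder = QueryBuilder(summary, managers)
    #     ok = builder.joinDataset(datasetType, ["run1", "run2"], findFirst=True)
    #     if not ok:
    #         ...  # Query is doomed: it can be built, but will yield no rows.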

    def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueAbstractSet[Dimension], *,
                  datasets: Optional[DatasetQueryColumns] = None) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables
        whose records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in
            the query.  The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query
            results.
        """
        unexpectedDimensions = NamedValueSet(dimensions - self.summary.mustHaveKeysJoined.dimensions)
        unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
        if unexpectedDimensions:
            raise NotImplementedError(
                f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
                f"were not provided originally to the QuerySummary object passed at construction."
            )
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)
        if datasets is not None:
            assert self._columns.datasets is None, \
                "At most one result dataset type can be returned by a query."
            self._columns.datasets = datasets

    def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension],
                  columnNames: Iterable[str]
                  ) -> List[sqlalchemy.sql.ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by a call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in
            the query.  The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values;
            must be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
                   ) -> None:
        """Complete a join on dimensions.

        Must be preceded by a call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.  Must be the same
            object passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.  Should
            include at least the elements of the list returned by
            `startJoin`.
        """
        onclause: Optional[sqlalchemy.sql.ColumnElement]
        if len(joinOn) == 0:
            onclause = None
        elif len(joinOn) == 1:
            onclause = joinOn[0]
        else:
            onclause = sqlalchemy.sql.and_(*joinOn)
        self._simpleQuery.join(table, onclause=onclause)
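
    # Example (illustrative sketch): the two-phase protocol lets callers add
    # extra ON terms between the two calls.  ``tbl``, ``dims``, and the
    # "is_primary" column are hypothetical:
    #
    #     joinOn = builder.startJoin(tbl, dims, [d.name for d in dims])
    #     joinOn.append(tbl.columns["is_primary"].is_(True))
    #     builder.finishJoin(tbl, joinOn)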

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as
        necessary by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize
        # the number of tables joined.  For example, the "visit" table
        # provides the primary key value for the "instrument" table it
        # depends on, so we don't need to join "instrument" as well unless
        # we have a nontrivial expression on it (and hence included it
        # already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have
        # their primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting
        all joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        if self.summary.where.tree is not None:
            self._simpleQuery.where.append(
                convertExpressionToSql(
                    self.summary.where.tree,
                    self.summary.universe,
                    columns=self._columns,
                    elements=self._elements,
                    bind=self.summary.where.bind,
                    TimespanReprClass=self._managers.TimespanReprClass,
                )
            )
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key.  This is redundant with the JOIN ON clauses that make
                # them equal to each other, but the extra constraints may
                # make things easier on the database's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a
                # skypix dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a data ID with a timespan, and there are one or
        # more timespans in the query that aren't given, add a WHERE
        # expression for each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(
                    intervalInQuery.overlaps(self._managers.TimespanReprClass.fromLiteral(givenInterval))
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False`
            should only be passed if the caller can independently guarantee
            that all dimension relationships are already captured in
            non-dimension tables that have been manually included in the
            query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret
            result rows.
        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        if self._columns.isEmpty():
            return EmptyQuery(self.summary.requested.universe, managers=self._managers,
                              doomed_by=self._doomed_by)
        return DirectQuery(graph=self.summary.requested,
                           uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
                           whereRegion=self.summary.where.dataId.region,
                           simpleQuery=self._simpleQuery,
                           columns=self._columns,
                           managers=self._managers,
                           doomed_by=self._doomed_by)
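

# End-to-end usage sketch (illustrative only; ``summary``, ``managers``, and
# ``datasetType`` are assumed to be built elsewhere, and `Query` execution is
# outside this module).  Per the docstring above, `...` means all collections:
#
#     builder = QueryBuilder(summary, managers)
#     builder.joinDataset(datasetType, collections=..., isResult=True)
#     query = builder.finish()  # joins missing element tables, adds WHERE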