Coverage for python/lsst/daf/butler/registry/queries/_builder.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("QueryBuilder",)
25from typing import AbstractSet, Any, Iterable, List, Optional
27import sqlalchemy.sql
29from ...core import (
30 DimensionElement,
31 SkyPixDimension,
32 Dimension,
33 DatasetType,
34 SimpleQuery,
35)
37from ...core.named import NamedKeyDict, NamedValueAbstractSet, NamedValueSet
39from .._collectionType import CollectionType
40from ._structs import QuerySummary, QueryColumns, DatasetQueryColumns, RegistryManagers
41from .expressions import convertExpressionToSql
42from ._query import DirectQuery, DirectQueryUniqueness, EmptyQuery, Query
43from ..wildcards import CollectionSearch, CollectionQuery
class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    managers : `RegistryManagers`
        A struct containing the registry manager instances used by the query
        system.
    """
    def __init__(self, summary: QuerySummary, managers: RegistryManagers) -> None:
        # Struct describing the dimensions, constraints, and datasets this
        # query involves.
        self.summary = summary
        # Accumulates the FROM, WHERE, and SELECT pieces as tables are
        # joined in.
        self._simpleQuery = SimpleQuery()
        # Maps each joined DimensionElement to the SQLAlchemy FROM clause
        # (table or subquery) that provides its records.
        self._elements: NamedKeyDict[DimensionElement, sqlalchemy.sql.FromClause] = NamedKeyDict()
        # Tracks the logical columns (dimension keys, regions, timespans,
        # dataset columns) available in the query so far.
        self._columns = QueryColumns()
        self._managers = managers

    def hasDimensionKey(self, dimension: Dimension) -> bool:
        """Return `True` if the given dimension's primary key column has
        been included in the query (possibly via a foreign key column on some
        other table).
        """
        return dimension in self._columns.keys

    def joinDimensionElement(self, element: DimensionElement) -> None:
        """Add the table for a `DimensionElement` to the query.

        This automatically joins the element table to all other tables in the
        query with which it is related, via both dimension keys and spatial
        and temporal relationships.

        External calls to this method should rarely be necessary; `finish` will
        automatically call it if the `DimensionElement` has been identified as
        one that must be included.

        Parameters
        ----------
        element : `DimensionElement`
            Element for which a table should be added. The element must be
            associated with a database table (see `DimensionElement.hasTable`).
        """
        assert element not in self._elements, "Element already included in query."
        storage = self._managers.dimensions[element]
        # Only pass region/timespan columns when this element participates in
        # the query's spatial/temporal relationships; the storage object uses
        # them (in addition to dimension keys) to construct the join.
        fromClause = storage.join(
            self,
            regions=self._columns.regions if element in self.summary.spatial else None,
            timespans=self._columns.timespans if element in self.summary.temporal else None,
        )
        self._elements[element] = fromClause

    def joinDataset(self, datasetType: DatasetType, collections: Any, *,
                    isResult: bool = True, findFirst: bool = False) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets. However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof. `...` can be used to return all collections. See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type. If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most one time
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order. Requires
            that all entries in ``collections`` be regular strings, so there is
            a clear search order. Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed. If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query will
            return no results.
        """
        assert datasetType.dimensions.issubset(self.summary.requested)
        # A find-first search needs an ordered collection expression; any
        # other search accepts the more general (unordered) query form.
        if isResult and findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        explicitCollections = frozenset(collections.explicitNames())
        # If we are searching all collections with no constraints, loop over
        # RUN collections only, because that will include all datasets.
        collectionTypes: AbstractSet[CollectionType]
        if collections == CollectionQuery():
            collectionTypes = {CollectionType.RUN}
        else:
            collectionTypes = CollectionType.all()
        datasetRecordStorage = self._managers.datasets.find(datasetType.name)
        if datasetRecordStorage is None:
            # Unrecognized dataset type means no results. It might be better
            # to raise here, but this is consistent with previous behavior,
            # which is expected by QuantumGraph generation code in pipe_base.
            return False
        # One SELECT per collection searched; these are combined with
        # UNION ALL below.
        subsubqueries = []
        runKeyName = self._managers.collections.getRunForeignKeyName()
        # Columns every per-collection subsubquery must produce, so the
        # UNION ALL (and the window rewrite below) line up.
        baseColumnNames = {"id", runKeyName, "ingest_date"} if isResult else set()
        baseColumnNames.update(datasetType.dimensions.required.names)
        for rank, collectionRecord in enumerate(collections.iter(self._managers.collections,
                                                                 collectionTypes=collectionTypes)):
            if collectionRecord.type is CollectionType.CALIBRATION:
                # If collection name was provided explicitly then say sorry,
                # otherwise collection is a part of chained one and we skip it.
                if datasetType.isCalibration() and collectionRecord.name in explicitCollections:
                    raise NotImplementedError(
                        f"Query for dataset type '{datasetType.name}' in CALIBRATION-type collection "
                        f"'{collectionRecord.name}' is not yet supported."
                    )
                else:
                    # We can never find a non-calibration dataset in a
                    # CALIBRATION collection.
                    continue
            ssq = datasetRecordStorage.select(collection=collectionRecord,
                                              dataId=SimpleQuery.Select,
                                              id=SimpleQuery.Select if isResult else None,
                                              run=SimpleQuery.Select if isResult else None,
                                              ingestDate=SimpleQuery.Select if isResult else None)
            if ssq is None:
                # No per-collection subquery could be constructed; skip this
                # collection.
                continue
            assert {c.name for c in ssq.columns} == baseColumnNames
            if findFirst:
                # Record this collection's position in the search order so
                # the window-function rewrite below can pick the first match
                # for each data ID.
                ssq.columns.append(sqlalchemy.sql.literal(rank).label("rank"))
            subsubqueries.append(ssq.combine())
        if not subsubqueries:
            # No collection could contribute any rows, so the overall query
            # (with its inner join) would be empty.
            return False
        subquery = sqlalchemy.sql.union_all(*subsubqueries)
        columns: Optional[DatasetQueryColumns] = None
        if isResult:
            if findFirst:
                # Rewrite the subquery (currently a UNION ALL over
                # per-collection subsubqueries) to select the rows with the
                # lowest rank per data ID. The block below will set subquery
                # to something like this:
                #
                #     WITH {dst}_search AS (
                #         SELECT {data-id-cols}, id, run_id, 1 AS rank
                #             FROM <collection1>
                #         UNION ALL
                #         SELECT {data-id-cols}, id, run_id, 2 AS rank
                #             FROM <collection2>
                #         UNION ALL
                #         ...
                #     )
                #     SELECT
                #         {dst}_window.{data-id-cols},
                #         {dst}_window.id,
                #         {dst}_window.run_id
                #     FROM (
                #         SELECT
                #             {dst}_search.{data-id-cols},
                #             {dst}_search.id,
                #             {dst}_search.run_id,
                #             ROW_NUMBER() OVER (
                #                 PARTITION BY {dst_search}.{data-id-cols}
                #                 ORDER BY rank
                #             ) AS rownum
                #     ) {dst}_window
                #     WHERE
                #         {dst}_window.rownum = 1;
                #
                search = subquery.cte(f"{datasetType.name}_search")
                windowDataIdCols = [
                    search.columns[name].label(name) for name in datasetType.dimensions.required.names
                ]
                windowSelectCols = [
                    search.columns["id"].label("id"),
                    search.columns[runKeyName].label(runKeyName),
                    search.columns["ingest_date"].label("ingest_date"),
                ]
                windowSelectCols += windowDataIdCols
                assert {c.name for c in windowSelectCols} == baseColumnNames
                # Number rows within each data ID partition by collection
                # search order; rownum == 1 is the first match.
                windowSelectCols.append(
                    sqlalchemy.sql.func.row_number().over(
                        partition_by=windowDataIdCols,
                        order_by=search.columns["rank"]
                    ).label("rownum")
                )
                window = sqlalchemy.sql.select(
                    windowSelectCols
                ).select_from(search).alias(
                    f"{datasetType.name}_window"
                )
                subquery = sqlalchemy.sql.select(
                    [window.columns[name].label(name) for name in baseColumnNames]
                ).select_from(
                    window
                ).where(
                    window.columns["rownum"] == 1
                ).alias(datasetType.name)
            else:
                subquery = subquery.alias(datasetType.name)
            # Remember which columns identify the result dataset so finish()
            # can expose them via the Query object.
            columns = DatasetQueryColumns(
                datasetType=datasetType,
                id=subquery.columns["id"],
                runKey=subquery.columns[runKeyName],
                ingestDate=subquery.columns["ingest_date"],
            )
        else:
            subquery = subquery.alias(datasetType.name)
        self.joinTable(subquery, datasetType.dimensions.required, datasets=columns)
        return True

    def joinTable(self, table: sqlalchemy.sql.FromClause, dimensions: NamedValueAbstractSet[Dimension], *,
                  datasets: Optional[DatasetQueryColumns] = None) -> None:
        """Join an arbitrary table to the query via dimension relationships.

        External calls to this method should only be necessary for tables whose
        records represent neither datasets nor dimension elements.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query. The table must have columns with the names of the
            dimensions.
        datasets : `DatasetQueryColumns`, optional
            Columns that identify a dataset that is part of the query results.
        """
        unexpectedDimensions = NamedValueSet(dimensions - self.summary.mustHaveKeysJoined.dimensions)
        # The common skypix dimension is always permitted, even if not
        # requested up front.
        unexpectedDimensions.discard(self.summary.universe.commonSkyPix)
        if unexpectedDimensions:
            raise NotImplementedError(
                f"QueryBuilder does not yet support joining in dimensions {unexpectedDimensions} that "
                f"were not provided originally to the QuerySummary object passed at construction."
            )
        # Table columns are named after the dimensions themselves (see
        # docstring), so the same iterable serves as the column names.
        joinOn = self.startJoin(table, dimensions, dimensions.names)
        self.finishJoin(table, joinOn)
        if datasets is not None:
            assert self._columns.datasets is None, \
                "At most one result dataset type can be returned by a query."
            self._columns.datasets = datasets

    def startJoin(self, table: sqlalchemy.sql.FromClause, dimensions: Iterable[Dimension],
                  columnNames: Iterable[str]
                  ) -> List[sqlalchemy.sql.ColumnElement]:
        """Begin a join on dimensions.

        Must be followed by call to `finishJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined.
        dimensions : iterable of `Dimension`
            The dimensions that relate this table to others that may be in the
            query. The table must have columns with the names of the
            dimensions.
        columnNames : iterable of `str`
            Names of the columns that correspond to dimension key values; must
            be `zip` iterable with ``dimensions``.

        Returns
        -------
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN.
        """
        joinOn = []
        for dimension, columnName in zip(dimensions, columnNames):
            columnInTable = table.columns[columnName]
            columnsInQuery = self._columns.keys.setdefault(dimension, [])
            # Equate the new column with every column already in the query
            # for this dimension, then register it for future joins.
            for columnInQuery in columnsInQuery:
                joinOn.append(columnInQuery == columnInTable)
            columnsInQuery.append(columnInTable)
        return joinOn

    def finishJoin(self, table: sqlalchemy.sql.FromClause, joinOn: List[sqlalchemy.sql.ColumnElement]
                   ) -> None:
        """Complete a join on dimensions.

        Must be preceded by call to `startJoin`.

        Parameters
        ----------
        table : `sqlalchemy.sql.FromClause`
            SQLAlchemy object representing the logical table (which may be a
            join or subquery expression) to be joined. Must be the same object
            passed to `startJoin`.
        joinOn : `list` of `sqlalchemy.sql.ColumnElement`
            Sequence of boolean expressions that should be combined with AND
            to form (part of) the ON expression for this JOIN. Should include
            at least the elements of the list returned by `startJoin`.
        """
        # With no terms at all the join is unconstrained (onclause=None);
        # a single term is used as-is; multiple terms are ANDed together.
        onclause: Optional[sqlalchemy.sql.ColumnElement]
        if len(joinOn) == 0:
            onclause = None
        elif len(joinOn) == 1:
            onclause = joinOn[0]
        else:
            onclause = sqlalchemy.sql.and_(*joinOn)
        self._simpleQuery.join(table, onclause=onclause)

    def _joinMissingDimensionElements(self) -> None:
        """Join all dimension element tables that were identified as necessary
        by `QuerySummary` and have not yet been joined.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Join all DimensionElement tables that we need for spatial/temporal
        # joins/filters or a nontrivial WHERE expression.
        # We iterate over these in *reverse* topological order to minimize the
        # number of tables joined. For example, the "visit" table provides
        # the primary key value for the "instrument" table it depends on, so we
        # don't need to join "instrument" as well unless we had a nontrivial
        # expression on it (and hence included it already above).
        for element in self.summary.universe.sorted(self.summary.mustHaveTableJoined, reverse=True):
            self.joinDimensionElement(element)
        # Join in any requested Dimension tables that don't already have their
        # primary keys identified by the query.
        for dimension in self.summary.universe.sorted(self.summary.mustHaveKeysJoined, reverse=True):
            if dimension not in self._columns.keys:
                self.joinDimensionElement(dimension)

    def _addWhereClause(self) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.
        """
        # Translate any user-provided expression tree into SQL.
        if self.summary.where.tree is not None:
            self._simpleQuery.where.append(
                convertExpressionToSql(
                    self.summary.where.tree,
                    self.summary.universe,
                    columns=self._columns,
                    elements=self._elements,
                    bind=self.summary.where.bind,
                    TimespanReprClass=self._managers.TimespanReprClass,
                )
            )
        for dimension, columnsInQuery in self._columns.keys.items():
            if dimension in self.summary.where.dataId.graph:
                givenKey = self.summary.where.dataId[dimension]
                # Add a WHERE term for each column that corresponds to each
                # key. This is redundant with the JOIN ON clauses that make
                # them equal to each other, but more constraints have a chance
                # of making things easier on the DB's query optimizer.
                for columnInQuery in columnsInQuery:
                    self._simpleQuery.where.append(columnInQuery == givenKey)
            else:
                # Dimension is not fully identified, but it might be a skypix
                # dimension that's constrained by a given region.
                if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
                    # We know the region now.
                    givenSkyPixIds: List[int] = []
                    for begin, end in dimension.pixelization.envelope(self.summary.where.region):
                        givenSkyPixIds.extend(range(begin, end))
                    for columnInQuery in columnsInQuery:
                        self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
        # If we are given a data ID with a timespan, and there are one or more
        # timespans in the query that aren't given, add a WHERE expression for
        # each of them.
        if self.summary.where.dataId.graph.temporal and self.summary.temporal:
            # Timespan is known now.
            givenInterval = self.summary.where.dataId.timespan
            assert givenInterval is not None
            for element, intervalInQuery in self._columns.timespans.items():
                assert element not in self.summary.where.dataId.graph.elements
                self._simpleQuery.where.append(
                    intervalInQuery.overlaps(self._managers.TimespanReprClass.fromLiteral(givenInterval))
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish query constructing, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with). `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret result
            rows.
        """
        if joinMissing:
            self._joinMissingDimensionElements()
        self._addWhereClause()
        # A query with no columns at all degenerates to an EmptyQuery that
        # never touches the database.
        if self._columns.isEmpty():
            return EmptyQuery(self.summary.requested.universe, managers=self._managers)
        return DirectQuery(graph=self.summary.requested,
                           uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
                           whereRegion=self.summary.where.dataId.region,
                           simpleQuery=self._simpleQuery,
                           columns=self._columns,
                           managers=self._managers)