# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

from typing import Any, cast

from lsst.daf.relation import ColumnExpression, ColumnTag, Diagnostics, Predicate, Relation

from ...core import (
    ColumnCategorization,
    DatasetColumnTag,
    DatasetType,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
)
from ..wildcards import CollectionWildcard
from ._query import Query
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._structs import QuerySummary


class QueryBuilder:
44 """A builder for potentially complex queries that join tables based
45 on dimension relationships.
47 Parameters
48 ----------
49 summary : `QuerySummary`
50 Struct organizing the dimensions involved in the query.
51 backend : `QueryBackend`
52 Backend object that represents the `Registry` implementation.
53 context : `QueryContext`, optional
54 Object that manages relation engines and database-side state (e.g.
55 temporary tables) for the query. Must have been created by
56 ``backend.context()``, which is used if ``context`` is not provided.
57 relation : `~lsst.daf.relation.Relation`, optional
58 Initial relation for the query.
59 """

    def __init__(
        self,
        summary: QuerySummary,
        backend: QueryBackend,
        context: QueryContext | None = None,
        relation: Relation | None = None,
    ):
        self.summary = summary
        self._backend = backend
        self._context = backend.context() if context is None else context
        self.relation = self._context.make_initial_relation(relation)
        self._governor_constraints = self._backend.resolve_governor_constraints(
            self.summary.dimensions, self.summary.where.governor_constraints, self._context
        )
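
    # A minimal construction sketch (hypothetical ``summary`` and
    # ``backend`` values; in real code these come from `Registry`
    # internals rather than being built by hand):
    #
    #     builder = QueryBuilder(summary=summary, backend=backend)
    #     # The builder starts from the context's initial relation; dataset
    #     # searches can now be joined in before calling `finish`.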

    def joinDataset(
        self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
    ) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain
        the query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or
            iterable thereof.  `...` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this
            dataset type.  If `False`, the existence of datasets of this
            type is used only to constrain the data IDs returned by the
            query.  `joinDataset` may be called with ``isResult=True`` at
            most once on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (default `False`), only include the first match for
            each data ID, searching the given collections in order.
            Requires that all entries in ``collections`` be regular
            strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the
            query should proceed.  If `False`, we were able to determine
            (from the combination of ``datasetType`` and ``collections``)
            that there would be no results joined in from this dataset, and
            hence (due to the inner join that would normally be present),
            the full query will return no results.
        """
        assert datasetType in self.summary.datasets
        collections = CollectionWildcard.from_expression(collections)
        if isResult and findFirst:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            datasetType,
            collections,
            governor_constraints=self._governor_constraints,
            rejections=rejections,
            allow_calibration_collections=(
                not findFirst and not (self.summary.temporal or self.summary.dimensions.temporal)
            ),
        )
        columns_requested = {"dataset_id", "run", "ingest_date"} if isResult else frozenset()
        if not collection_records:
            relation = self._backend.make_doomed_dataset_relation(
                datasetType, columns_requested, rejections, self._context
            )
        elif isResult and findFirst:
            relation = self._backend.make_dataset_search_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        else:
            relation = self._backend.make_dataset_query_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        self.relation = self.relation.join(relation)
        return not Diagnostics.run(relation).is_doomed
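
    # A hedged usage sketch for `joinDataset` (``bias_type`` and the
    # collection name are hypothetical; the dataset type's dimensions must
    # already be in ``summary.requested``):
    #
    #     if not builder.joinDataset(bias_type, collections=["calib/v1"]):
    #         # The search is doomed: no usable collections remain, so the
    #         # inner join guarantees the full query returns no rows.
    #         ...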

    def _addWhereClause(self, categorized_columns: ColumnCategorization) -> None:
        """Add a WHERE clause to the query under construction, connecting
        all joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.

        Parameters
        ----------
        categorized_columns : `ColumnCategorization`
            Struct that organizes the columns in ``self.relation`` by
            `ColumnTag` type.
        """
        # Append WHERE clause terms from predicates.
        predicate: Predicate = Predicate.literal(True)
        if self.summary.where.expression_predicate is not None:
            predicate = predicate.logical_and(self.summary.where.expression_predicate)
        if self.summary.where.data_id:
            known_dimensions = self.summary.where.data_id.graph.intersection(self.summary.dimensions)
            known_data_id = self.summary.where.data_id.subset(known_dimensions)
            predicate = predicate.logical_and(self._context.make_data_coordinate_predicate(known_data_id))
        if self.summary.where.region is not None:
            for skypix_dimension in categorized_columns.filter_skypix(self._backend.universe):
                if skypix_dimension not in self.summary.where.data_id.graph:
                    predicate = predicate.logical_and(
                        self._context.make_spatial_region_skypix_predicate(
                            skypix_dimension,
                            self.summary.where.region,
                        )
                    )
            for element in categorized_columns.filter_spatial_region_dimension_elements():
                if element not in self.summary.where.data_id.graph.names:
                    predicate = predicate.logical_and(
                        self._context.make_spatial_region_overlap_predicate(
                            ColumnExpression.reference(DimensionRecordColumnTag(element, "region")),
                            ColumnExpression.literal(self.summary.where.region),
                        )
                    )
        if self.summary.where.timespan is not None:
            for element in categorized_columns.filter_timespan_dimension_elements():
                if element not in self.summary.where.data_id.graph.names:
                    predicate = predicate.logical_and(
                        self._context.make_timespan_overlap_predicate(
                            DimensionRecordColumnTag(element, "timespan"), self.summary.where.timespan
                        )
                    )
        self.relation = self.relation.with_rows_satisfying(
            predicate, preferred_engine=self._context.preferred_engine, require_preferred_engine=True
        )
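
    # The conjunction pattern used above, in isolation: start from a
    # ``True`` literal and AND in each optional term.  A sketch using only
    # the `lsst.daf.relation.Predicate` calls already imported here
    # (``optional_terms`` is a hypothetical iterable of predicates):
    #
    #     predicate = Predicate.literal(True)
    #     for term in optional_terms:
    #         predicate = predicate.logical_and(term)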

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False`
            should only be passed if the caller can independently guarantee
            that all dimension relationships are already captured in
            non-dimension tables that have been manually included in the
            query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret
            result rows.
        """
        columns_required: set[ColumnTag] = set()
        if self.summary.where.expression_predicate is not None:
            columns_required.update(self.summary.where.expression_predicate.columns_required)
        if self.summary.order_by is not None:
            columns_required.update(self.summary.order_by.columns_required)
        columns_required.update(DimensionKeyColumnTag.generate(self.summary.requested.names))
        if joinMissing:
            self.relation = self._backend.make_dimension_relation(
                self.summary.dimensions,
                columns=columns_required,
                context=self._context,
                spatial_joins=(
                    [cast(tuple[str, str], tuple(self.summary.spatial.names))]
                    if len(self.summary.spatial) == 2
                    else []
                ),
                initial_relation=self.relation,
                governor_constraints=self._governor_constraints,
            )
        categorized_columns = ColumnCategorization.from_iterable(columns_required)
        self._addWhereClause(categorized_columns)
        query = Query(
            self.summary.dimensions,
            self._backend,
            context=self._context,
            relation=self.relation,
            governor_constraints=self._governor_constraints,
            is_deferred=True,
            has_record_columns=False,
        )
        if self.summary.order_by is not None:
            query = query.sorted(self.summary.order_by.terms)
        if self.summary.limit is not None:
            query = query.sliced(
                start=self.summary.limit[0],
                stop=(
                    self.summary.limit[0] + self.summary.limit[1]
                    if self.summary.limit[1] is not None
                    else None
                ),
            )
        projected_columns: set[ColumnTag] = set()
        projected_columns.update(DimensionKeyColumnTag.generate(self.summary.requested.names))
        for dataset_type in self.summary.datasets:
            for dataset_column_name in ("dataset_id", "run"):
                tag = DatasetColumnTag(dataset_type.name, dataset_column_name)
                if tag in self.relation.columns:
                    projected_columns.add(tag)
        return query.projected(
            dimensions=self.summary.requested,
            columns=projected_columns,
            drop_postprocessing=False,
            unique=False,
        )
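
    # An end-to-end sketch of the intended call sequence (hypothetical
    # ``summary``, ``backend``, and ``dataset_type`` values; the run
    # collection names are placeholders chosen so ``findFirst`` has a
    # well-defined search order):
    #
    #     builder = QueryBuilder(summary, backend)
    #     builder.joinDataset(dataset_type, collections=["runA", "runB"], findFirst=True)
    #     query = builder.finish()
    #     # ``query`` is a `Query` with sorting, slicing, and a final
    #     # projection to the requested dimensions already applied.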