Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 18%
68 statements
coverage.py v7.4.4, created at 2024-04-13 09:58 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

import itertools
from typing import Any

from lsst.daf.relation import ColumnExpression, ColumnTag, Diagnostics, Relation

from ..._column_categorization import ColumnCategorization
from ..._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag
from ..._dataset_type import DatasetType
from ..wildcards import CollectionWildcard
from ._query import Query
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._structs import QuerySummary


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    context : `QueryContext`, optional
        Object that manages relation engines and database-side state (e.g.
        temporary tables) for the query. Must have been created by
        ``backend.context()``, which is used if ``context`` is not provided.
    relation : `~lsst.daf.relation.Relation`, optional
        Initial relation for the query.
    """

    def __init__(
        self,
        summary: QuerySummary,
        backend: QueryBackend,
        context: QueryContext | None = None,
        relation: Relation | None = None,
    ):
        self.summary = summary
        self._backend = backend
        self._context = backend.context() if context is None else context
        self.relation = self._context.make_initial_relation(relation)
        self._governor_constraints = self._backend.resolve_governor_constraints(
            self.summary.dimensions, self.summary.where.governor_constraints
        )
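
    # A minimal sketch of the typical build flow, assuming a configured
    # backend and a pre-built ``QuerySummary`` (the variable names here are
    # illustrative, not taken from this module):
    #
    #     builder = QueryBuilder(summary, backend)
    #     if builder.joinDataset(dataset_type, collections):
    #         query = builder.finish()
    #
    # ``joinDataset`` returning `False` means the query is already known to
    # return no rows, so callers may skip execution entirely.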

    def joinDataset(
        self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
    ) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets. However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof. `...` can be used to return all collections. See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type. If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most once
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (default is `False`), only include the first match for
            each data ID, searching the given collections in order. Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order. Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed. If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query
            will return no results.
        """
        assert datasetType in self.summary.datasets
        collections = CollectionWildcard.from_expression(collections)
        if isResult and findFirst:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            datasetType,
            collections,
            governor_constraints=self._governor_constraints,
            rejections=rejections,
            allow_calibration_collections=(not findFirst and not self.summary.dimensions.temporal),
        )
        columns_requested = {"dataset_id", "run", "ingest_date"} if isResult else frozenset()
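        # Choose one of three relation factories: a deliberately-empty
        # ("doomed") relation when no collections survived resolution, a
        # find-first search when ordered per-data-ID results were requested,
        # or a plain query over all matching collections.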
        if not collection_records:
            relation = self._backend.make_doomed_dataset_relation(
                datasetType, columns_requested, rejections, self._context
            )
        elif isResult and findFirst:
            relation = self._backend.make_dataset_search_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        else:
            relation = self._backend.make_dataset_query_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        self.relation = self.relation.join(relation)
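        # Even after a successful join, relation diagnostics may prove the
        # result empty (e.g. every candidate collection was rejected), so
        # report that to the caller.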
        return not Diagnostics.run(relation).is_doomed

    def _addWhereClause(self, categorized_columns: ColumnCategorization) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.

        Parameters
        ----------
        categorized_columns : `ColumnCategorization`
            Struct that organizes the columns in ``self.relation`` by
            `ColumnTag` type.
        """
        # Append WHERE clause terms from predicates.
        if self.summary.where.expression_predicate is not None:
            self.relation = self.relation.with_rows_satisfying(
                self.summary.where.expression_predicate,
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
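        # Constrain dimension key columns to match the user-provided data ID,
        # restricted to the dimensions actually present in this query.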
        if self.summary.where.data_id:
            known_dimensions = self.summary.where.data_id.dimensions.intersection(self.summary.dimensions)
            known_data_id = self.summary.where.data_id.subset(known_dimensions)
            self.relation = self.relation.with_rows_satisfying(
                self._context.make_data_coordinate_predicate(known_data_id),
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
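        # Add spatial-region constraints: skypix dimensions can be tested
        # directly against the summary region in the preferred engine, while
        # dimension-element regions need an overlap predicate evaluated in
        # the iteration engine.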
        if self.summary.region is not None:
            for skypix_dimension in categorized_columns.filter_skypix(self._backend.universe):
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_skypix_predicate(
                        skypix_dimension,
                        self.summary.region,
                    ),
                    preferred_engine=self._context.preferred_engine,
                    require_preferred_engine=True,
                )
            for element in categorized_columns.filter_spatial_region_dimension_elements():
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_overlap_predicate(
                        ColumnExpression.reference(DimensionRecordColumnTag(element, "region")),
                        ColumnExpression.literal(self.summary.region),
                    ),
                    preferred_engine=self._context.iteration_engine,
                    transfer=True,
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish query construction, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with). `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret
            result rows.
        """
        if joinMissing:
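            # Join each pair of spatial dimension families through the
            # element chosen to represent that family in this query.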
            spatial_joins = []
            for family1, family2 in itertools.combinations(self.summary.dimensions.spatial, 2):
                spatial_joins.append(
                    (
                        family1.choose(self.summary.dimensions.elements.names, self.summary.universe).name,
                        family2.choose(self.summary.dimensions.elements.names, self.summary.universe).name,
                    )
                )
            self.relation = self._backend.make_dimension_relation(
                self.summary.dimensions,
                columns=self.summary.columns_required,
                context=self._context,
                spatial_joins=spatial_joins,
                initial_relation=self.relation,
                governor_constraints=self._governor_constraints,
            )
        categorized_columns = ColumnCategorization.from_iterable(self.relation.columns)
        self._addWhereClause(categorized_columns)
        query = Query(
            self.summary.dimensions,
            self._backend,
            context=self._context,
            relation=self.relation,
            governor_constraints=self._governor_constraints,
            is_deferred=True,
            has_record_columns=False,
        )
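        # Apply any ORDER BY and LIMIT/OFFSET requested in the summary; the
        # limit is stored as an (offset, count) tuple, so the slice stop is
        # offset + count when a count was given.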
        if self.summary.order_by is not None:
            query = query.sorted(self.summary.order_by.terms)
        if self.summary.limit is not None:
            query = query.sliced(
                start=self.summary.limit[0],
                stop=(
                    self.summary.limit[0] + self.summary.limit[1]
                    if self.summary.limit[1] is not None
                    else None
                ),
            )
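        # Project down to the requested dimension keys plus any dataset_id
        # and run columns that made it into the relation.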
        projected_columns: set[ColumnTag] = set()
        projected_columns.update(DimensionKeyColumnTag.generate(self.summary.requested.names))
        for dataset_type in self.summary.datasets:
            for dataset_column_name in ("dataset_id", "run"):
                tag = DatasetColumnTag(dataset_type.name, dataset_column_name)
                if tag in self.relation.columns:
                    projected_columns.add(tag)
        return query.projected(
            dimensions=self.summary.requested,
            columns=projected_columns,
            drop_postprocessing=False,
            unique=False,
        )