Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 17% of 66 statements (coverage.py v6.5.0, created at 2023-04-04 02:06 -0700)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

import itertools
from typing import Any

from lsst.daf.relation import ColumnExpression, ColumnTag, Diagnostics, Relation

from ...core import (
    ColumnCategorization,
    DatasetColumnTag,
    DatasetType,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
)
from ..wildcards import CollectionWildcard
from ._query import Query
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._structs import QuerySummary


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    context : `QueryContext`, optional
        Object that manages relation engines and database-side state (e.g.
        temporary tables) for the query. Must have been created by
        ``backend.context()``, which is used if ``context`` is not provided.
    relation : `~lsst.daf.relation.Relation`, optional
        Initial relation for the query.
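
    Examples
    --------
    A minimal sketch of the intended call pattern; ``summary`` and
    ``backend`` are assumed to be an already-constructed `QuerySummary`
    and `QueryBackend`, and ``flat`` an already-resolved `DatasetType`::

        builder = QueryBuilder(summary, backend)
        builder.joinDataset(flat, collections=..., isResult=True)
        query = builder.finish()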
    """

    def __init__(
        self,
        summary: QuerySummary,
        backend: QueryBackend,
        context: QueryContext | None = None,
        relation: Relation | None = None,
    ):
        self.summary = summary
        self._backend = backend
        self._context = backend.context() if context is None else context
        self.relation = self._context.make_initial_relation(relation)
        self._governor_constraints = self._backend.resolve_governor_constraints(
            self.summary.dimensions, self.summary.where.governor_constraints, self._context
        )

    def joinDataset(
        self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
    ) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or to constrain
        the query results based on the existence of datasets. However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof. `...` can be used to search all collections. See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type. If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most once
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order. Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order. Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed. If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due
            to the inner join that would normally be present), the full query
            will return no results.
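
        Examples
        --------
        A minimal sketch, assuming ``builder`` is a `QueryBuilder` whose
        `QuerySummary` already covers the dimensions of a hypothetical
        ``flat`` `DatasetType`, searched in hypothetical collections::

            has_rows = builder.joinDataset(flat, collections=["run/a", "run/b"])
            if not has_rows:
                ...  # the full query is doomed to return no results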
        """
        assert datasetType in self.summary.datasets
        collections = CollectionWildcard.from_expression(collections)
        if isResult and findFirst:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            datasetType,
            collections,
            governor_constraints=self._governor_constraints,
            rejections=rejections,
            allow_calibration_collections=(not findFirst and not self.summary.dimensions.temporal),
        )
        columns_requested = {"dataset_id", "run", "ingest_date"} if isResult else frozenset()
        if not collection_records:
            # No collection could hold a matching dataset; build a relation
            # known to yield no rows, carrying the rejection messages so
            # diagnostics can explain why.
            relation = self._backend.make_doomed_dataset_relation(
                datasetType, columns_requested, rejections, self._context
            )
        elif isResult and findFirst:
            # Search the collections in order, keeping only the first match
            # for each data ID.
            relation = self._backend.make_dataset_search_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        else:
            # Include matching datasets from all resolved collections.
            relation = self._backend.make_dataset_query_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        self.relation = self.relation.join(relation)
        return not Diagnostics.run(relation).is_doomed

    def _addWhereClause(self, categorized_columns: ColumnCategorization) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.

        Parameters
        ----------
        categorized_columns : `ColumnCategorization`
            Struct that organizes the columns in ``self.relation`` by
            `ColumnTag` type.
        """
        # Append WHERE clause terms from predicates.
        if self.summary.where.expression_predicate is not None:
            self.relation = self.relation.with_rows_satisfying(
                self.summary.where.expression_predicate,
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        if self.summary.where.data_id:
            # Constrain dimension key columns to the values in the data ID,
            # restricted to the dimensions this query actually includes.
            known_dimensions = self.summary.where.data_id.graph.intersection(self.summary.dimensions)
            known_data_id = self.summary.where.data_id.subset(known_dimensions)
            self.relation = self.relation.with_rows_satisfying(
                self._context.make_data_coordinate_predicate(known_data_id),
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        if self.summary.region is not None:
            # Filter skypix index columns to pixels that overlap the query
            # region; this can run in the preferred engine.
            for skypix_dimension in categorized_columns.filter_skypix(self._backend.universe):
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_skypix_predicate(
                        skypix_dimension,
                        self.summary.region,
                    ),
                    preferred_engine=self._context.preferred_engine,
                    require_preferred_engine=True,
                )
            # Precise region-overlap tests run in the iteration engine, so
            # rows are transferred there.
            for element in categorized_columns.filter_spatial_region_dimension_elements():
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_overlap_predicate(
                        ColumnExpression.reference(DimensionRecordColumnTag(element, "region")),
                        ColumnExpression.literal(self.summary.region),
                    ),
                    preferred_engine=self._context.iteration_engine,
                    transfer=True,
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with). `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret
            result rows.
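
        Examples
        --------
        A minimal sketch of typical usage; the builder setup and any
        dataset joins are assumed to have happened already::

            query = builder.finish()
            # ``query`` is deferred: nothing is executed until its rows
            # are actually requested by higher-level code.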
        """
        if joinMissing:
            # Join each pair of spatial dimension families, letting each
            # family choose its most appropriate element for the join.
            spatial_joins = []
            for family1, family2 in itertools.combinations(self.summary.dimensions.spatial, 2):
                spatial_joins.append(
                    (
                        family1.choose(self.summary.dimensions.elements).name,
                        family2.choose(self.summary.dimensions.elements).name,
                    )
                )
            self.relation = self._backend.make_dimension_relation(
                self.summary.dimensions,
                columns=self.summary.columns_required,
                context=self._context,
                spatial_joins=spatial_joins,
                initial_relation=self.relation,
                governor_constraints=self._governor_constraints,
            )
        categorized_columns = ColumnCategorization.from_iterable(self.relation.columns)
        self._addWhereClause(categorized_columns)
        query = Query(
            self.summary.dimensions,
            self._backend,
            context=self._context,
            relation=self.relation,
            governor_constraints=self._governor_constraints,
            is_deferred=True,
            has_record_columns=False,
        )
        if self.summary.order_by is not None:
            query = query.sorted(self.summary.order_by.terms)
        if self.summary.limit is not None:
            query = query.sliced(
                start=self.summary.limit[0],
                stop=self.summary.limit[0] + self.summary.limit[1]
                if self.summary.limit[1] is not None
                else None,
            )
        # Project down to the requested dimension keys plus any dataset ID
        # and run columns present in the relation.
        projected_columns: set[ColumnTag] = set()
        projected_columns.update(DimensionKeyColumnTag.generate(self.summary.requested.names))
        for dataset_type in self.summary.datasets:
            for dataset_column_name in ("dataset_id", "run"):
                tag = DatasetColumnTag(dataset_type.name, dataset_column_name)
                if tag in self.relation.columns:
                    projected_columns.add(tag)
        return query.projected(
            dimensions=self.summary.requested,
            columns=projected_columns,
            drop_postprocessing=False,
            unique=False,
        )