Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 17%
66 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = ("QueryBuilder",)
31import itertools
32from typing import Any
34from lsst.daf.relation import ColumnExpression, ColumnTag, Diagnostics, Relation
36from ...core import (
37 ColumnCategorization,
38 DatasetColumnTag,
39 DatasetType,
40 DimensionKeyColumnTag,
41 DimensionRecordColumnTag,
42)
43from ..wildcards import CollectionWildcard
44from ._query import Query
45from ._query_backend import QueryBackend
46from ._query_context import QueryContext
47from ._structs import QuerySummary
class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    context : `QueryContext`, optional
        Object that manages relation engines and database-side state (e.g.
        temporary tables) for the query. Must have been created by
        ``backend.context()``, which is used if ``context`` is not provided.
    relation : `~lsst.daf.relation.Relation`, optional
        Initial relation for the query.
    """

    def __init__(
        self,
        summary: QuerySummary,
        backend: QueryBackend,
        context: QueryContext | None = None,
        relation: Relation | None = None,
    ):
        self.summary = summary
        self._backend = backend
        # When no context is given, ask the backend for a fresh one.
        self._context = backend.context() if context is None else context
        self.relation = self._context.make_initial_relation(relation)
        # Resolve governor-dimension constraints once; every subsequent join
        # and WHERE-clause method on this builder reuses them.
        self._governor_constraints = self._backend.resolve_governor_constraints(
            self.summary.dimensions, self.summary.where.governor_constraints, self._context
        )

    def joinDataset(
        self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
    ) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets.  All dimensions used
        to identify the dataset type must already have been included in
        `QuerySummary.requested` when the `QueryBuilder` was constructed.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof.  ``...`` can be used to select all collections.  See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the result
            columns of the query so that complete `DatasetRef` instances can
            be produced from the query results for this dataset type.  If
            `False`, dataset existence is used only to constrain the data IDs
            returned by the query.  `joinDataset` may be called with
            ``isResult=True`` at most one time on a particular `QueryBuilder`
            instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.  Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order.  Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            `True` if joining the dataset table was successful and the query
            should proceed.  `False` means we could determine (from the
            combination of ``datasetType`` and ``collections``) that no
            results would be joined in from this dataset, and hence (due to
            the inner join that would normally be present) the full query will
            return no results.
        """
        assert datasetType in self.summary.datasets
        collections = CollectionWildcard.from_expression(collections)
        if isResult and findFirst:
            # Find-first needs a well-defined collection search order.
            collections.require_ordered()
        reasons: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            datasetType,
            collections,
            governor_constraints=self._governor_constraints,
            rejections=reasons,
            allow_calibration_collections=(not findFirst and not self.summary.dimensions.temporal),
        )
        columns_requested = {"dataset_id", "run", "ingest_date"} if isResult else frozenset()
        if not collection_records:
            # No collection can provide this dataset type; join in a doomed
            # relation that carries the rejection messages as diagnostics.
            dataset_relation = self._backend.make_doomed_dataset_relation(
                datasetType, columns_requested, reasons, self._context
            )
        elif isResult and findFirst:
            dataset_relation = self._backend.make_dataset_search_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        else:
            dataset_relation = self._backend.make_dataset_query_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        self.relation = self.relation.join(dataset_relation)
        return not Diagnostics.run(dataset_relation).is_doomed

    def _addWhereClause(self, categorized_columns: ColumnCategorization) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.

        Parameters
        ----------
        categorized_columns : `ColumnCategorization`
            Struct that organizes the columns in ``self.relation`` by
            `ColumnTag` type.
        """
        # Apply the user-expression predicate, if there is one.
        expression_predicate = self.summary.where.expression_predicate
        if expression_predicate is not None:
            self.relation = self.relation.with_rows_satisfying(
                expression_predicate,
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        # Constrain on the data ID, restricted to dimensions this query knows.
        if self.summary.where.data_id:
            known_dimensions = self.summary.where.data_id.graph.intersection(self.summary.dimensions)
            known_data_id = self.summary.where.data_id.subset(known_dimensions)
            self.relation = self.relation.with_rows_satisfying(
                self._context.make_data_coordinate_predicate(known_data_id),
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        # Apply spatial-region constraints if a region is present.
        region = self.summary.region
        if region is not None:
            # Skypix columns can be filtered in the preferred engine.
            for skypix_dimension in categorized_columns.filter_skypix(self._backend.universe):
                skypix_predicate = self._context.make_spatial_region_skypix_predicate(
                    skypix_dimension,
                    region,
                )
                self.relation = self.relation.with_rows_satisfying(
                    skypix_predicate,
                    preferred_engine=self._context.preferred_engine,
                    require_preferred_engine=True,
                )
            # Full region-overlap tests must run in the iteration engine, so
            # allow a transfer there.
            for element in categorized_columns.filter_spatial_region_dimension_elements():
                overlap_predicate = self._context.make_spatial_region_overlap_predicate(
                    ColumnExpression.reference(DimensionRecordColumnTag(element, "region")),
                    ColumnExpression.literal(region),
                )
                self.relation = self.relation.with_rows_satisfying(
                    overlap_predicate,
                    preferred_engine=self._context.iteration_engine,
                    transfer=True,
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish query constructing, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret result
            rows.
        """
        if joinMissing:
            # Every pair of spatial families participating in the query needs
            # an explicit overlap join between their best elements.
            spatial_joins = [
                (
                    family_a.choose(self.summary.dimensions.elements).name,
                    family_b.choose(self.summary.dimensions.elements).name,
                )
                for family_a, family_b in itertools.combinations(self.summary.dimensions.spatial, 2)
            ]
            self.relation = self._backend.make_dimension_relation(
                self.summary.dimensions,
                columns=self.summary.columns_required,
                context=self._context,
                spatial_joins=spatial_joins,
                initial_relation=self.relation,
                governor_constraints=self._governor_constraints,
            )
        categorized_columns = ColumnCategorization.from_iterable(self.relation.columns)
        self._addWhereClause(categorized_columns)
        query = Query(
            self.summary.dimensions,
            self._backend,
            context=self._context,
            relation=self.relation,
            governor_constraints=self._governor_constraints,
            is_deferred=True,
            has_record_columns=False,
        )
        # Apply ORDER BY and LIMIT/OFFSET after the relation is complete.
        if self.summary.order_by is not None:
            query = query.sorted(self.summary.order_by.terms)
        if self.summary.limit is not None:
            start, count = self.summary.limit
            stop = None if count is None else start + count
            query = query.sliced(start=start, stop=stop)
        # Project down to the requested dimension keys plus any dataset ID and
        # run columns that are actually present in the relation.
        projected_columns: set[ColumnTag] = set(
            DimensionKeyColumnTag.generate(self.summary.requested.names)
        )
        for dataset_type in self.summary.datasets:
            projected_columns.update(
                tag
                for column_name in ("dataset_id", "run")
                if (tag := DatasetColumnTag(dataset_type.name, column_name)) in self.relation.columns
            )
        return query.projected(
            dimensions=self.summary.requested,
            columns=projected_columns,
            drop_postprocessing=False,
            unique=False,
        )