Coverage for python/lsst/daf/butler/direct_query_driver/_query_plan.py: 56%
126 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-30 09:54 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = (
31 "QueryPlan",
32 "QueryJoinsPlan",
33 "QueryProjectionPlan",
34 "QueryFindFirstPlan",
35 "ResolvedDatasetSearch",
36)
38import dataclasses
39from collections.abc import Iterator
40from typing import Any
42from ..dimensions import DataIdValue, DimensionElement, DimensionGroup
43from ..queries import tree as qt
44from ..queries.visitors import ColumnExpressionVisitor, PredicateVisitFlags, SimplePredicateVisitor
45from ..registry.interfaces import CollectionRecord
@dataclasses.dataclass
class ResolvedDatasetSearch:
    """A struct describing a dataset search joined into a query, after
    resolving its collection search path.
    """

    name: str
    """Name of the dataset type."""

    dimensions: DimensionGroup
    """Dimensions of the dataset type."""

    collection_records: list[CollectionRecord] = dataclasses.field(default_factory=list)
    """Records of the collections to search for this dataset, in order, after
    removing collections inconsistent with the dataset type or the query's
    data ID constraint.
    """

    messages: list[str] = dataclasses.field(default_factory=list)
    """Diagnostic messages about collections that were filtered out of
    collection records.
    """

    is_calibration_search: bool = False
    """`True` if any of the collections to be searched is a
    `~CollectionType.CALIBRATION` collection, `False` otherwise.

    Since only calibration datasets can be present in
    `~CollectionType.CALIBRATION` collections, this also indicates that the
    dataset type is a calibration dataset.
    """
@dataclasses.dataclass
class QueryJoinsPlan:
    """A struct describing the "joins" section of a butler query.

    See `QueryPlan` and `QueryPlan.joins` for additional information.
    """

    predicate: qt.Predicate
    """Boolean expression to apply to rows."""

    columns: qt.ColumnSet
    """All columns whose tables need to be joined into the query.

    This is updated after construction to include all columns required by
    `predicate`.
    """

    materializations: dict[qt.MaterializationKey, DimensionGroup] = dataclasses.field(default_factory=dict)
    """Materializations to join into the query."""

    datasets: dict[str, ResolvedDatasetSearch] = dataclasses.field(default_factory=dict)
    """Dataset searches to join into the query."""

    data_coordinate_uploads: dict[qt.DataCoordinateUploadKey, DimensionGroup] = dataclasses.field(
        default_factory=dict
    )
    """Data coordinate uploads to join into the query."""

    constraint_data_id: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """A data ID that must be consistent with all result rows, extracted from
    `predicate` at construction.
    """

    messages: list[str] = dataclasses.field(default_factory=list)
    """Diagnostic messages that report reasons the query may not return any
    rows.
    """

    def __post_init__(self) -> None:
        self.predicate.gather_required_columns(self.columns)
        # Pull any equality constraints on governor dimensions out of the
        # predicate; these can later be used to trim the collections searched
        # for datasets.
        self.predicate.visit(_DataIdExtractionVisitor(self.constraint_data_id, self.messages))

    def iter_mandatory(self) -> Iterator[DimensionElement]:
        """Return an iterator over the dimension elements that must be joined
        into the query.

        These elements either provide "field" (non-key) columns or define
        relationships that result rows must be consistent with.  They do not
        necessarily include all dimension keys in `columns`, since each of
        those can typically be included in a query in multiple different ways.
        """
        for name in self.columns.dimensions.elements:
            element = self.columns.dimensions.universe[name]
            if self.columns.dimension_fields[name]:
                # Dimension record fields were requested for this element,
                # and its own table is the only place to get them.
                yield element
                continue
            if not element.defines_relationships:
                continue
            # Tables for elements that define one-to-many and many-to-many
            # relationships must be joined in, unless some other joined
            # table already provides the same relationship.  Data coordinate
            # uploads and dataset tables only have required dimensions, and
            # can hence only provide relationships involving those, while
            # materializations have all key columns for their dimensions.
            covered = (
                any(
                    element.minimal_group.names <= upload_dimensions.required
                    for upload_dimensions in self.data_coordinate_uploads.values()
                )
                or any(
                    element.minimal_group.names <= dataset_spec.dimensions.required
                    for dataset_spec in self.datasets.values()
                )
                or any(
                    element in materialization_dimensions.names
                    for materialization_dimensions in self.materializations.values()
                )
            )
            if not covered:
                yield element
@dataclasses.dataclass
class QueryProjectionPlan:
    """A struct describing the "projection" stage of a butler query.

    This struct evaluates to `True` in boolean contexts if either
    `needs_dimension_distinct` or `needs_dataset_distinct` are `True`.  In
    other cases the projection is effectively a no-op, because the
    "joins"-stage rows are already unique.

    See `QueryPlan` and `QueryPlan.projection` for additional information.
    """

    columns: qt.ColumnSet
    """The columns present in the query after the projection is applied.

    This is always a subset of `QueryJoinsPlan.columns`.
    """

    datasets: dict[str, ResolvedDatasetSearch]
    """Dataset searches to join into the query."""

    needs_dimension_distinct: bool = False
    """If `True`, the projection's dimensions do not include all dimensions in
    the "joins" stage, and hence a SELECT DISTINCT [ON] or GROUP BY must be
    used to make post-projection rows unique.
    """

    needs_dataset_distinct: bool = False
    """If `True`, the projection columns do not include collection-specific
    dataset fields that were present in the "joins" stage, and hence a SELECT
    DISTINCT [ON] or GROUP BY must be added to make post-projection rows
    unique.
    """

    find_first_dataset: str | None = None
    """If not `None`, this is a find-first query for this dataset.

    This is set even if the find-first search is trivial because there is only
    one resolved collection.
    """

    region_aggregates: list[DimensionElement] = dataclasses.field(default_factory=list)
    """Dimension elements with spatial regions that must be aggregated by the
    projection, since their dimension keys are being dropped.

    This can only be non-empty if `needs_dimension_distinct` is `True`.
    """

    # Defined after all fields (rather than between them, as before) so the
    # field declarations read as one uninterrupted list; dataclass field
    # order comes from the annotations alone, so this is behavior-neutral.
    def __bool__(self) -> bool:
        return self.needs_dimension_distinct or self.needs_dataset_distinct
@dataclasses.dataclass
class QueryFindFirstPlan:
    """A struct describing the "find-first" stage of a butler query.

    See `QueryPlan` and `QueryPlan.find_first` for additional information.
    """

    search: ResolvedDatasetSearch
    """Information about the dataset being searched for."""

    @property
    def dataset_type(self) -> str:
        """Name of the dataset type."""
        return self.search.name

    def __bool__(self) -> bool:
        # The stage only has work to do when more than one collection
        # remains to be searched.
        records = self.search.collection_records
        return len(records) > 1
@dataclasses.dataclass
class QueryPlan:
    """A struct that aggregates information about a complete butler query.

    Notes
    -----
    Butler queries are transformed into a combination of SQL and Python-side
    postprocessing in three stages, with each corresponding to an attribute
    of this class and a method of `DirectQueryDriver`:

    - In the `joins` stage (`~DirectQueryButler.apply_query_joins`), we define
      the main SQL FROM and WHERE clauses, by joining all tables needed to
      bring in any columns, or constrain the keys of its rows.

    - In the `projection` stage (`~DirectQueryButler.apply_query_projection`),
      we select only the columns needed for the query's result rows (including
      columns needed only for postprocessing and ORDER BY, as well as those
      needed by the objects returned to users).  If the result rows are not
      naturally unique given what went into the query in the "joins" stage,
      the projection involves a SELECT DISTINCT [ON] or GROUP BY to make them
      unique, and in a few rare cases uses aggregate functions with GROUP BY.

    - In the `find_first` stage (`~DirectQueryButler.apply_query_find_first`),
      we use a window function (PARTITION BY) subquery to find only the first
      dataset in the collection search path for each data ID.  This stage does
      nothing if there is no find-first dataset search, or if the search is
      trivial because there is only one collection.

    In `DirectQueryDriver.build_query`, a `QueryPlan` instance is constructed
    via `DirectQueryDriver.analyze_query`, which also returns an initial
    `QueryBuilder`.  After this point the plans are considered frozen, and the
    nested plan attributes are then passed to each of the corresponding
    `DirectQuery` along with the builder, which is mutated (and occasionally
    replaced) into the complete SQL/postprocessing form of the query.
    """

    joins: QueryJoinsPlan
    """Description of the "joins" stage of query construction."""

    projection: QueryProjectionPlan
    """Description of the "projection" stage of query construction."""

    find_first: QueryFindFirstPlan | None
    """Description of the "find_first" stage of query construction.

    This attribute is `None` if there is no find-first search at all, and
    `False` in boolean contexts if the search is trivial because there is only
    one collection after the collections have been resolved.
    """

    final_columns: qt.ColumnSet
    """The columns included in the SELECT clause of the complete SQL query
    that is actually executed.

    This is a subset of `QueryProjectionPlan.columns` that differs only in
    columns used by the `find_first` stage or an ORDER BY expression.

    Like all other `.queries.tree.ColumnSet` attributes, it does not include
    fields added directly to `QueryBuilder.special`, which may also be added
    to the SELECT clause.
    """
class _DataIdExtractionVisitor(
    SimplePredicateVisitor,
    ColumnExpressionVisitor[tuple[str, None] | tuple[None, Any] | tuple[None, None]],
):
    """A column-expression visitor that extracts equality constraints on
    dimensions that are not OR'd with anything else.

    Parameters
    ----------
    data_id : `dict`
        Dictionary to populate in place.
    messages : `list` [ `str` ]
        List of diagnostic messages to populate in place.
    """

    def __init__(self, data_id: dict[str, DataIdValue], messages: list[str]):
        self.data_id = data_id
        self.messages = messages

    def visit_comparison(
        self,
        a: qt.ColumnExpression,
        operator: qt.ComparisonOperator,
        b: qt.ColumnExpression,
        flags: PredicateVisitFlags,
    ) -> None:
        # A term OR'd with anything else does not constrain every result
        # row, so it cannot contribute to the extracted data ID.
        if flags & PredicateVisitFlags.HAS_OR_SIBLINGS:
            return None
        # Inside a logical NOT, only "!=" is usable: NOT(a != b) is a == b.
        if flags & PredicateVisitFlags.INVERTED:
            if operator == "!=":
                operator = "=="
            else:
                return None
        if operator != "==":
            return None
        # Visiting an operand yields (dimension-name, None) for a dimension
        # key reference, (None, value) for a literal, and (None, None) for
        # anything else; we need one of each, in either order.
        k_a, v_a = a.visit(self)
        k_b, v_b = b.visit(self)
        if k_a is not None and v_b is not None:
            key = k_a
            value = v_b
        elif k_b is not None and v_a is not None:
            key = k_b
            value = v_a
        else:
            return None
        # Record the constraint; a contradiction with an earlier one means
        # the query cannot return any rows, so emit a diagnostic message.
        if (old := self.data_id.setdefault(key, value)) != value:
            self.messages.append(f"'where' expression requires both {key}={value!r} and {key}={old!r}.")
        return None

    def visit_binary_expression(self, expression: qt.BinaryExpression) -> tuple[None, None]:
        return None, None

    def visit_unary_expression(self, expression: qt.UnaryExpression) -> tuple[None, None]:
        return None, None

    def visit_literal(self, expression: qt.ColumnLiteral) -> tuple[None, Any]:
        return None, expression.get_literal_value()

    def visit_dimension_key_reference(self, expression: qt.DimensionKeyReference) -> tuple[str, None]:
        return expression.dimension.name, None

    def visit_dimension_field_reference(self, expression: qt.DimensionFieldReference) -> tuple[None, None]:
        return None, None

    def visit_dataset_field_reference(self, expression: qt.DatasetFieldReference) -> tuple[None, None]:
        return None, None

    def visit_reversed(self, expression: qt.Reversed) -> tuple[None, None]:
        raise AssertionError("No Reversed expressions in predicates.")