Coverage for python / lsst / daf / butler / queries / _expression_strings.py: 17%
229 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-01 08:18 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-01 08:18 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30from collections.abc import Set
31from typing import Literal, NamedTuple, TypeAlias
32from uuid import UUID
34import astropy.time
36import lsst.sphgeom
38from .._exceptions import InvalidQueryError
39from .._timespan import Timespan
40from ..column_spec import ColumnType
41from ..dimensions import DimensionUniverse
42from ._identifiers import IdentifierContext, interpret_identifier
43from .expressions.categorize import ExpressionConstant, categorizeConstant
44from .expressions.parser import (
45 BoxNode,
46 CircleNode,
47 Node,
48 PointNode,
49 PolygonNode,
50 RangeLiteral,
51 RegionNode,
52 TreeVisitor,
53 parse_expression,
54)
55from .tree import (
56 BinaryExpression,
57 ColumnExpression,
58 ColumnReference,
59 ComparisonOperator,
60 LiteralValue,
61 Predicate,
62 UnaryExpression,
63 make_column_literal,
64)
# A bound value is either a single literal or a list, tuple, or set of
# literals (container-valued binds are turned into `_Sequence` by
# `_ConversionVisitor.visitIdentifier`).
BindValue = (
    LiteralValue
    | list[LiteralValue]
    | tuple[LiteralValue]
    | Set[LiteralValue]
)
def convert_expression_string_to_predicate(
    expression: str, *, context: IdentifierContext, universe: DimensionUniverse
) -> Predicate:
    """Translate a Butler query expression string into a `Predicate` that
    can be used in a QueryTree.

    Parameters
    ----------
    expression : `str`
        Butler expression query string, in the syntax the old query system
        used to specify filtering.
    context : `IdentifierContext`
        Contextual information used to work out what an identifier
        appearing in the expression refers to.
    universe : `DimensionUniverse`
        Dimension metadata for the Butler database being queried.

    Returns
    -------
    predicate : `Predicate`
        Predicate equivalent to the given filter, for use in `QueryTree`.

    Raises
    ------
    InvalidQueryError
        Raised if the expression string cannot be parsed.  Errors detected
        while converting the parsed tree are raised by the visitor.
    """
    try:
        parsed = parse_expression(expression)
    except Exception as exc:
        # Any parser failure is reported uniformly, with the parser's own
        # exception chained as the cause.
        raise InvalidQueryError(f"Failed to parse expression '{expression}'") from exc

    if parsed is not None:
        result = parsed.visit(_ConversionVisitor(context, universe))
        assert isinstance(result, Predicate), (
            "The grammar should guarantee that we get a predicate back at the top level."
        )
        return result

    # The parser produced no tree at all, so there is nothing to filter on.
    return Predicate.from_bool(True)
class _ColExpr(NamedTuple):
    """Wrapper around a `ColumnExpression` produced from part of the
    original expression string.
    """

    # ColumnExpression is a large discriminated union, and mypy struggled
    # with it inside the additional union layers of _VisitorResult.  This
    # thin wrapper mostly exists to keep typing and match() statements
    # manageable.

    value: ColumnExpression

    @property
    def column_type(self) -> ColumnType:
        # Delegate to the wrapped expression so that every _VisitorResult
        # variant exposes ``column_type``.
        return self.value.column_type
class _Null:
    """Marker for a literal 'null' appearing in the expression."""

    # Mirrors the ``column_type`` attribute of the other visitor result
    # types so that error messages can report it uniformly.
    column_type: Literal["null"] = "null"
class _RangeLiteral(NamedTuple):
    """Wrapper for a range expression node from the parse tree."""

    # The parser's range node, kept as-is; its start/stop/stride are read
    # when the enclosing IN clause is converted.
    value: RangeLiteral
    column_type: Literal["range"] = "range"
class _Sequence(NamedTuple):
    """Wrapper for a list of column expressions, produced when an identifier
    is bound to a list, tuple, or set of values.
    """

    value: list[ColumnExpression]
    column_type: Literal["sequence"] = "sequence"
# Everything a `_ConversionVisitor` method may return for a parse-tree node.
_VisitorResult: TypeAlias = (
    Predicate
    | _ColExpr
    | _Null
    | _RangeLiteral
    | _Sequence
)
144class _ConversionVisitor(TreeVisitor[_VisitorResult]):
145 def __init__(self, context: IdentifierContext, universe: DimensionUniverse):
146 super().__init__()
147 self.context = context
148 self.universe = universe
150 def visitBinaryOp(
151 self, operator: str, lhs: _VisitorResult, rhs: _VisitorResult, node: Node
152 ) -> _VisitorResult:
153 match (operator, lhs, rhs):
154 # Handle boolean operators.
155 case ["OR", Predicate() as lhs, Predicate() as rhs]:
156 return lhs.logical_or(rhs)
157 case ["AND", Predicate() as lhs, Predicate() as rhs]:
158 return lhs.logical_and(rhs)
160 # Handle comparison operators.
161 case [("=" | "!=" | "<" | ">" | "<=" | ">=" | "OVERLAPS"), _ColExpr() as lhs, _ColExpr() as rhs]:
162 return Predicate.compare(
163 a=lhs.value, b=rhs.value, operator=_convert_comparison_operator(operator)
164 )
166 # Allow equality comparisons with None/NULL. We don't have an 'IS'
167 # operator.
168 case ["=", _ColExpr() as lhs, _Null()]:
169 return Predicate.is_null(lhs.value)
170 case ["!=", _ColExpr() as lhs, _Null()]:
171 return Predicate.is_null(lhs.value).logical_not()
172 case ["=", _Null(), _ColExpr() as rhs]:
173 return Predicate.is_null(rhs.value)
174 case ["!=", _Null(), _ColExpr() as rhs]:
175 return Predicate.is_null(rhs.value).logical_not()
176 # Boolean columns can be null, but will have been converted to
177 # Predicate, so we need additional cases.
178 case ["=" | "!=", Predicate() as pred, _Null()] | ["=" | "!=", _Null(), Predicate() as pred]:
179 column_ref = _get_boolean_column_reference(pred)
180 if column_ref is not None:
181 match operator:
182 case "=":
183 return Predicate.is_null(column_ref)
184 case "!=":
185 return Predicate.is_null(column_ref).logical_not()
187 # Handle arithmetic operations
188 case [("+" | "-" | "*" | "/" | "%") as op, _ColExpr() as lhs, _ColExpr() as rhs]:
189 return _ColExpr(BinaryExpression(a=lhs.value, b=rhs.value, operator=op))
191 raise InvalidQueryError(
192 f"Invalid types {lhs.column_type}, {rhs.column_type} for binary operator {operator!r} "
193 f"in expression {node!s}."
194 )
196 def visitIsIn(
197 self, lhs: _VisitorResult, values: list[_VisitorResult], not_in: bool, node: Node
198 ) -> _VisitorResult:
199 assert isinstance(lhs, _ColExpr), "LHS of IN guaranteed to be scalar by parser."
200 predicates = [_convert_in_clause_to_predicate(lhs.value, rhs, node) for rhs in values]
201 result = Predicate.from_bool(False).logical_or(*predicates)
202 if not_in:
203 result = result.logical_not()
204 return result
206 def visitIdentifier(self, name: str, node: Node) -> _VisitorResult:
207 if name in self.context.bind:
208 value = self.context.bind[name]
209 # Lists of values do not have a direct representation in the new
210 # query system, so we have to handle them separately here.
211 if isinstance(value, list | tuple | Set):
212 literals: list[ColumnExpression] = [make_column_literal(item) for item in value]
213 types = set({item.column_type for item in literals})
214 if len(types) > 1:
215 raise InvalidQueryError(
216 f"Mismatched types in bind iterable: {value} has a mix of {types}."
217 )
218 return _Sequence(literals)
220 # The other constants are handled in interpret_identifier().
221 if categorizeConstant(name) == ExpressionConstant.NULL:
222 return _Null()
224 column_expression = interpret_identifier(self.context, name)
225 if column_expression.column_type == "bool":
226 # Expression-handling code (in this file and elsewhere) expects
227 # boolean-valued expressions to be represented as Predicate, not a
228 # ColumnExpression.
230 # We should only be getting direct references to a column, not a
231 # more complicated expression.
232 # (Anything more complicated should be a Predicate already.)
233 assert (
234 column_expression.expression_type == "dataset_field"
235 or column_expression.expression_type == "dimension_field"
236 or column_expression.expression_type == "dimension_key"
237 )
238 return Predicate.from_bool_expression(column_expression)
239 else:
240 return _ColExpr(column_expression)
242 def visitBind(self, name: str, node: Node) -> _VisitorResult:
243 if name not in self.context.bind:
244 raise InvalidQueryError(f"Name {name!r} is not in the bind map.")
245 # Logic in visitIdentifier handles binds.
246 return self.visitIdentifier(name, node)
248 def visitNumericLiteral(self, value: str, node: Node) -> _VisitorResult:
249 numeric: int | float
250 try:
251 numeric = int(value)
252 except ValueError:
253 # int() raises for float-like strings
254 numeric = float(value)
255 return _make_literal(numeric)
257 def visitParens(self, expression: _VisitorResult, node: Node) -> _VisitorResult:
258 return expression
260 def visitPointNode(self, ra: _VisitorResult, dec: _VisitorResult, node: PointNode) -> _VisitorResult:
261 ra_value = _get_float_literal_value(ra, node.ra, "POINT")
262 dec_value = _get_float_literal_value(dec, node.dec, "POINT")
264 lon_lat = lsst.sphgeom.LonLat.fromDegrees(ra_value, dec_value)
265 return _make_literal(lon_lat)
267 def visitCircleNode(
268 self, ra: _VisitorResult, dec: _VisitorResult, radius: _VisitorResult, node: CircleNode
269 ) -> _VisitorResult:
270 ra_value = _get_float_literal_value(ra, node.ra, "CIRCLE")
271 dec_value = _get_float_literal_value(dec, node.dec, "CIRCLE")
272 radius_value = _get_float_literal_value(radius, node.radius, "CIRCLE")
274 lon_lat = lsst.sphgeom.LonLat.fromDegrees(ra_value, dec_value)
275 open_angle = lsst.sphgeom.Angle.fromDegrees(radius_value * 2)
276 vec = lsst.sphgeom.UnitVector3d(lon_lat)
277 circle = lsst.sphgeom.Circle(vec, open_angle)
278 return _make_literal(circle)
280 def visitBoxNode(
281 self,
282 ra: _VisitorResult,
283 dec: _VisitorResult,
284 width: _VisitorResult,
285 height: _VisitorResult,
286 node: BoxNode,
287 ) -> _VisitorResult:
288 ra_value = _get_float_literal_value(ra, node.ra, "BOX")
289 dec_value = _get_float_literal_value(dec, node.dec, "BOX")
290 width_value = _get_float_literal_value(width, node.width, "BOX")
291 height_value = _get_float_literal_value(height, node.height, "BOX")
293 lon_lat = lsst.sphgeom.LonLat.fromDegrees(ra_value, dec_value)
294 half_width = lsst.sphgeom.Angle.fromDegrees(width_value / 2)
295 half_height = lsst.sphgeom.Angle.fromDegrees(height_value / 2)
296 box = lsst.sphgeom.Box(lon_lat, half_width, half_height)
297 return _make_literal(box)
299 def visitPolygonNode(
300 self, vertices: list[tuple[_VisitorResult, _VisitorResult]], node: PolygonNode
301 ) -> _VisitorResult:
302 sphgeom_vertices = []
303 for ra, dec in vertices:
304 ra_value = _get_float_literal_value(ra, node, "POLYGON")
305 dec_value = _get_float_literal_value(dec, node, "POLYGON")
306 lon_lat = lsst.sphgeom.LonLat.fromDegrees(ra_value, dec_value)
307 sphgeom_vertices.append(lsst.sphgeom.UnitVector3d(lon_lat))
309 polygon = lsst.sphgeom.ConvexPolygon(sphgeom_vertices)
310 return _make_literal(polygon)
312 def visitRegionNode(self, pos: _VisitorResult, node: RegionNode) -> _VisitorResult:
313 if isinstance(pos, _ColExpr):
314 expr = pos.value
315 if expr.expression_type == "string":
316 pos_str = expr.value
317 region = lsst.sphgeom.Region.from_ivoa_pos(pos_str)
318 return _make_literal(region)
320 raise InvalidQueryError(f"Expression '{node.pos}' in REGION() is not a literal string.")
322 def visitRangeLiteral(
323 self, start: int, stop: int, stride: int | None, node: RangeLiteral
324 ) -> _VisitorResult:
325 # Consumed by visitIsIn.
326 return _RangeLiteral(node)
328 def visitStringLiteral(self, value: str, node: Node) -> _VisitorResult:
329 return _make_literal(value)
331 def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> _VisitorResult:
332 return _make_literal(value)
334 def visitUuidLiteral(self, value: UUID, node: Node) -> _VisitorResult:
335 return _make_literal(value)
337 def visitTupleNode(self, items: tuple[_VisitorResult, ...], node: Node) -> _VisitorResult:
338 if len(items) != 2:
339 raise InvalidQueryError(f"Timespan tuple should have exactly two items (begin, end) in '{node}'")
341 begin = _to_timespan_bound(items[0], node)
342 end = _to_timespan_bound(items[1], node)
343 return _make_literal(Timespan(begin, end))
345 def visitUnaryOp(self, operator: str, operand: _VisitorResult, node: Node) -> _VisitorResult:
346 # Docstring inherited.
347 match (operator, operand):
348 case ["NOT", Predicate() as operand]:
349 return operand.logical_not()
350 case ["+", _ColExpr(column_type="int" | "float") as operand]:
351 # + is a no-op.
352 return operand
353 case ["-", _ColExpr(column_type="int" | "float", value=expr)]:
354 return _ColExpr(UnaryExpression(operand=expr, operator="-"))
355 raise InvalidQueryError(
356 f"Unary operator {operator!r} is not valid for operand of type {operand.column_type} in {node!s}."
357 )
359 def visitGlobNode(
360 self, expression: _VisitorResult, pattern: _VisitorResult, node: Node
361 ) -> _VisitorResult:
362 # Docstring inherited.
363 if isinstance(expression, _ColExpr) and expression.value.is_column_reference:
364 if expression.value.column_type != "string":
365 raise InvalidQueryError(f"glob() first argument must be a string column (in node {node})")
366 column_ref = expression.value
367 if not (isinstance(pattern, _ColExpr) and pattern.value.expression_type == "string"):
368 raise InvalidQueryError(f"glob() second argument must be a string (in node {node})")
370 return Predicate.compare(a=column_ref, b=pattern.value, operator="glob")
def _make_literal(value: LiteralValue) -> _ColExpr:
    """Wrap a literal Python value as a column-literal `_ColExpr`."""
    literal = make_column_literal(value)
    return _ColExpr(literal)
def _to_timespan_bound(value: _VisitorResult, node: Node) -> astropy.time.Time | None:
    """Convert one item of a timespan tuple to a `Timespan` bound.

    Returns the time held by a datetime literal, or `None` for a literal
    null.  Anything else raises `InvalidQueryError` mentioning ``node``.
    """
    if isinstance(value, _ColExpr):
        expr = value.value
        if expr.expression_type == "datetime":
            return expr.value
    elif isinstance(value, _Null):
        return None

    raise InvalidQueryError(
        f'Invalid type in timespan tuple "{node}" '
        '(Note that date/time strings must be preceded by "T" to be recognized).'
    )
def _convert_comparison_operator(value: str) -> ComparisonOperator:
    """Translate a comparison operator from expression-string spelling to
    the spelling used by QueryTree.

    Raises `AssertionError` for an operator that has no translation.
    """
    # "=" and "OVERLAPS" are respelled; the remaining operators are
    # written the same way in both systems.
    translations: dict[str, ComparisonOperator] = {
        "=": "==",
        "OVERLAPS": "overlaps",
        "!=": "!=",
        "<": "<",
        ">": ">",
        "<=": "<=",
        ">=": ">=",
    }
    converted = translations.get(value)
    if converted is None:
        raise AssertionError(f"Unhandled comparison operator {value}")
    return converted
405def _convert_in_clause_to_predicate(lhs: ColumnExpression, rhs: _VisitorResult, node: Node) -> Predicate:
406 """Convert ``lhs IN rhs`` expression to an equivalent ``Predicate``
407 value.
408 """
409 match rhs:
410 case _Sequence():
411 return Predicate.in_container(lhs, rhs.value)
412 case _RangeLiteral():
413 stride = rhs.value.stride
414 if stride is None:
415 stride = 1
416 # Expression strings use inclusive ranges, but Predicate uses
417 # ranges that exclude the stop value.
418 stop = rhs.value.stop + 1
419 return Predicate.in_range(lhs, rhs.value.start, stop, stride)
420 case _ColExpr():
421 return Predicate.compare(lhs, "==", rhs.value)
422 case _Null():
423 return Predicate.is_null(lhs)
424 case _:
425 raise InvalidQueryError(f"Invalid IN expression: '{node!s}")
def _get_boolean_column_reference(predicate: Predicate) -> ColumnReference | None:
    """Recover the boolean ColumnReference wrapped by a predicate.

    Returns `None` unless the predicate consists of exactly one operand
    group holding exactly one leaf, and that leaf is a boolean wrapper.

    This is the inverse of the ColumnReference to Predicate conversion that
    visitIdentifier performs for boolean columns.
    """
    groups = predicate.operands
    if len(groups) != 1 or len(groups[0]) != 1:
        return None

    leaf = groups[0][0]
    return leaf.operand if leaf.predicate_type == "boolean_wrapper" else None
def _get_float_literal_value(value: _VisitorResult, node: Node, name: str) -> float:
    """Return the numeric literal in ``value`` as a `float`.

    Accepts a literal `float` or `int` expression, optionally wrapped in
    unary minus.  Anything else raises `InvalidQueryError`, reporting
    ``node`` as the offending argument of the ``name``() function.
    """
    if isinstance(value, _ColExpr):
        expr = value.value
        if expr.expression_type == "float":
            return expr.value
        if expr.expression_type == "int":
            return float(expr.value)
        if expr.expression_type == "unary" and expr.operator == "-":
            # A negative number arrives as unary minus applied to a literal:
            # resolve the operand recursively and flip its sign.
            magnitude = _get_float_literal_value(_ColExpr(expr.operand), node, name)
            return -1 * magnitude

    raise InvalidQueryError(f"Expression '{node}' in {name}() is not a literal number.")