Coverage for python / lsst / daf / butler / queries / _expression_strings.py: 17%

229 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-06 08:30 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30from collections.abc import Set 

31from typing import Literal, NamedTuple, TypeAlias 

32from uuid import UUID 

33 

34import astropy.time 

35 

36import lsst.sphgeom 

37 

38from .._exceptions import InvalidQueryError 

39from .._timespan import Timespan 

40from ..column_spec import ColumnType 

41from ..dimensions import DimensionUniverse 

42from ._identifiers import IdentifierContext, interpret_identifier 

43from .expressions.categorize import ExpressionConstant, categorizeConstant 

44from .expressions.parser import ( 

45 BoxNode, 

46 CircleNode, 

47 Node, 

48 PointNode, 

49 PolygonNode, 

50 RangeLiteral, 

51 RegionNode, 

52 TreeVisitor, 

53 parse_expression, 

54) 

55from .tree import ( 

56 BinaryExpression, 

57 ColumnExpression, 

58 ColumnReference, 

59 ComparisonOperator, 

60 LiteralValue, 

61 Predicate, 

62 UnaryExpression, 

63 make_column_literal, 

64) 

65 

# Type of a value that may appear in the bind map: either a single literal or
# a homogeneous collection of literals.  Tuples may have any length (they are
# simply iterated over when converted), hence the variadic ``tuple[..., ...]``
# form rather than a fixed one-element tuple.
BindValue = LiteralValue | list[LiteralValue] | tuple[LiteralValue, ...] | Set[LiteralValue]

67 

68 

def convert_expression_string_to_predicate(
    expression: str, *, context: IdentifierContext, universe: DimensionUniverse
) -> Predicate:
    """Translate a Butler query expression string into a `Predicate` that
    can be used in a QueryTree.

    Parameters
    ----------
    expression : `str`
        Butler expression query string, as used by the old query system to
        specify filtering.
    context : `IdentifierContext`
        Contextual information that helps determine the meaning of an
        identifier used in a query.
    universe : `DimensionUniverse`
        Dimension metadata for the Butler database being queried.

    Returns
    -------
    predicate : `Predicate`
        Predicate corresponding to that filter, for use in `QueryTree`.

    Raises
    ------
    InvalidQueryError
        Raised if the expression string cannot be parsed; the parser's
        exception is chained as the cause.
    """
    try:
        parsed = parse_expression(expression)
    except Exception as exc:
        # Any parser failure is reported uniformly as an invalid query.
        raise InvalidQueryError(f"Failed to parse expression '{expression}'") from exc

    # The parser produced no tree at all (presumably an empty expression --
    # confirm against the parser), so nothing is filtered out.
    if parsed is None:
        return Predicate.from_bool(True)

    result = parsed.visit(_ConversionVisitor(context, universe))
    assert isinstance(result, Predicate), (
        "The grammar should guarantee that we get a predicate back at the top level."
    )
    return result

104 

105 

class _ColExpr(NamedTuple):
    """Wrapper for a piece of the original expression that has already been
    converted to a `ColumnExpression`.

    Notes
    -----
    This thin wrapper is mostly a typing and ``match`` aid:
    `ColumnExpression` is a large discriminated union, and mypy struggled
    with it inside the additional union layers of ``_VisitorResult``.
    """

    # The converted column expression being wrapped.
    value: ColumnExpression

    @property
    def column_type(self) -> ColumnType:
        """Column type reported by the wrapped expression."""
        wrapped = self.value
        return wrapped.column_type

121 

122 

class _Null:
    """Marker type for a literal 'null' value found in the expression."""

    # Mirrors the ``column_type`` attribute of the other visitor result
    # types so that error messages can report it uniformly.
    column_type: Literal["null"] = "null"

127 

128 

class _RangeLiteral(NamedTuple):
    """Visitor result that carries a range expression."""

    # The parser's range node, kept intact for later conversion.
    value: RangeLiteral
    # Fixed tag identifying this result kind.
    column_type: Literal["range"] = "range"

134 

135 

class _Sequence(NamedTuple):
    """Visitor result that carries a list of column expressions."""

    # The individual expressions making up the sequence.
    value: list[ColumnExpression]
    # Fixed tag identifying this result kind.
    column_type: Literal["sequence"] = "sequence"

139 

140 

# Union of everything a visitor method may return while walking the tree.
_VisitorResult: TypeAlias = (
    Predicate | _ColExpr | _Null | _RangeLiteral | _Sequence
)

142 

143 

144class _ConversionVisitor(TreeVisitor[_VisitorResult]): 

145 def __init__(self, context: IdentifierContext, universe: DimensionUniverse): 

146 super().__init__() 

147 self.context = context 

148 self.universe = universe 

149 

150 def visitBinaryOp( 

151 self, operator: str, lhs: _VisitorResult, rhs: _VisitorResult, node: Node 

152 ) -> _VisitorResult: 

153 match (operator, lhs, rhs): 

154 # Handle boolean operators. 

155 case ["OR", Predicate() as lhs, Predicate() as rhs]: 

156 return lhs.logical_or(rhs) 

157 case ["AND", Predicate() as lhs, Predicate() as rhs]: 

158 return lhs.logical_and(rhs) 

159 

160 # Handle comparison operators. 

161 case [("=" | "!=" | "<" | ">" | "<=" | ">=" | "OVERLAPS"), _ColExpr() as lhs, _ColExpr() as rhs]: 

162 return Predicate.compare( 

163 a=lhs.value, b=rhs.value, operator=_convert_comparison_operator(operator) 

164 ) 

165 

166 # Allow equality comparisons with None/NULL. We don't have an 'IS' 

167 # operator. 

168 case ["=", _ColExpr() as lhs, _Null()]: 

169 return Predicate.is_null(lhs.value) 

170 case ["!=", _ColExpr() as lhs, _Null()]: 

171 return Predicate.is_null(lhs.value).logical_not() 

172 case ["=", _Null(), _ColExpr() as rhs]: 

173 return Predicate.is_null(rhs.value) 

174 case ["!=", _Null(), _ColExpr() as rhs]: 

175 return Predicate.is_null(rhs.value).logical_not() 

176 # Boolean columns can be null, but will have been converted to 

177 # Predicate, so we need additional cases. 

178 case ["=" | "!=", Predicate() as pred, _Null()] | ["=" | "!=", _Null(), Predicate() as pred]: 

179 column_ref = _get_boolean_column_reference(pred) 

180 if column_ref is not None: 

181 match operator: 

182 case "=": 

183 return Predicate.is_null(column_ref) 

184 case "!=": 

185 return Predicate.is_null(column_ref).logical_not() 

186 

187 # Handle arithmetic operations 

188 case [("+" | "-" | "*" | "/" | "%") as op, _ColExpr() as lhs, _ColExpr() as rhs]: 

189 return _ColExpr(BinaryExpression(a=lhs.value, b=rhs.value, operator=op)) 

190 

191 raise InvalidQueryError( 

192 f"Invalid types {lhs.column_type}, {rhs.column_type} for binary operator {operator!r} " 

193 f"in expression {node!s}." 

194 ) 

195 

196 def visitIsIn( 

197 self, lhs: _VisitorResult, values: list[_VisitorResult], not_in: bool, node: Node 

198 ) -> _VisitorResult: 

199 assert isinstance(lhs, _ColExpr), "LHS of IN guaranteed to be scalar by parser." 

200 predicates = [_convert_in_clause_to_predicate(lhs.value, rhs, node) for rhs in values] 

201 result = Predicate.from_bool(False).logical_or(*predicates) 

202 if not_in: 

203 result = result.logical_not() 

204 return result 

205 

206 def visitIdentifier(self, name: str, node: Node) -> _VisitorResult: 

207 if name in self.context.bind: 

208 value = self.context.bind[name] 

209 # Lists of values do not have a direct representation in the new 

210 # query system, so we have to handle them separately here. 

211 if isinstance(value, list | tuple | Set): 

212 literals: list[ColumnExpression] = [make_column_literal(item) for item in value] 

213 types = set({item.column_type for item in literals}) 

214 if len(types) > 1: 

215 raise InvalidQueryError( 

216 f"Mismatched types in bind iterable: {value} has a mix of {types}." 

217 ) 

218 return _Sequence(literals) 

219 

220 # The other constants are handled in interpret_identifier(). 

221 if categorizeConstant(name) == ExpressionConstant.NULL: 

222 return _Null() 

223 

224 column_expression = interpret_identifier(self.context, name) 

225 if column_expression.column_type == "bool": 

226 # Expression-handling code (in this file and elsewhere) expects 

227 # boolean-valued expressions to be represented as Predicate, not a 

228 # ColumnExpression. 

229 

230 # We should only be getting direct references to a column, not a 

231 # more complicated expression. 

232 # (Anything more complicated should be a Predicate already.) 

233 assert ( 

234 column_expression.expression_type == "dataset_field" 

235 or column_expression.expression_type == "dimension_field" 

236 or column_expression.expression_type == "dimension_key" 

237 ) 

238 return Predicate.from_bool_expression(column_expression) 

239 else: 

240 return _ColExpr(column_expression) 

241 

242 def visitBind(self, name: str, node: Node) -> _VisitorResult: 

243 if name not in self.context.bind: 

244 raise InvalidQueryError(f"Name {name!r} is not in the bind map.") 

245 # Logic in visitIdentifier handles binds. 

246 return self.visitIdentifier(name, node) 

247 

248 def visitNumericLiteral(self, value: str, node: Node) -> _VisitorResult: 

249 numeric: int | float 

250 try: 

251 numeric = int(value) 

252 except ValueError: 

253 # int() raises for float-like strings 

254 numeric = float(value) 

255 return _make_literal(numeric) 

256 

257 def visitParens(self, expression: _VisitorResult, node: Node) -> _VisitorResult: 

258 return expression 

259 

260 def visitPointNode(self, ra: _VisitorResult, dec: _VisitorResult, node: PointNode) -> _VisitorResult: 

261 ra_value = _get_float_literal_value(ra, node.ra, "POINT") 

262 dec_value = _get_float_literal_value(dec, node.dec, "POINT") 

263 

264 lon_lat = lsst.sphgeom.LonLat.fromDegrees(ra_value, dec_value) 

265 return _make_literal(lon_lat) 

266 

267 def visitCircleNode( 

268 self, ra: _VisitorResult, dec: _VisitorResult, radius: _VisitorResult, node: CircleNode 

269 ) -> _VisitorResult: 

270 ra_value = _get_float_literal_value(ra, node.ra, "CIRCLE") 

271 dec_value = _get_float_literal_value(dec, node.dec, "CIRCLE") 

272 radius_value = _get_float_literal_value(radius, node.radius, "CIRCLE") 

273 

274 lon_lat = lsst.sphgeom.LonLat.fromDegrees(ra_value, dec_value) 

275 open_angle = lsst.sphgeom.Angle.fromDegrees(radius_value * 2) 

276 vec = lsst.sphgeom.UnitVector3d(lon_lat) 

277 circle = lsst.sphgeom.Circle(vec, open_angle) 

278 return _make_literal(circle) 

279 

280 def visitBoxNode( 

281 self, 

282 ra: _VisitorResult, 

283 dec: _VisitorResult, 

284 width: _VisitorResult, 

285 height: _VisitorResult, 

286 node: BoxNode, 

287 ) -> _VisitorResult: 

288 ra_value = _get_float_literal_value(ra, node.ra, "BOX") 

289 dec_value = _get_float_literal_value(dec, node.dec, "BOX") 

290 width_value = _get_float_literal_value(width, node.width, "BOX") 

291 height_value = _get_float_literal_value(height, node.height, "BOX") 

292 

293 lon_lat = lsst.sphgeom.LonLat.fromDegrees(ra_value, dec_value) 

294 half_width = lsst.sphgeom.Angle.fromDegrees(width_value / 2) 

295 half_height = lsst.sphgeom.Angle.fromDegrees(height_value / 2) 

296 box = lsst.sphgeom.Box(lon_lat, half_width, half_height) 

297 return _make_literal(box) 

298 

299 def visitPolygonNode( 

300 self, vertices: list[tuple[_VisitorResult, _VisitorResult]], node: PolygonNode 

301 ) -> _VisitorResult: 

302 sphgeom_vertices = [] 

303 for ra, dec in vertices: 

304 ra_value = _get_float_literal_value(ra, node, "POLYGON") 

305 dec_value = _get_float_literal_value(dec, node, "POLYGON") 

306 lon_lat = lsst.sphgeom.LonLat.fromDegrees(ra_value, dec_value) 

307 sphgeom_vertices.append(lsst.sphgeom.UnitVector3d(lon_lat)) 

308 

309 polygon = lsst.sphgeom.ConvexPolygon(sphgeom_vertices) 

310 return _make_literal(polygon) 

311 

312 def visitRegionNode(self, pos: _VisitorResult, node: RegionNode) -> _VisitorResult: 

313 if isinstance(pos, _ColExpr): 

314 expr = pos.value 

315 if expr.expression_type == "string": 

316 pos_str = expr.value 

317 region = lsst.sphgeom.Region.from_ivoa_pos(pos_str) 

318 return _make_literal(region) 

319 

320 raise InvalidQueryError(f"Expression '{node.pos}' in REGION() is not a literal string.") 

321 

322 def visitRangeLiteral( 

323 self, start: int, stop: int, stride: int | None, node: RangeLiteral 

324 ) -> _VisitorResult: 

325 # Consumed by visitIsIn. 

326 return _RangeLiteral(node) 

327 

328 def visitStringLiteral(self, value: str, node: Node) -> _VisitorResult: 

329 return _make_literal(value) 

330 

331 def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> _VisitorResult: 

332 return _make_literal(value) 

333 

334 def visitUuidLiteral(self, value: UUID, node: Node) -> _VisitorResult: 

335 return _make_literal(value) 

336 

337 def visitTupleNode(self, items: tuple[_VisitorResult, ...], node: Node) -> _VisitorResult: 

338 if len(items) != 2: 

339 raise InvalidQueryError(f"Timespan tuple should have exactly two items (begin, end) in '{node}'") 

340 

341 begin = _to_timespan_bound(items[0], node) 

342 end = _to_timespan_bound(items[1], node) 

343 return _make_literal(Timespan(begin, end)) 

344 

345 def visitUnaryOp(self, operator: str, operand: _VisitorResult, node: Node) -> _VisitorResult: 

346 # Docstring inherited. 

347 match (operator, operand): 

348 case ["NOT", Predicate() as operand]: 

349 return operand.logical_not() 

350 case ["+", _ColExpr(column_type="int" | "float") as operand]: 

351 # + is a no-op. 

352 return operand 

353 case ["-", _ColExpr(column_type="int" | "float", value=expr)]: 

354 return _ColExpr(UnaryExpression(operand=expr, operator="-")) 

355 raise InvalidQueryError( 

356 f"Unary operator {operator!r} is not valid for operand of type {operand.column_type} in {node!s}." 

357 ) 

358 

359 def visitGlobNode( 

360 self, expression: _VisitorResult, pattern: _VisitorResult, node: Node 

361 ) -> _VisitorResult: 

362 # Docstring inherited. 

363 if isinstance(expression, _ColExpr) and expression.value.is_column_reference: 

364 if expression.value.column_type != "string": 

365 raise InvalidQueryError(f"glob() first argument must be a string column (in node {node})") 

366 column_ref = expression.value 

367 if not (isinstance(pattern, _ColExpr) and pattern.value.expression_type == "string"): 

368 raise InvalidQueryError(f"glob() second argument must be a string (in node {node})") 

369 

370 return Predicate.compare(a=column_ref, b=pattern.value, operator="glob") 

371 

372 

def _make_literal(value: LiteralValue) -> _ColExpr:
    """Convert ``value`` to a column literal and wrap it in a `_ColExpr`."""
    literal = make_column_literal(value)
    return _ColExpr(literal)

375 

376 

def _to_timespan_bound(value: _VisitorResult, node: Node) -> astropy.time.Time | None:
    """Extract one bound of a timespan tuple from a visitor result.

    Parameters
    ----------
    value : `_VisitorResult`
        Converted tuple item.
    node : `Node`
        Tuple node being converted; used only in the error message.

    Returns
    -------
    bound : `astropy.time.Time` or `None`
        The value of a datetime expression, or `None` for a null item.

    Raises
    ------
    InvalidQueryError
        Raised if the item is neither a datetime expression nor null.
    """
    if isinstance(value, _Null):
        return None
    if isinstance(value, _ColExpr):
        inner = value.value
        if inner.expression_type == "datetime":
            return inner.value

    raise InvalidQueryError(
        f'Invalid type in timespan tuple "{node}" '
        '(Note that date/time strings must be preceded by "T" to be recognized).'
    )

388 

389 

def _convert_comparison_operator(value: str) -> ComparisonOperator:
    """Translate a comparison operator from expression-string spelling to
    the spelling used by QueryTree.

    Raises `AssertionError` for an operator that is not recognized.
    """
    # These two operators are spelled differently in the two systems.
    if value == "=":
        return "=="
    if value == "OVERLAPS":
        return "overlaps"
    # The remaining operators are spelled identically.
    if value in ("!=", "<", ">", "<=", ">="):
        return value
    raise AssertionError(f"Unhandled comparison operator {value}")

403 

404 

405def _convert_in_clause_to_predicate(lhs: ColumnExpression, rhs: _VisitorResult, node: Node) -> Predicate: 

406 """Convert ``lhs IN rhs`` expression to an equivalent ``Predicate`` 

407 value. 

408 """ 

409 match rhs: 

410 case _Sequence(): 

411 return Predicate.in_container(lhs, rhs.value) 

412 case _RangeLiteral(): 

413 stride = rhs.value.stride 

414 if stride is None: 

415 stride = 1 

416 # Expression strings use inclusive ranges, but Predicate uses 

417 # ranges that exclude the stop value. 

418 stop = rhs.value.stop + 1 

419 return Predicate.in_range(lhs, rhs.value.start, stop, stride) 

420 case _ColExpr(): 

421 return Predicate.compare(lhs, "==", rhs.value) 

422 case _Null(): 

423 return Predicate.is_null(lhs) 

424 case _: 

425 raise InvalidQueryError(f"Invalid IN expression: '{node!s}") 

426 

427 

def _get_boolean_column_reference(predicate: Predicate) -> ColumnReference | None:
    """Recover the boolean ColumnReference wrapped by a predicate.

    This reverses the ColumnReference-to-Predicate conversion that
    visitIdentifier applies to boolean columns.

    Returns `None` unless the predicate consists of exactly one operand
    group holding exactly one leaf, and that leaf is a boolean wrapper.
    """
    groups = predicate.operands
    if len(groups) != 1 or len(groups[0]) != 1:
        return None

    leaf = groups[0][0]
    return leaf.operand if leaf.predicate_type == "boolean_wrapper" else None

442 

443 

def _get_float_literal_value(value: _VisitorResult, node: Node, name: str) -> float:
    """Return the numeric value of a literal `float` or `int` expression as
    a float.

    A unary minus applied to such a literal is also accepted.  Anything else
    raises `InvalidQueryError`, reporting ``node`` as the offending
    expression and ``name`` as the function it was passed to.
    """
    if isinstance(value, _ColExpr):
        inner = value.value
        kind = inner.expression_type
        if kind == "int":
            return float(inner.value)
        if kind == "float":
            return inner.value
        if kind == "unary" and inner.operator == "-":
            # Negated literal: unwrap the operand and flip its sign.
            magnitude = _get_float_literal_value(_ColExpr(inner.operand), node, name)
            return -1 * magnitude

    raise InvalidQueryError(f"Expression '{node}' in {name}() is not a literal number.")