Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 30%
159 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-15 10:02 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-15 10:02 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CheckVisitor",
25 "InspectionVisitor",
26 "InspectionSummary",
27)
29import dataclasses
30from collections.abc import Mapping, Sequence, Set
31from typing import TYPE_CHECKING, Any, List, Optional, Tuple
33from ....core import (
34 DataCoordinate,
35 DataIdValue,
36 Dimension,
37 DimensionElement,
38 DimensionGraph,
39 DimensionUniverse,
40 NamedKeyDict,
41 NamedValueSet,
42)
43from ..._exceptions import UserExpressionError
44from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId
45from .normalForm import NormalForm, NormalFormVisitor
46from .parser import Node, TreeVisitor
48if TYPE_CHECKING: 48 ↛ 49line 48 didn't jump to line 49, because the condition on line 48 was never true
49 import astropy.time
@dataclasses.dataclass
class InspectionSummary:
    """Shared base for the summary objects that `CheckVisitor` and
    `InspectionVisitor` use to record what a parsed expression references.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Every dimension whose primary key or dependencies were mentioned
    anywhere in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Columns referenced in this branch, grouped by the dimension element
    table that holds them (`NamedKeyDict` [ `DimensionElement`,
    `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` if the expression uses the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns, and ingest-date flag of ``other``
        into ``self``.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents are added to this one; it is only read.
        """
        self.dimensions.update(other.dimensions)
        for element, names in other.columns.items():
            # Each element gets its own set on first sight, so the sets
            # owned by ``other`` are never shared with ``self``.
            seen = self.columns.setdefault(element, set())
            seen.update(names)
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one node of a parsed
    expression tree.

    Notes
    -----
    On top of the referenced dimensions and columns, this class tracks
    whether the node is a simple dimension equivalence such as "tract=4".
    Only the ``=`` operator can pair a key with a value; any other
    combination of two summaries discards both.  `InspectionVisitor` fills
    these attributes in unconditionally; `CheckVisitor` reads them when it
    delegates to `InspectionVisitor` to learn which dimension values a
    branch of the normal-form expression sets.
    """

    dataIdKey: Optional[Dimension] = None
    """A `Dimension` that is fully identified by a literal in this branch
    when `dataIdValue` is also set, or that may yet be paired with one when
    `dataIdValue` is `None`.
    """

    dataIdValue: Optional[str] = None
    """A literal that constrains `dataIdKey` when that is set, or that may
    yet be paired with a dimension when `dataIdKey` is `None`.

    Literals taken from the expression text are `str`; values substituted
    from the bind mapping are stored as provided.  Either may need to be
    coerced to the dimension's key type to reflect the user's intent.
    """

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both branches.

        Parameters
        ----------
        other : `TreeSummary`
            Summary of the sibling branch.
        isEq : `bool`, optional
            `True` when the two branches are the operands of the equality
            operator; defaults to `False`.

        Returns
        -------
        self : `TreeSummary`
            This object, modified in place.
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": adopt the value from the right-hand side.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": adopt the key from the right-hand side.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Anything else is not a simple key/value equivalence.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

    def isDataIdKeyOnly(self) -> bool:
        """Report whether this branch holds a data ID key and no value."""
        hasKey = self.dataIdKey is not None
        hasValue = self.dataIdValue is not None
        return hasKey and not hasValue

    def isDataIdValueOnly(self) -> bool:
        """Report whether this branch holds a literal value and no key,
        i.e. something usable as the value of a data ID key-value pair.
        """
        hasKey = self.dataIdKey is not None
        hasValue = self.dataIdValue is not None
        return hasValue and not hasKey
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """Implements TreeVisitor to identify dimension elements that need
    to be included in a query, prior to actually constructing a SQLAlchemy
    WHERE clause from it.

    Each ``visit*`` method returns a `TreeSummary` for the node it is given;
    summaries of child nodes are merged into their parent's summary.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe = universe
        self.bind = bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A bare literal may turn out to be the value of a data ID pair.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        # Time literals are never recorded as data ID values.
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound identifiers take precedence over everything else and behave
        # like literals.
        if name in self.bind:
            value = self.bind[name]
            if isinstance(value, (list, tuple, Set)):
                # This can happen on rhs of IN operator, if there is only one
                # element in the list then take it.
                if len(value) == 1:
                    return TreeSummary(dataIdValue=next(iter(value)))
                else:
                    return TreeSummary()
            else:
                return TreeSummary(dataIdValue=value)
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        elif constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        element, column = categorizeElementId(self.universe, name)
        if column is None:
            # The identifier names a dimension's key, so it can act as the
            # key of a data ID pair.
            assert isinstance(element, Dimension)
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                dataIdKey=element,
            )
        else:
            # The identifier names a column of an element's table.
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions), columns=NamedKeyDict({element: {column}})
            )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # A unary operator changes the meaning of what it wraps: a negated
        # "instrument='HSC'" does not pin instrument to 'HSC', and a signed
        # literal is not the bare literal held in the operand's summary.
        # Keep the referenced dimensions and columns, but stop reporting the
        # operand as a data ID key or value so that it cannot be recorded as
        # a "<dimension>=<value>" constraint.
        operand.dataIdKey = None
        operand.dataIdValue = None
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        # Only "=" can pair a data ID key with a value; `TreeSummary.merge`
        # drops both for every other operator.
        return lhs.merge(rhs, isEq=(operator == "="))

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Merging without isEq means an IN term never yields a key/value
        # pair, even when it has a single element.
        for v in values:
            lhs.merge(v)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        result = TreeSummary()
        for i in items:
            result.merge(i)
        return result

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # The summaries of the two coordinates are not propagated.
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for one inner group of expression
    branches that are AND'd together, recording the dimensions and tables
    they reference so the group can be checked for consistency and
    completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Value of each dimension that this group of branches equates with a
    literal, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that the default data ID does provide.

    When the WHERE clause is finalized, these should be added to the
    query's data ID.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for the expression as a whole,
    recording the dimensions, tables, and governor dimension values it
    references.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values the expression gives for each dimension relevant to the
    query, keyed by dimension name.

    A dimension with no entry here is not constrained by the expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that the default data ID does provide.

    When the WHERE clause is finalized, these should be added to the
    query's data ID.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing default for governor dimensions.

    Notes
    -----
    The ``dataId`` and ``graph`` attributes are not just inputs:
    `visitOuter` replaces ``graph`` when the expression references
    dimensions outside it, and replaces ``dataId`` when values from
    ``defaults`` are needed.  Only disjunctive normal form is supported.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        # Individual branches are summarized by a plain InspectionVisitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add any governor values it
        # contains to the summary in advance.
        summary = InnerSummary()
        summary.dimension_values.update(
            (k, self.dataId[k])
            for k in (self.dataId.graph.names if self.dataId.hasFull() else self.dataId.graph.required.names)
        )
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; categorizeIdentifier is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4').  If so,
            # remember that we've constrained it on this branch to later make
            # sure it's consistent with any other constraints on any other
            # branches it's AND'd with.
            if branch.dataIdKey is not None and branch.dataIdValue is not None:
                # Coerce the literal to the dimension's key type so that
                # values are compared like with like.
                new_value = branch.dataIdKey.primaryKey.getPythonType()(branch.dataIdValue)
                value = summary.dimension_values.setdefault(branch.dataIdKey.name, new_value)
                if value != new_value:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if branch.dataIdKey in self.dataId:
                        raise UserExpressionError(
                            f"Conflict between expression containing {branch.dataIdKey.name}={new_value!r} "
                            f"and data ID with {branch.dataIdKey.name}={value!r}."
                        )
                    else:
                        # Report the coerced value, i.e. the one that was
                        # actually compared, so both sides of the message
                        # have the same type.
                        raise UserExpressionError(
                            f"Conflicting literal values for {branch.dataIdKey.name} in expression: "
                            f"{value!r} != {new_value!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: set[str] = set()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors.names)
        missing = governorsNeededInBranch - summary.dimension_values.keys()
        if missing:
            if missing <= self.defaults.names:
                # Every missing governor can be filled in from the defaults.
                summary.defaultsNeeded.update(missing)
            else:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # Iterate over branches in first pass to gather all dimensions and
            # columns referenced.  This aggregation is for the full query, so
            # we don't care whether things are joined by AND or OR (or + or -,
            # etc).  Also gather the set of dimensions directly constrained or
            # pulled from defaults in _all_ branches.  This is the set we will
            # be able to bound overall; any dimensions not referenced by even
            # one branch could be unbounded.
            dimensions_in_all_branches = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                dimensions_in_all_branches.intersection_update(branch.dimension_values)
            # Go back through and set up the dimension bounds: for each
            # dimension constrained everywhere, collect the value it takes in
            # each branch.
            summary.dimension_constraints.update(
                {
                    dim: {branch.dimension_values[dim] for branch in branches}
                    for dim in dimensions_in_all_branches
                }
            )
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # NOTE(review): dimension_constraints only holds dimensions present in
        # every branch's dimension_values, while defaultsNeeded only holds
        # dimensions absent from at least one branch's values, so this check
        # looks unreachable as written - confirm whether the constraints were
        # meant to be gathered from any branch rather than all of them.
        for dimension, values in summary.dimension_constraints.items():
            if dimension in summary.defaultsNeeded:
                # One branch contained an explicit value for this dimension
                # while another needed to refer to the default data ID.
                # Even if these refer to the same value, that inconsistency
                # probably indicates user error.
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch.  "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            for dimension in summary.defaultsNeeded:
                summary.dimension_constraints[dimension] = {self.defaults[dimension]}

        return summary