Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 34%
149 statements
« prev ^ index » next coverage.py v6.4.1, created at 2022-07-03 01:08 -0700
« prev ^ index » next coverage.py v6.4.1, created at 2022-07-03 01:08 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CheckVisitor",
25 "InspectionVisitor",
26 "InspectionSummary",
27)
29import dataclasses
30from typing import TYPE_CHECKING, Any, List, Mapping, Optional, Sequence, Set, Tuple
32from ....core import (
33 DataCoordinate,
34 Dimension,
35 DimensionElement,
36 DimensionGraph,
37 DimensionUniverse,
38 GovernorDimension,
39 NamedKeyDict,
40 NamedValueSet,
41)
42from ..._exceptions import UserExpressionError
43from ...summaries import GovernorDimensionRestriction
44from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId
45from .normalForm import NormalForm, NormalFormVisitor
46from .parser import Node, TreeVisitor
48if TYPE_CHECKING: 48 ↛ 49line 48 didn't jump to line 49, because the condition on line 48 was never true
49 import astropy.time
@dataclasses.dataclass
class InspectionSummary:
    """Shared accumulator describing what a parsed expression refers to.

    This is the common base of the result objects that `CheckVisitor` and
    `InspectionVisitor` use to collect information about an expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Every dimension whose primary key or dependencies are mentioned
    somewhere in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, Set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Column names mentioned somewhere in this branch, grouped by the
    dimension element table they belong to
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` when the special dataset ingest date identifier appears in this
    expression (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns and ingest-date flag of ``other``
        into ``self``, in place.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents are added to this one; it is only read.
        """
        self.dimensions.update(other.dimensions)
        for element, names in other.columns.items():
            # Create the per-element set on first sight, then extend it.
            known = self.columns.setdefault(element, set())
            known.update(names)
        # Once the flag is set it stays set; otherwise adopt the other's.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one expression (sub)tree.

    Notes
    -----
    On top of the base-class bookkeeping, this class tracks whether a branch
    is a plain dimension equivalence such as "tract=4", which is only
    recognized in simple contexts (surrounded only by ANDs and ORs). When
    `InspectionVisitor` runs on its own (i.e. when ``check=False`` in the
    query code) the extra attributes are unused but cheap. `CheckVisitor`
    relies on them, via the `InspectionVisitor` it delegates to, to learn
    which governor dimension values a branch of the normal-form expression
    pins down.
    """

    dataIdKey: Optional[Dimension] = None
    """The `Dimension` that this branch fully identifies with a literal value
    (when `dataIdValue` is set), or that it could identify once paired with
    one (when `dataIdValue` is `None`).
    """

    dataIdValue: Optional[str] = None
    """The literal that constrains a dimension in this branch (when
    `dataIdKey` is set), or that could constrain one once paired with a key
    (when `dataIdKey` is `None`).

    Declared as `str` or `None`; it may need to be coerced to `int` to
    reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Report whether this branch is nothing but a data ID key
        identifier.
        """
        if self.dataIdKey is None:
            return False
        return self.dataIdValue is None

    def isDataIdValueOnly(self) -> bool:
        """Report whether this branch is nothing but a literal usable as the
        value of a data ID key-value pair.
        """
        if self.dataIdKey is not None:
            return False
        return self.dataIdValue is not None

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both expression tree
        branches.

        Parameters
        ----------
        other : `TreeSummary`
            Summary of the other branch.
        isEq : `bool`, optional
            Pass `True` when the two branches are joined by the equality
            operator; the default is `False`.

        Returns
        -------
        self : `TreeSummary`
            This object, modified in place.
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": adopt the value from the right-hand side.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": adopt the key from the right-hand side.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key-value equivalence.
        self.dataIdKey = None
        self.dataIdValue = None
        return self
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """`TreeVisitor` that collects the dimension elements an expression
    refers to, so they can be included in a query before a SQLAlchemy WHERE
    clause is built from the expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `Mapping` [ `str`, `object` ]
        Literal values to inject into the query expression, keyed by the
        identifiers they stand in for.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.bind = bind
        self.universe = universe

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A bare number is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A bare string is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        # Time literals contribute nothing to the summary.
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound identifiers act like literals: record the bound value.
        if name in self.bind:
            return TreeSummary(dataIdValue=self.bind[name])
        # Next, the special constants that are not dimension identifiers.
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        # Everything else names a dimension element, optionally with a column.
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}}),
            )
        # No column: the identifier is a dimension usable as a data ID key.
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # The operand's summary is handed back untouched.
        # NOTE(review): any dataIdKey/dataIdValue on the operand survives the
        # unary operator as well - confirm that is intended.
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        # Only "=" can turn a key and a value into a data ID constraint.
        equality = operator == "="
        return lhs.merge(rhs, isEq=equality)

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Merging without isEq discards any data ID key/value on the result.
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        # Parentheses do not change what the expression refers to.
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # Combine every item into one fresh summary.
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        # Range literals contribute nothing to the summary.
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # The summaries of the two coordinates are not propagated.
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for one inner group of expression
    branches that are AND'd together, used to collect the dimensions and
    tables they reference and to check them for consistency and completeness.
    """

    governors: NamedKeyDict[GovernorDimension, str] = dataclasses.field(default_factory=NamedKeyDict)
    """Value of each governor dimension that this expression branch equates
    with a literal.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(default_factory=NamedValueSet)
    """Governor dimensions that the query needs a value for, that the query
    itself does not supply, and that the default data ID does provide.

    When the WHERE clause is finalized these should be added to the query's
    data ID.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for the expression as a whole:
    referenced dimensions, tables, and governor dimension values.
    """

    governors: GovernorDimensionRestriction = dataclasses.field(default_factory=GovernorDimensionRestriction.makeFull)
    """All values that this expression mentions for the governor dimensions
    relevant to the query.

    A governor dimension missing from this mapping is not constrained by the
    expression.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(default_factory=NamedValueSet)
    """Governor dimensions that the query needs a value for, that the query
    itself does not supply, and that the default data ID does provide.

    When the WHERE clause is finalized these should be added to the query's
    data ID.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """`NormalFormVisitor` implementation that works out which dimensions and
    tables a query needs, checking governor dimension constraints for
    completeness and consistency along the way.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `Mapping` [ `str`, `object` ]
        Literal values to inject into the query expression, keyed by the
        identifiers they stand in for.
    defaults : `DataCoordinate`
        A data ID holding default values for governor dimensions.

    Notes
    -----
    `visitOuter` may replace the ``graph`` and ``dataId`` attributes with
    expanded versions, so read them back after the visit.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
    ):
        # Individual branches are inspected by a plain InspectionVisitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)
        self.defaults = defaults
        self.graph = graph
        self.dataId = dataId

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the inner branches are AND'd together,
        # so every branch constrains the others and they must all agree.
        assert form is NormalForm.DISJUNCTIVE
        # Outer branches are OR'd, so a governor value (e.g. the instrument
        # or skymap needed to interpret a visit or tract number) that is
        # absent here really is absent: no other inner group can supply it.
        # The one exception is the data ID given at construction, which is
        # AND'd with the whole expression later and therefore applies to
        # every branch; its governor values are seeded up front.
        result = InnerSummary()
        seeded = [(g, self.dataId[g]) for g in self.dataId.graph.governors]
        result.governors.update(seeded)  # type: ignore
        for branch in branches:
            # Record every dimension and column seen, whatever the context.
            result.update(branch)
            # Only branches of the form '<dimension>=<value>' (or an
            # equivalent such as 'detector.id=4' for 'detector=4') on a
            # governor dimension need the consistency check below.
            governor = branch.dataIdKey
            if not isinstance(governor, GovernorDimension) or branch.dataIdValue is None:
                continue
            # Remember the constraint, or fetch the one already recorded.
            value = result.governors.setdefault(governor, branch.dataIdValue)
            if value != branch.dataIdValue:
                # Something like "instrument='HSC' AND instrument='DECam'",
                # or the data ID says one thing and the expression another.
                if governor in self.dataId:
                    message = (
                        f"Conflict between expression containing {governor.name}={branch.dataIdValue!r} "
                        f"and data ID with {governor.name}={value!r}."
                    )
                else:
                    message = (
                        f"Conflicting literal values for {governor.name} in expression: "
                        f"{value!r} != {branch.dataIdValue!r}."
                    )
                raise UserExpressionError(message)
        # With the constrained governors known, look for missing ones, e.g.
        # "visit=X" with no instrument. This catches many accidents, at the
        # price of rejecting possibly-legitimate multi-instrument queries
        # such as "visit.seeing < 0.7"; asking users to name the instruments
        # explicitly is a reasonable workaround. An expression like
        #
        #   (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # becomes, in disjunctive normal form,
        #
        #   (instrument='HSC' AND visit.seeing < 0.7)
        #     OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # so each instrument gets its own outer branch and this logic still
        # applies (which is the reason for converting to normal form).
        required: NamedValueSet[GovernorDimension] = NamedValueSet()
        for dim in result.dimensions:
            required.update(dim.graph.governors)
        if required.issubset(result.governors.keys()):
            return result
        missing = NamedValueSet(required - result.governors.keys())
        if not (missing <= self.defaults.keys()):
            raise UserExpressionError(
                f"No value(s) for governor dimensions {missing - self.defaults.keys()} in expression "
                "that references dependent dimensions. 'Governor' dimensions must always be specified "
                "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                "terms) or in a data ID passed to the query method."
            )
        # Every missing governor has a default; note that it will be needed.
        result.defaultsNeeded.update(missing)
        return result

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        # First pass: collect every dimension and column referenced. This is
        # for the query as a whole, so how the terms are joined (AND, OR,
        # +, -, ...) makes no difference.
        result = OuterSummary()
        if branches:
            # An OR of branch constraints is built up from an empty selection.
            result.governors = GovernorDimensionRestriction.makeEmpty(self.graph.universe)
            for inner in branches:
                result.update(inner)
                result.governors = result.governors.union(inner.governors)
                result.defaultsNeeded.update(inner.defaultsNeeded)
        # Grow the query graph if the expression mentions dimensions outside
        # it. This lets a user say "tract=X" on the command line (strictly,
        # "skymap=Y AND tract=X" - visitInner enforces that) when running a
        # task like ISR that has nothing to do with skymaps.
        if not summary_dimensions_covered(result, self.graph):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(result.dimensions | self.graph.dimensions),
            )
        for governor, values in result.governors.items():
            if governor in result.defaultsNeeded:
                # One branch gave this dimension an explicit value while
                # another fell back on the default data ID. Even when the
                # two agree, the mismatch probably signals user error.
                raise UserExpressionError(
                    f"Governor dimension {governor.name} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[governor]!r} in another branch. "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # When defaults were needed, fold them into self.dataId first and
        # into the governor restriction second.
        if result.defaultsNeeded:
            neededGraph = DimensionGraph(self.graph.universe, result.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(neededGraph))
            assert self.dataId.hasRecords(), (
                "Should be a union of two data IDs with records, "
                "in which one only adds governor dimension values."
            )
            # Governor dimension values are always str, and that is all
            # self.defaults should hold, but MyPy cannot tell.
            defaultValues = {gov: self.defaults[gov] for gov in result.defaultsNeeded}
            result.governors.intersection_update(defaultValues)  # type: ignore
        return result


def summary_dimensions_covered(summary: OuterSummary, graph: DimensionGraph) -> bool:
    """Report whether every dimension in ``summary`` is already in ``graph``.

    Parameters
    ----------
    summary : `OuterSummary`
        Summary whose ``dimensions`` are tested.
    graph : `DimensionGraph`
        Graph whose ``dimensions`` must contain them.

    Returns
    -------
    covered : `bool`
        Result of ``summary.dimensions.issubset(graph.dimensions)``.
    """
    return summary.dimensions.issubset(graph.dimensions)