Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 32%
153 statements
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-27 02:00 -0700
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-27 02:00 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CheckVisitor",
25 "InspectionVisitor",
26 "InspectionSummary",
27)
29import dataclasses
30from typing import TYPE_CHECKING, Any, List, Mapping, Optional, Sequence, Set, Tuple
32from ....core import (
33 DataCoordinate,
34 DataIdValue,
35 Dimension,
36 DimensionElement,
37 DimensionGraph,
38 DimensionUniverse,
39 NamedKeyDict,
40 NamedValueSet,
41)
42from ..._exceptions import UserExpressionError
43from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId
44from .normalForm import NormalForm, NormalFormVisitor
45from .parser import Node, TreeVisitor
47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true
48 import astropy.time
@dataclasses.dataclass
class InspectionSummary:
    """Common base for the summary objects that `CheckVisitor` and
    `InspectionVisitor` fill in while walking a parsed expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies appear anywhere in this
    branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, Set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Columns referenced anywhere in this branch, grouped by the dimension
    element table they belong to
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` if the expression mentions the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns, and ingest-date flag of ``other``
        into ``self``.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents are added to this one.
        """
        self.dimensions.update(other.dimensions)
        for element, names in other.columns.items():
            # A fresh set is created when ``self`` has no entry for this
            # element yet; the names from ``other`` are then added to it.
            known = self.columns.setdefault(element, set())
            known.update(names)
        # The flag is sticky: once set on ``self`` it is never reset here.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one expression (sub)tree.

    Notes
    -----
    On top of the dimensions and columns tracked by the base class, this
    class tracks whether the subtree is a simple dimension equivalence
    expression (e.g. "tract=4").  `InspectionVisitor` fills these attributes
    in unconditionally; `CheckVisitor` reads them to learn which dimension
    values are fixed in a branch of the normal-form expression.
    """

    dataIdKey: Optional[Dimension] = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: Optional[str] = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    Numeric and string literals are stored here as the `str` handed to the
    visitor, so the value may need to be coerced (e.g. to `int`) to reflect
    the actual user intent.  Values substituted from the ``bind`` mapping
    are stored exactly as given and are therefore not necessarily `str`.
    """

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Combine ``other`` into ``self`` so that ``self`` summarizes both
        expression tree branches.

        Parameters
        ----------
        other : `TreeSummary`
            Summary of the other branch.
        isEq : `bool`, optional
            Pass `True` when the two branches are the operands of the
            equality operator.  Defaults to `False`.

        Returns
        -------
        self : `TreeSummary`
            This object, modified in place.
        """
        self.update(other)
        if isEq:
            # '<key> = <value>': adopt the value from the right-hand side.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # '<value> = <key>': adopt the key from the right-hand side.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key/value equivalence.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

    def isDataIdKeyOnly(self) -> bool:
        """Return `True` if this branch holds a data ID key and no value."""
        return self.dataIdValue is None and self.dataIdKey is not None

    def isDataIdValueOnly(self) -> bool:
        """Return `True` if this branch holds a candidate data ID value and
        no key.
        """
        return self.dataIdValue is not None and self.dataIdKey is None
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """Implements TreeVisitor to identify dimension elements that need
    to be included in a query, prior to actually constructing a SQLAlchemy
    WHERE clause from it.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe = universe
        self.bind = bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # The literal is kept as text; it is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        # Time literals are never recorded as data ID values.
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound identifiers take precedence over everything else and act
        # like literals; the bound object is stored as-is.
        if name in self.bind:
            return TreeSummary(dataIdValue=self.bind[name])
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        elif constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        element, column = categorizeElementId(self.universe, name)
        if column is None:
            # No column: the identifier names a dimension itself, which makes
            # it a candidate data ID key.
            assert isinstance(element, Dimension)
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                dataIdKey=element,
            )
        else:
            # A specific column of an element's table was referenced.
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions), columns=NamedKeyDict({element: {column}})
            )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # Dimensions and columns referenced by the operand are still
        # referenced by the whole expression, so the summary is reused.
        # However, anything other than a unary plus changes the meaning of
        # the operand: "NOT instrument='X'" does not constrain instrument to
        # 'X', and "-5" is not the literal "5".  In those cases the result
        # must not be reported as a bare data ID key, a bare data ID value,
        # or a completed key=value pair.
        # NOTE(review): assumes the parser spells unary plus as "+" - confirm
        # against the parser's UnaryOp nodes.
        if operator != "+":
            operand.dataIdKey = None
            operand.dataIdValue = None
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        # Only "=" can turn a key and a value into a key=value pair.
        return lhs.merge(rhs, isEq=(operator == "="))

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Merging without isEq clears any data ID key/value on ``lhs``, so IN
        # terms never count as simple key=value constraints.
        for v in values:
            lhs.merge(v)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        result = TreeSummary()
        for i in items:
            result.merge(i)
        return result

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # The summaries of the ``ra`` and ``dec`` operands are discarded.
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for one inner group of expression
    branches that are AND'd together; it collects the dimensions and tables
    they reference so they can be checked for consistency and completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of all dimensions that are equated with literal values in this
    expression branch, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions whose values are needed by the query, not
    provided in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for the entire expression: the
    dimensions and tables it references plus the dimension values it pins.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values that appear in this expression for dimensions relevant to
    the query, keyed by dimension name.

    A dimension that has no entry in this dict is not constrained by this
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions whose values are needed by the query, not
    provided in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing default for governor dimensions.

    Notes
    -----
    `visitOuter` may replace the ``dataId`` and ``graph`` attributes of this
    object with expanded versions; callers should read them back from the
    visitor after the expression has been visited.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        # ``bind`` is only needed by the per-branch visitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add the values it contains to
        # the summary in advance.
        summary = InnerSummary()
        summary.dimension_values.update(
            (k, self.dataId[k])
            for k in (self.dataId.graph.names if self.dataId.hasFull() else self.dataId.graph.required.names)
        )
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; categorizeIdentifier is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4').  If so,
            # remember that we've constrained it on this branch to later make
            # sure it's consistent with any other constraints on any other
            # branches it's AND'd with.
            if branch.dataIdKey is not None and branch.dataIdValue is not None:
                key = branch.dataIdKey
                # Coerce the literal to the Python type of the dimension's
                # primary key so comparisons are made between like types.
                new_value = key.primaryKey.getPythonType()(branch.dataIdValue)
                value = summary.dimension_values.setdefault(key.name, new_value)
                if value != new_value:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if key in self.dataId:
                        raise UserExpressionError(
                            f"Conflict between expression containing {key.name}={new_value!r} "
                            f"and data ID with {key.name}={value!r}."
                        )
                    else:
                        # Report the coerced value, i.e. the one that was
                        # actually compared, rather than the raw literal.
                        raise UserExpressionError(
                            f"Conflicting literal values for {key.name} in expression: "
                            f"{value!r} != {new_value!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: set[str] = set()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors.names)
        missing = governorsNeededInBranch - summary.dimension_values.keys()
        if missing:
            if missing <= self.defaults.names:
                # Every missing governor can be filled in from the defaults.
                summary.defaultsNeeded.update(missing)
            else:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # Iterate over branches in first pass to gather all dimensions and
            # columns referenced.  This aggregation is for the full query, so
            # we don't care whether things are joined by AND or OR (or + or -,
            # etc).  Also gather the set of dimensions directly constrained
            # in _all_ branches.  This is the set we will be able to bound
            # overall; any dimension left unconstrained by even one branch
            # could be unbounded.
            dimensions_in_all_branches = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                dimensions_in_all_branches.intersection_update(branch.dimension_values)
            # Go back through and set up the dimension bounds: for each such
            # dimension, the set of values it takes across all branches.
            summary.dimension_constraints.update(
                {
                    dim: {branch.dimension_values[dim] for branch in branches}
                    for dim in dimensions_in_all_branches
                }
            )
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        for dimension, values in summary.dimension_constraints.items():
            if dimension in summary.defaultsNeeded:
                # One branch contained an explicit value for this dimension
                # while another needed to refer to the default data ID.
                # Even if these refer to the same value, that inconsistency
                # probably indicates user error.
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch. "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            for dimension in summary.defaultsNeeded:
                summary.dimension_constraints[dimension] = {self.defaults[dimension]}

        return summary