Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 33%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CheckVisitor",
25 "InspectionVisitor",
26 "InspectionSummary",
27)
29import dataclasses
30from typing import TYPE_CHECKING, AbstractSet, List, Optional, Sequence, Set, Tuple
32from ....core import (
33 DataCoordinate,
34 Dimension,
35 DimensionElement,
36 DimensionGraph,
37 DimensionUniverse,
38 GovernorDimension,
39 NamedKeyDict,
40 NamedValueSet,
41)
42from ...summaries import GovernorDimensionRestriction
43from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId
44from .normalForm import NormalForm, NormalFormVisitor
45from .parser import Node, TreeVisitor
47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true
48 import astropy.time
@dataclasses.dataclass
class InspectionSummary:
    """Common state shared by the result objects of `CheckVisitor` and
    `InspectionVisitor` while they gather information about a parsed
    expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Every dimension whose primary key or dependencies were referenced
    anywhere in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, Set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Names of the columns referenced anywhere in this branch, keyed by the
    dimension element table they belong to
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` if the expression uses the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns, and ingest-date flag of ``other``
        into ``self``, in place.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents should be added to this one.
        """
        self.dimensions.update(other.dimensions)
        for table, names in other.columns.items():
            # Create the per-table column set on first sight, then extend it.
            known = self.columns.setdefault(table, set())
            known.update(names)
        # The flag is sticky: once set on ``self`` it is never overwritten.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one expression (sub)tree.

    Notes
    -----
    On top of the base-class state, this class tracks whether the subtree is
    a simple dimension equivalence expression such as "tract=4".  The extra
    attributes are maintained by `merge`; `CheckVisitor` reads them to learn
    which governor dimension values a branch of the normal-form expression
    fixes.
    """

    dataIdKey: Optional[Dimension] = None
    """A `Dimension` that is (when `dataIdValue` is not `None`) or may be
    (when `dataIdValue` is `None`) fully identified by a literal value in
    this branch.
    """

    dataIdValue: Optional[str] = None
    """A literal value that constrains (when `dataIdKey` is not `None`) or
    may constrain (when `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both expression tree
        branches.

        Parameters
        ----------
        other : `TreeSummary`
            Summary of the other branch.
        isEq : `bool`, optional
            Pass `True` when the two branches are joined by the equality
            operator; defaults to `False`.

        Returns
        -------
        self : `TreeSummary`
            This object, modified in place.
        """
        self.update(other)
        if isEq:
            # '<key> = <value>': keep our key and adopt the other's value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # '<value> = <key>': keep our value and adopt the other's key.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key-value pair.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

    def isDataIdKeyOnly(self) -> bool:
        """Return `True` if this branch holds a data ID key and no value."""
        return not (self.dataIdKey is None or self.dataIdValue is not None)

    def isDataIdValueOnly(self) -> bool:
        """Return `True` if this branch holds a literal value and no data ID
        key.
        """
        return not (self.dataIdKey is not None or self.dataIdValue is None)
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """A `TreeVisitor` that reports which dimension elements an expression
    references, so they can be included in a query before the SQLAlchemy
    WHERE clause is built from it.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that represent bound parameter values, and hence need not
        represent in-database entities.
    """

    def __init__(self, universe: DimensionUniverse, bindKeys: AbstractSet[str]):
        self.universe = universe
        self.bindKeys = bindKeys

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A numeric literal may serve as the value of a data ID pair.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A string literal may serve as the value of a data ID pair.
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound parameters contribute nothing to the summary.
        if name in self.bindKeys:
            return TreeSummary()
        # Special constants are handled before dimension lookups.
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            # A specific column of a dimension element table.
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}}),
            )
        # No column: the identifier names a dimension's key, so it can act as
        # the key of a data ID pair.
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        isEquality = operator == "="
        return lhs.merge(rhs, isEq=isEquality)

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for one inner group of AND'd together
    expression branches, used to check that group for consistency and
    completeness.
    """

    governors: NamedKeyDict[GovernorDimension, str] = dataclasses.field(
        default_factory=NamedKeyDict,
    )
    """Value of each governor dimension that this expression branch equates
    with a literal value.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions whose values are needed by the query, not provided
    in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for the entire expression: referenced
    dimensions, tables, and governor dimension values.
    """

    governors: GovernorDimensionRestriction = dataclasses.field(
        default_factory=GovernorDimensionRestriction.makeFull,
    )
    """All values that appear in this expression for the governor dimensions
    relevant to the query.

    A governor dimension that is absent from this mapping is not constrained
    by this expression.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions whose values are needed by the query, not provided
    in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that represent bound parameter values, and hence need not
        represent in-database entities.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.

    Notes
    -----
    `visitOuter` may replace the ``graph`` and ``dataId`` attributes: the
    former is expanded to cover every dimension the expression references,
    and the latter gains any default governor values the expression relies
    on.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bindKeys: AbstractSet[str],
        defaults: DataCoordinate,
    ):
        self.dataId = dataId
        self.graph = graph
        self.bindKeys = bindKeys
        self.defaults = defaults
        # Individual branches are summarized by delegating to an
        # InspectionVisitor that shares our universe and bind keys.
        self._branchVisitor = InspectionVisitor(dataId.universe, bindKeys)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Raises RuntimeError if governor values conflict with each other or
        # with the data ID, or if a needed governor value is given neither by
        # the expression, the data ID, nor the defaults.
        #
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add any governor values it
        # contains to the summary in advance.
        summary = InnerSummary()
        summary.governors.update((k, self.dataId[k]) for k in self.dataId.graph.governors)  # type: ignore
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; categorizeIdentifier is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4').
            # If so, and it's a governor dimension, remember that we've
            # constrained it on this branch, and make sure it's consistent
            # with any other constraints on any other branches it's AND'd with.
            if isinstance(branch.dataIdKey, GovernorDimension) and branch.dataIdValue is not None:
                governor = branch.dataIdKey
                value = summary.governors.setdefault(governor, branch.dataIdValue)
                if value != branch.dataIdValue:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if governor in self.dataId:
                        raise RuntimeError(
                            f"Conflict between expression containing {governor.name}={branch.dataIdValue!r} "
                            f"and data ID with {governor.name}={value!r}."
                        )
                    else:
                        raise RuntimeError(
                            f"Conflicting literal values for {governor.name} in expression: "
                            f"{value!r} != {branch.dataIdValue!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: NamedValueSet[GovernorDimension] = NamedValueSet()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors)
        if not governorsNeededInBranch.issubset(summary.governors.keys()):
            missing = NamedValueSet(governorsNeededInBranch - summary.governors.keys())
            if missing <= self.defaults.keys():
                # Everything that is missing can be filled in from the default
                # data ID; record that so visitOuter can apply it.
                summary.defaultsNeeded.update(missing)
            else:
                raise RuntimeError(
                    f"No value(s) for governor dimensions {missing - self.defaults.keys()} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Raises RuntimeError if any governor dimension is explicitly
        # constrained in one inner branch but left to its default in another.
        #
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        # Iterate over branches in first pass to gather all dimensions and
        # columns referenced.  This aggregation is for the full query, so we
        # don't care whether things are joined by AND or OR (or + or -, etc).
        summary = OuterSummary()
        for branch in branches:
            summary.update(branch)
            summary.governors.update(branch.governors)
            summary.defaultsNeeded.update(branch.defaultsNeeded)
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # Second pass: now that defaultsNeeded has been aggregated over all
        # branches, check every inner branch against it.  (Relying on the
        # loop variable left over from the first pass would only ever inspect
        # the last branch, and would fail outright when there are none.)
        for branch in branches:
            for governor, value in branch.governors.items():
                if governor in summary.defaultsNeeded:
                    # One branch contained an explicit value for this dimension
                    # while another needed to refer to the default data ID.
                    # Even if these refer to the same value, that inconsistency
                    # probably indicates user error.
                    raise RuntimeError(
                        f"Governor dimension {governor.name} is explicitly "
                        f"constrained to {value} in one or more branches of "
                        "this query where expression, but is left to default "
                        f"to {self.defaults[governor]!r} in another branch.  "
                        "Defaults and explicit constraints cannot be mixed."
                    )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            assert self.dataId.hasRecords(), (
                "Should be a union of two data IDs with records, "
                "in which one only adds governor dimension values."
            )
            summary.governors.intersection_update(
                # We know the value for a governor dimension is always a str,
                # and that's all self.defaults should contain, but MyPy doesn't
                # know that.
                {dimension: self.defaults[dimension] for dimension in summary.defaultsNeeded}  # type: ignore
            )
        return summary