Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py : 31%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CheckVisitor",
25 "InspectionVisitor",
26 "InspectionSummary",
27)
29import dataclasses
30from typing import (
31 AbstractSet,
32 List,
33 Optional,
34 Sequence,
35 Set,
36 Tuple,
37 TYPE_CHECKING,
38)
40from ....core import (
41 DataCoordinate,
42 DimensionUniverse,
43 Dimension,
44 DimensionElement,
45 DimensionGraph,
46 GovernorDimension,
47 NamedKeyDict,
48 NamedValueSet,
49)
50from ...summaries import GovernorDimensionRestriction
51from .parser import Node, TreeVisitor
52from .normalForm import NormalForm, NormalFormVisitor
53from .categorize import categorizeElementId, categorizeConstant, ExpressionConstant
55if TYPE_CHECKING: 55 ↛ 56line 55 didn't jump to line 56, because the condition on line 55 was never true
56 import astropy.time
@dataclasses.dataclass
class InspectionSummary:
    """Common base for the result objects that `CheckVisitor` and
    `InspectionVisitor` use to accumulate information about a parsed
    expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, Set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Dimension element tables whose columns were referenced anywhere in this
    branch (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """Whether this expression includes the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns, and ingest-date flag of ``other``
        into ``self`` (in place).

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents should be added to this one.  It is not
            modified.
        """
        # Keep our own flag if it is already set; otherwise adopt the
        # other summary's flag.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate
        # Column names are accumulated per element, creating the per-element
        # set the first time an element is seen.
        for element, names in other.columns.items():
            known = self.columns.setdefault(element, set())
            known.update(names)
        self.dimensions.update(other.dimensions)
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Result object used by `InspectionVisitor` to gather information about
    a parsed expression.

    Notes
    -----
    On top of what `InspectionSummary` tracks, this class carries the two
    attributes needed to recognize dimension equivalence expressions (e.g.
    "tract=4") when they appear in simple contexts (surrounded only by ANDs
    and ORs).  When `InspectionVisitor` is used on its own (i.e. when
    ``check=False`` in the query code) they are unused but cheap to maintain.
    `CheckVisitor` reads them when it delegates to `InspectionVisitor` to
    learn which governor dimension values are set in a branch of the
    normal-form expression.
    """

    dataIdKey: Optional[Dimension] = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: Optional[str] = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier,
        i.e. it has a key but no value.
        """
        return not (self.dataIdKey is None or self.dataIdValue is not None)

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value that may be
        used as the value in a data ID key-value pair, i.e. it has a value
        but no key.
        """
        return not (self.dataIdKey is not None or self.dataIdValue is None)

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Merge ``other`` into ``self``, making ``self`` a summary of both
        expression tree branches.

        Parameters
        ----------
        other : `TreeSummary`
            The other summary object.
        isEq : `bool`, optional
            If `True` (`False` is default), these summaries are being combined
            via the equality operator.

        Returns
        -------
        self : `TreeSummary`
            The merged summary (updated in-place).
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": complete the pair with the other side's value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": complete the pair with the other side's key.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key/value equality, so the
        # merged branch carries no data ID information.
        self.dataIdKey = None
        self.dataIdValue = None
        return self
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """Implements TreeVisitor to identify dimension elements that need
    to be included in a query, prior to actually constructing a SQLAlchemy
    WHERE clause from it.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that represent bound parameter values, and hence need not
        represent in-database entities.

    Notes
    -----
    Every ``visit*`` method returns a `TreeSummary`.  Besides the dimensions
    and columns referenced, the summary tracks whether the visited
    sub-expression is a bare dimension identifier (``dataIdKey``), a bare
    numeric/string literal (``dataIdValue``), or an equality between the
    two (both set).  Any other construct clears those markers.
    """

    def __init__(self, universe: DimensionUniverse, bindKeys: AbstractSet[str]):
        self.universe = universe
        self.bindKeys = bindKeys

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A bare literal is a candidate value for a key=value pair.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A bare literal is a candidate value for a key=value pair.
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound parameters reference nothing in the database.
        if name in self.bindKeys:
            return TreeSummary()
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        elif constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        element, column = categorizeElementId(self.universe, name)
        if column is None:
            # The identifier names a dimension itself (no column), so it is
            # a candidate key for a key=value pair.
            assert isinstance(element, Dimension)
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                dataIdKey=element,
            )
        else:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}})
            )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node
                     ) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # Once a unary operator has been applied, the operand is no longer
        # a bare key, a bare literal, or a plain key=value equality (e.g.
        # "NOT instrument='X'" does not constrain instrument to 'X'), so
        # the data ID markers must not survive.  Referenced dimensions and
        # columns are unaffected by the operator and are kept.
        operand.dataIdKey = None
        operand.dataIdValue = None
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary,
                      node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        # Only "=" can turn a key and a literal into a key=value pair.
        return lhs.merge(rhs, isEq=(operator == "="))

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool,
                  node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Merging without isEq gathers dimensions/columns from every value
        # while clearing the key/value markers: IN is never treated as a
        # simple equality.
        for v in values:
            lhs.merge(v)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        result = TreeSummary()
        for i in items:
            result.merge(i)
        return result

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node
                          ) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Result object used by `CheckVisitor` to collect the dimensions and
    tables referenced by one inner group of AND'd together expression
    branches, and to check that group for consistency and completeness.
    """

    governors: NamedKeyDict[GovernorDimension, str] = dataclasses.field(
        default_factory=NamedKeyDict,
    )
    """Mapping containing the values of all governor dimensions that are
    equated with literal values in this expression branch.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions whose values are needed by the query, not provided
    in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Result object used by `CheckVisitor` to collect the dimensions,
    tables, and governor dimension values referenced by the expression as a
    whole.
    """

    governors: GovernorDimensionRestriction = dataclasses.field(
        default_factory=GovernorDimensionRestriction.makeFull,
    )
    """Mapping containing all values that appear in this expression for
    governor dimension relevant to the query.

    Governor dimensions that are absent from this dict are not constrained by
    this expression.
    """

    defaultsNeeded: NamedValueSet[GovernorDimension] = dataclasses.field(
        default_factory=NamedValueSet,
    )
    """Governor dimensions whose values are needed by the query, not provided
    in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that represent bound parameter values, and hence need not
        represent in-database entities.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.

    Notes
    -----
    `visitOuter` may replace the ``graph`` and ``dataId`` attributes: the
    graph is expanded to cover every dimension the expression references,
    and the data ID is extended with any default governor values the
    expression turned out to need.
    """

    def __init__(self, dataId: DataCoordinate, graph: DimensionGraph, bindKeys: AbstractSet[str],
                 defaults: DataCoordinate):
        self.dataId = dataId
        self.graph = graph
        self.bindKeys = bindKeys
        self.defaults = defaults
        self._branchVisitor = InspectionVisitor(dataId.universe, bindKeys)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add any governor values it
        # contains to the summary in advance.
        summary = InnerSummary()
        summary.governors.update((k, self.dataId[k]) for k in self.dataId.graph.governors)  # type: ignore
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; categorizeIdentifier is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4').
            # If so, and it's a governor dimension, remember that we've
            # constrained it on this branch, and make sure it's consistent
            # with any other constraints on any other branches it's AND'd with.
            if isinstance(branch.dataIdKey, GovernorDimension) and branch.dataIdValue is not None:
                governor = branch.dataIdKey
                value = summary.governors.setdefault(governor, branch.dataIdValue)
                if value != branch.dataIdValue:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if governor in self.dataId:
                        raise RuntimeError(
                            f"Conflict between expression containing {governor.name}={branch.dataIdValue!r} "
                            f"and data ID with {governor.name}={value!r}."
                        )
                    else:
                        raise RuntimeError(
                            f"Conflicting literal values for {governor.name} in expression: "
                            f"{value!r} != {branch.dataIdValue!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: NamedValueSet[GovernorDimension] = NamedValueSet()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors)
        if not governorsNeededInBranch.issubset(summary.governors.keys()):
            missing = NamedValueSet(governorsNeededInBranch - summary.governors.keys())
            if missing <= self.defaults.keys():
                # Everything that's missing can be filled in from the
                # defaults; record that so visitOuter can apply them.
                summary.defaultsNeeded.update(missing)
            else:
                raise RuntimeError(
                    f"No value(s) for governor dimensions {missing - self.defaults.keys()} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        # Iterate over branches in first pass to gather all dimensions and
        # columns referenced.  This aggregation is for the full query, so we
        # don't care whether things are joined by AND or OR (or + or -, etc).
        summary = OuterSummary()
        for branch in branches:
            summary.update(branch)
            summary.governors.update(branch.governors)
            summary.defaultsNeeded.update(branch.defaultsNeeded)
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # Second pass: now that defaultsNeeded has been accumulated over
        # *all* branches, check every branch (not just the last one visited
        # above) for explicit governor values that clash with it.
        for branch in branches:
            for governor, value in branch.governors.items():
                if governor in summary.defaultsNeeded:
                    # One branch contained an explicit value for this
                    # dimension while another needed to refer to the default
                    # data ID.  Even if these refer to the same value, that
                    # inconsistency probably indicates user error.
                    raise RuntimeError(
                        f"Governor dimension {governor.name} is explicitly "
                        f"constrained to {value} in one or more branches of "
                        "this query where expression, but is left to default "
                        f"to {self.defaults[governor]!r} in another branch.  "
                        "Defaults and explicit constraints cannot be mixed."
                    )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            assert self.dataId.hasRecords(), (
                "Should be a union of two data IDs with records, "
                "in which one only adds governor dimension values."
            )
            summary.governors.intersection_update(
                # We know the value for a governor dimension is always a str,
                # and that's all self.defaults should contain, but MyPy doesn't
                # know that.
                {dimension: self.defaults[dimension] for dimension in summary.defaultsNeeded}  # type: ignore
            )
        return summary