Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py : 31%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CheckVisitor",
25 "InspectionVisitor",
26 "InspectionSummary",
27)
29import dataclasses
30from typing import (
31 AbstractSet,
32 List,
33 Optional,
34 Sequence,
35 Set,
36 Tuple,
37 TYPE_CHECKING,
38 Union,
39)
41from ....core import (
42 DataCoordinate,
43 DimensionUniverse,
44 Dimension,
45 DimensionElement,
46 DimensionGraph,
47 GovernorDimension,
48 NamedKeyDict,
49 NamedValueSet,
50)
51from ...wildcards import EllipsisType, Ellipsis
52from .parser import Node, TreeVisitor
53from .normalForm import NormalForm, NormalFormVisitor
54from .categorize import categorizeElementId, categorizeConstant, ExpressionConstant
56if TYPE_CHECKING: 56 ↛ 57line 56 didn't jump to line 57, because the condition on line 56 was never true
57 import astropy.time
@dataclasses.dataclass
class InspectionSummary:
    """Common base for the result objects that `CheckVisitor` and
    `InspectionVisitor` use to accumulate information about a parsed
    expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies are referenced anywhere
    in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, Set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Dimension element tables whose columns are referenced anywhere in this
    branch, keyed by element (`NamedKeyDict` [ `DimensionElement`,
    `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """Whether this expression includes the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns and ingest-date flag of ``other``
        into ``self``, in place.

        Parameters
        ----------
        other : `InspectionSummary`
            The summary whose contents should be added to this one.
        """
        self.dimensions.update(other.dimensions)
        # Column names are unioned per element, creating the entry on first
        # sight of an element.
        for element, names in other.columns.items():
            known = self.columns.setdefault(element, set())
            known.update(names)
        # The flag is sticky: once set it is never cleared by a merge.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Result object produced by `InspectionVisitor` while walking a parsed
    expression.

    Notes
    -----
    On top of `InspectionSummary`, this class tracks whether a branch is a
    dimension equivalence expression (e.g. "tract=4") appearing in a simple
    context (surrounded only by ANDs and ORs). When `InspectionVisitor` is
    used on its own (i.e. when ``check=False`` in the query code) these
    attributes are unused, but they are cheap to maintain. `CheckVisitor`
    relies on them when it delegates to `InspectionVisitor` to learn which
    governor dimension values are set in a branch of the normal-form
    expression.
    """

    dataIdKey: Optional[Dimension] = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: Optional[str] = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Report whether this branch is *only* a data ID key identifier,
        with no literal value attached yet.
        """
        return not (self.dataIdKey is None or self.dataIdValue is not None)

    def isDataIdValueOnly(self) -> bool:
        """Report whether this branch is *only* a literal value that could
        serve as the value in a data ID key-value pair.
        """
        return not (self.dataIdKey is not None or self.dataIdValue is None)

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` into ``self`` so that ``self`` summarizes both
        expression tree branches.

        Parameters
        ----------
        other : `TreeSummary`
            The summary of the other branch.
        isEq : `bool`, optional
            `True` when the two branches are being joined by the equality
            operator; `False` (the default) for any other combination.

        Returns
        -------
        self : `TreeSummary`
            This summary, updated in place.
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": complete the pair with the other side's
            # value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": complete the pair with the other side's key.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key/value equivalence.
        self.dataIdKey = None
        self.dataIdValue = None
        return self
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """Implements TreeVisitor to identify dimension elements that need
    to be included in a query, prior to actually constructing a SQLAlchemy
    WHERE clause from it.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that represent bound parameter values, and hence need not
        represent in-database entities.

    Notes
    -----
    Every ``visit*`` method returns a `TreeSummary`. Several of them return
    (and mutate) one of the summaries they were passed rather than creating a
    new object.
    """

    def __init__(self, universe: DimensionUniverse, bindKeys: AbstractSet[str]):
        self.universe = universe
        self.bindKeys = bindKeys

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A bare literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A bare literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound parameters contribute nothing to the summary.
        if name in self.bindKeys:
            return TreeSummary()
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        elif constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        element, column = categorizeElementId(self.universe, name)
        if column is None:
            # No column means the identifier names a dimension itself, which
            # makes it a candidate data ID key.
            assert isinstance(element, Dimension)
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                dataIdKey=element,
            )
        else:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}})
            )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # The data ID key/value attributes are only meaningful for terms that
        # are surrounded solely by ANDs and ORs. Once a unary operator has
        # been applied the operand is no longer a bare key, a bare literal, or
        # a plain "key=value" equivalence (e.g. "NOT instrument='HSC'" does
        # not pin instrument to 'HSC'), so those attributes must not leak
        # through. Dimensions and columns referenced are unaffected.
        operand.dataIdKey = None
        operand.dataIdValue = None
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary,
                      node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        # Only "=" can turn a key and a value into a key/value pair.
        return lhs.merge(rhs, isEq=(operator == "="))

    def visitIsIn(self, lhs: TreeSummary, values: List[TreeSummary], not_in: bool,
                  node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        for v in values:
            lhs.merge(v)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: Tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        result = TreeSummary()
        for i in items:
            result.merge(i)
        return result

    def visitRangeLiteral(self, start: int, stop: int, stride: Optional[int], node: Node
                          ) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Result object that `CheckVisitor` fills in for one inner group of
    expression branches that are AND'd together, recording the dimensions and
    tables they reference so the group can be checked for consistency and
    completeness.
    """

    governors: NamedKeyDict[GovernorDimension, str] = dataclasses.field(
        default_factory=NamedKeyDict
    )
    """Values of all governor dimensions that are equated with literal values
    in this expression branch, keyed by governor dimension.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Result object that `CheckVisitor` fills in for the expression as a
    whole, recording referenced dimensions, tables, and governor dimension
    values.
    """

    governors: NamedKeyDict[GovernorDimension, Union[Set[str], EllipsisType]] = dataclasses.field(
        default_factory=NamedKeyDict
    )
    """All values that appear in this expression for each governor dimension
    relevant to the query.

    A mapping value is either a `set` of `str`, meaning only those values are
    permitted for the dimension, or ``...``, meaning the values for that
    governor are not fully constrained by this expression.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """A `NormalFormVisitor` that works out which dimensions and tables a
    query must include, checking the expression for completeness and
    consistency along the way.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bindKeys : `collections.abc.Set` [ `str` ]
        Identifiers that represent bound parameter values, and hence need not
        represent in-database entities.

    Notes
    -----
    `visitOuter` may replace the ``graph`` attribute with an expanded
    `DimensionGraph` when the expression references dimensions that were not
    in the graph passed at construction.
    """

    def __init__(self, dataId: DataCoordinate, graph: DimensionGraph, bindKeys: AbstractSet[str]):
        self.dataId = dataId
        self.graph = graph
        self.bindKeys = bindKeys
        # Individual branches are summarized by a plain InspectionVisitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bindKeys)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the inner branches are AND'd together,
        # so every branch handled below constrains the others and they must
        # all agree. Because the outer branches are OR'd together, a governor
        # value (e.g. the instrument or skymap needed to interpret a visit or
        # tract number) that is absent from this group really is absent: no
        # other inner group can constrain it.
        assert form is NormalForm.DISJUNCTIVE
        # The one exception is the data ID given at construction, which is
        # AND'd to the entire expression later and so applies to every
        # group. Seed the summary with its governor values up front.
        summary = InnerSummary()
        summary.governors.update((dim, self.dataId[dim]) for dim in self.dataId.graph.governors)  # type: ignore
        for branch in branches:
            # Record every dimension and column seen in this branch,
            # regardless of the context it appears in.
            summary.update(branch)
            # Only branches of the form '<dimension>=<value>' (or an
            # equivalent that resolves to the same dimension key) on a
            # governor dimension need the consistency check below.
            if not isinstance(branch.dataIdKey, GovernorDimension) or branch.dataIdValue is None:
                continue
            governor = branch.dataIdKey
            # Remember the constraint, or fetch the one already recorded by
            # the data ID or by an earlier branch it is AND'd with.
            value = summary.governors.setdefault(governor, branch.dataIdValue)
            if value != branch.dataIdValue:
                # The expression says something like "instrument='HSC' AND
                # instrument='DECam'", or the data ID has one value and the
                # expression has the other.
                if governor in self.dataId:
                    raise RuntimeError(
                        f"Conflict between expression containing {governor.name}={branch.dataIdValue!r} "
                        f"and data ID with {governor.name}={value!r}."
                    )
                raise RuntimeError(
                    f"Conflicting literal values for {governor.name} in expression: "
                    f"{value!r} != {branch.dataIdValue!r}."
                )
        # With the constrained governors known, look for missing ones, i.e.
        # an expression containing something like "visit=X" that never says
        # which instrument the visit belongs to. This catches many accidents
        # at the price of rejecting possibly-legitimate multi-instrument
        # queries such as "visit.seeing < 0.7"; users are expected to name
        # the instruments explicitly instead. An expression like
        #
        #     (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # becomes, in disjunctive normal form,
        #
        #     (instrument='HSC' AND visit.seeing < 0.7)
        #     OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # so each instrument lands in its own outer branch and this check
        # still works (which is why the expression is converted to normal
        # form in the first place).
        needed: NamedValueSet[GovernorDimension] = NamedValueSet()
        for dimension in summary.dimensions:
            needed.update(dimension.graph.governors)
        if needed.issubset(summary.governors.keys()):
            return summary
        missing = NamedValueSet(needed - summary.governors.keys())
        raise RuntimeError(
            f"No value(s) for governor dimensions {missing} in expression that references dependent "
            "dimensions. 'Governor' dimensions must always be specified completely in either the "
            "query expression (via simple 'name=<value>' terms, not 'IN' terms) or in a data ID passed "
            "to the query method."
        )

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        # First pass: collect every dimension and column referenced. This
        # aggregate describes the full query, so how the pieces are joined
        # (AND, OR, +, -, etc.) is irrelevant here.
        summary = OuterSummary()
        for branch in branches:
            summary.update(branch)
        # If the expression references dimensions outside the original query
        # graph, widen the graph to include them. This is what lets a user
        # say "tract=X" on the command line (strictly "skymap=Y AND tract=X";
        # visitInner enforces that) when running a task like ISR that has
        # nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # Start every governor this query involves with an empty value set.
        summary.governors.update((dim, set()) for dim in self.graph.governors)
        # Second pass: a governor that any branch leaves unconstrained is
        # unconstrained overall (the branches are OR'd), so its entry becomes
        # ``...``. Otherwise the entry accumulates the values the governor
        # may take across all branches.
        for branch in branches:
            for governor in summary.governors:
                currentValues = summary.governors[governor]
                if currentValues is Ellipsis:
                    # Already known to be unconstrained; nothing to add.
                    continue
                branchValue = branch.governors.get(governor)
                if branchValue is None:
                    # This governor is unconstrained in this branch, so
                    # no other branch can constrain it.
                    summary.governors[governor] = Ellipsis
                else:
                    currentValues.add(branchValue)
        return summary