Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 28%
172 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-07 10:26 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = (
24 "CheckVisitor",
25 "InspectionVisitor",
26 "InspectionSummary",
27)
29import dataclasses
30from collections.abc import Mapping, Sequence, Set
31from typing import TYPE_CHECKING, Any
33from ....core import (
34 DataCoordinate,
35 DataIdValue,
36 DatasetColumnTag,
37 Dimension,
38 DimensionElement,
39 DimensionGraph,
40 DimensionKeyColumnTag,
41 DimensionRecordColumnTag,
42 DimensionUniverse,
43 NamedKeyDict,
44 NamedValueSet,
45)
46from ..._exceptions import UserExpressionError
47from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId
48from .normalForm import NormalForm, NormalFormVisitor
49from .parser import Node, TreeVisitor
51if TYPE_CHECKING: 51 ↛ 52line 51 didn't jump to line 52, because the condition on line 51 was never true
52 import astropy.time
53 from lsst.daf.relation import ColumnTag
@dataclasses.dataclass
class InspectionSummary:
    """Common base for the summary objects that `CheckVisitor` and
    `InspectionVisitor` accumulate while walking a parsed expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Dimension element tables whose columns were referenced anywhere in this
    branch (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """Whether this expression includes the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns and ingest-date flag of ``other``
        into ``self``.

        Parameters
        ----------
        other : `InspectionSummary`
            The summary whose contents are absorbed; it is not modified.
        """
        self.dimensions.update(other.dimensions)
        for table, column_names in other.columns.items():
            # Create the per-element set on first sight, then extend it.
            self.columns.setdefault(table, set()).update(column_names)
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

    def make_column_tag_set(self, dataset_type_name: str | None) -> set[ColumnTag]:
        """Convert everything recorded in this summary into `ColumnTag`
        objects.

        Parameters
        ----------
        dataset_type_name : `str` or `None`
            Name of the dataset type to assume for unqualified dataset columns,
            or `None` to reject any such identifiers.

        Returns
        -------
        tag_set : `set` [ `ColumnTag` ]
            Set of categorized column tags.

        Raises
        ------
        UserExpressionError
            Raised if the expression used the ingest date identifier but
            ``dataset_type_name`` is `None`.
        """
        tags: set[ColumnTag] = set()
        if self.hasIngestDate:
            if dataset_type_name is None:
                raise UserExpressionError(
                    "Expression requires an ingest date, which requires exactly one dataset type."
                )
            tags.add(DatasetColumnTag(dataset_type_name, "ingest_date"))
        tags.update(DimensionKeyColumnTag.generate(self.dimensions.names))
        for table, column_names in self.columns.items():
            tags.update(DimensionRecordColumnTag.generate(table.name, column_names))
        return tags
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Result object used by `InspectionVisitor` to gather information about
    a parsed expression.

    Notes
    -----
    On top of the base summary, this class tracks whether a branch is a bare
    dimension identifier, a bare literal, or the two joined by ``=``
    (e.g. "tract=4"). `InspectionVisitor` on its own does not use this
    information (it is cheap to maintain); `CheckVisitor` relies on it when it
    delegates to `InspectionVisitor` to learn which dimension values a branch
    of the normal-form expression pins down.
    """

    dataIdKey: Dimension | None = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: str | None = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier."""
        has_key = self.dataIdKey is not None
        return has_key and self.dataIdValue is None

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value that may be
        used as the value in a data ID key-value pair.
        """
        lacks_key = self.dataIdKey is None
        return lacks_key and self.dataIdValue is not None

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both expression tree
        branches.

        Parameters
        ----------
        other : `TreeSummary`
            The other summary object.
        isEq : `bool`, optional
            If `True` (`False` is default), these summaries are being combined
            via the equality operator.

        Returns
        -------
        self : `TreeSummary`
            The merged summary (updated in-place).
        """
        self.update(other)
        if isEq:
            # "<key> = <literal>": adopt the literal as this key's value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<literal> = <key>": adopt the key for this literal.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key/value pair.
        self.dataIdKey = None
        self.dataIdValue = None
        return self
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """`TreeVisitor` implementation that works out which dimension elements
    an expression refers to, before any SQLAlchemy WHERE clause is built
    from it.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe = universe
        self.bind = bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        if name in self.bind:
            bound = self.bind[name]
            if not isinstance(bound, (list, tuple, Set)):
                return TreeSummary(dataIdValue=bound)
            # A collection can show up on the right-hand side of an IN
            # operator; only a single-element collection yields a usable
            # value.
            if len(bound) == 1:
                return TreeSummary(dataIdValue=next(iter(bound)))
            return TreeSummary()

        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."

        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            # Identifier names a column of a dimension element's table.
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}}),
            )
        # Identifier names a dimension key on its own.
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        is_equality = operator == "="
        return lhs.merge(rhs, isEq=is_equality)

    def visitIsIn(self, lhs: TreeSummary, values: list[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: int | None, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for one inner group of expression
    branches that are AND'd together, recording the dimensions and tables
    they reference so the group can be checked for consistency and
    completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of every dimension that this expression branch equates with a
    literal value, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Governor dimensions whose values are needed by the query, not provided
    in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for the expression as a whole,
    recording the dimensions, tables, and governor dimension values it
    references.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values that appear in this expression for dimensions relevant to
    the query, keyed by dimension name.

    Dimensions that are absent from this dict are not constrained by this
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Governor dimensions whose values are needed by the query, not provided
    in the query itself, and present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.
    allow_orphans : `bool`, optional
        If `True`, permit expressions to refer to dimensions without providing
        a value for their governor dimensions (e.g. referring to a visit
        without an instrument).  Should be left to default to `False` in
        essentially all new code.

    Notes
    -----
    `visitOuter` may replace the ``graph`` and ``dataId`` attributes of this
    visitor: ``graph`` grows to include any extra dimensions the expression
    references, and ``dataId`` is extended with any default governor values
    the expression turned out to need.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
        allow_orphans: bool = False,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)
        self._allow_orphans = allow_orphans

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add the values it contains to
        # the summary in advance.
        summary = InnerSummary()
        if self.dataId.hasFull():
            known_names = self.dataId.graph.names
        else:
            known_names = self.dataId.graph.required.names
        summary.dimension_values.update((name, self.dataId[name]) for name in known_names)
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; the identifier categorization done by the branch
            # visitor is expected to treat e.g. 'detector.id=4' the same as
            # 'detector=4').  If so, remember that we've constrained it on
            # this branch to later make sure it's consistent with any other
            # constraints on any other branches it's AND'd with.
            if branch.dataIdKey is None or branch.dataIdValue is None:
                continue
            # Literals arrive as parsed text; coerce to the key's Python type
            # so comparisons (and messages) use the real value.
            new_value = branch.dataIdKey.primaryKey.getPythonType()(branch.dataIdValue)
            value = summary.dimension_values.setdefault(branch.dataIdKey.name, new_value)
            if value != new_value:
                # Expression says something like "instrument='HSC' AND
                # instrument='DECam'", or data ID has one and expression
                # has the other.
                if branch.dataIdKey in self.dataId:
                    raise UserExpressionError(
                        f"Conflict between expression containing {branch.dataIdKey.name}={new_value!r} "
                        f"and data ID with {branch.dataIdKey.name}={value!r}."
                    )
                else:
                    # Report the coerced value on both sides so the two
                    # operands are shown with the same type.
                    raise UserExpressionError(
                        f"Conflicting literal values for {branch.dataIdKey.name} in expression: "
                        f"{value!r} != {new_value!r}."
                    )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: set[str] = set()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors.names)
        missing = governorsNeededInBranch - summary.dimension_values.keys()
        if missing:
            if missing <= self.defaults.names:
                # Every missing governor can be filled in from the defaults.
                summary.defaultsNeeded.update(missing)
            elif not self._allow_orphans:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # Iterate over branches in first pass to gather all dimensions and
            # columns referenced.  This aggregation is for the full query, so
            # we don't care whether things are joined by AND or OR (or + or -,
            # etc).  Also gather the set of dimensions directly constrained or
            # pulled from defaults in _all_ branches.  This is the set we will
            # be able to bound overall; any dimensions not referenced by even
            # one branch could be unbounded.
            dimensions_in_all_branches = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                dimensions_in_all_branches.intersection_update(branch.dimension_values)
            # Go back through and set up the dimension bounds: for each
            # dimension constrained in every branch, collect the value each
            # branch assigns to it.
            summary.dimension_constraints.update(
                {
                    dimension: {branch.dimension_values[dimension] for branch in branches}
                    for dimension in dimensions_in_all_branches
                }
            )
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        for dimension, values in summary.dimension_constraints.items():
            if dimension in summary.defaultsNeeded:
                # One branch contained an explicit value for this dimension
                # while another needed to refer to the default data ID.
                # Even if these refer to the same value, that inconsistency
                # probably indicates user error.
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch.  "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            for dimension in summary.defaultsNeeded:
                summary.dimension_constraints[dimension] = {self.defaults[dimension]}

        return summary