Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 29%
173 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-19 03:44 -0700
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-19 03:44 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = (
30 "CheckVisitor",
31 "InspectionVisitor",
32 "InspectionSummary",
33)
35import dataclasses
36from collections.abc import Mapping, Sequence, Set
37from typing import TYPE_CHECKING, Any
39from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag
40from ....dimensions import DataCoordinate, DataIdValue, Dimension, DimensionGroup, DimensionUniverse
41from ..._exceptions import UserExpressionError
42from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId
43from .normalForm import NormalForm, NormalFormVisitor
44from .parser import Node, TreeVisitor
46if TYPE_CHECKING:
47 import astropy.time
48 from lsst.daf.relation import ColumnTag
@dataclasses.dataclass
class InspectionSummary:
    """Common base for the result objects that `CheckVisitor` and
    `InspectionVisitor` use to record what a parsed expression refers to.
    """

    dimensions: set[str] = dataclasses.field(default_factory=set)
    """Names of every dimension whose primary key or dependencies are
    referenced somewhere in this branch (`set` [ `str` ]).
    """

    columns: dict[str, set[str]] = dataclasses.field(default_factory=dict)
    """Referenced column names, keyed by the name of the dimension element
    table they belong to (`dict` [ `str`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """`True` if the expression uses the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns and ingest-date flag of ``other`` into
        ``self``.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents are added to this one; it is not modified.
        """
        self.dimensions.update(other.dimensions)
        for table_name, incoming in other.columns.items():
            # Always accumulate into a set owned by ``self`` so the two
            # summaries never end up sharing a mutable container.
            if table_name not in self.columns:
                self.columns[table_name] = set()
            self.columns[table_name].update(incoming)
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

    def make_column_tag_set(self, dataset_type_name: str | None) -> set[ColumnTag]:
        """Convert everything recorded in this summary into `ColumnTag`
        objects.

        Parameters
        ----------
        dataset_type_name : `str` or `None`
            Dataset type that unqualified dataset columns are assumed to
            belong to, or `None` if such identifiers must be rejected.

        Returns
        -------
        tag_set : `set` [ `ColumnTag` ]
            Set of categorized column tags.

        Raises
        ------
        UserExpressionError
            Raised if the expression uses the ingest date but no dataset type
            name was provided.
        """
        if self.hasIngestDate and dataset_type_name is None:
            raise UserExpressionError(
                "Expression requires an ingest date, which requires exactly one dataset type."
            )
        tags: set[ColumnTag] = set()
        if self.hasIngestDate:
            tags.add(DatasetColumnTag(dataset_type_name, "ingest_date"))
        tags.update(DimensionKeyColumnTag.generate(self.dimensions))
        for table_name, column_names in self.columns.items():
            tags.update(DimensionRecordColumnTag.generate(table_name, column_names))
        return tags
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Result object that `InspectionVisitor` uses to collect information
    about a parsed expression.

    Notes
    -----
    On top of the base summary, this class tracks enough state to recognize
    dimension equivalence expressions (e.g. "tract=4") when they appear in
    simple contexts (surrounded only by ANDs and ORs).  When
    `InspectionVisitor` is used on its own (i.e. when ``check=False`` in the
    query code) these attributes are unused, but they are cheap to maintain.
    `CheckVisitor` relies on them when it delegates to `InspectionVisitor` to
    learn which governor dimension values are set in a branch of the
    normal-form expression.
    """

    dataIdKey: Dimension | None = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: str | None = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier."""
        has_key = self.dataIdKey is not None
        has_value = self.dataIdValue is not None
        return has_key and not has_value

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value that could
        serve as the value of a data ID key-value pair.
        """
        has_key = self.dataIdKey is not None
        has_value = self.dataIdValue is not None
        return has_value and not has_key

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` into ``self`` so that ``self`` summarizes both
        expression tree branches.

        Parameters
        ----------
        other : `TreeSummary`
            The other summary object.
        isEq : `bool`, optional
            Pass `True` (the default is `False`) when the two branches are
            being combined by the equality operator.

        Returns
        -------
        self : `TreeSummary`
            The merged summary (updated in-place).
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": keep our key and adopt the other's value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": keep our value and adopt the other's key.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key/value equivalence, so the
        # merged branch carries neither a key nor a value.
        self.dataIdKey = None
        self.dataIdValue = None
        return self
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """A `TreeVisitor` that works out which dimension elements a query must
    include, before a SQLAlchemy WHERE clause is actually built from the
    expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe = universe
        self.bind = bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound identifiers stand in for literal values and take precedence
        # over every other interpretation of the name.
        if name in self.bind:
            bound = self.bind[name]
            if not isinstance(bound, list | tuple | Set):
                return TreeSummary(dataIdValue=bound)
            # A collection can appear on the right-hand side of an IN
            # operator; only a single-element collection pins down a value.
            if len(bound) != 1:
                return TreeSummary()
            return TreeSummary(dataIdValue=next(iter(bound)))

        # Special constants that are not dimension identifiers.
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."

        # Anything else names a dimension element, optionally with a column.
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            return TreeSummary(dimensions=set(element.minimal_group.names), columns={element.name: {column}})
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=set(element.minimal_group.names),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # NOTE(review): the operator is ignored, so the operand's summary
        # (including any key or literal value it carries) passes through
        # unchanged - confirm this is intended for every unary operator.
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        is_equality = operator == "="
        return lhs.merge(rhs, isEq=is_equality)

    def visitIsIn(self, lhs: TreeSummary, values: list[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: int | None, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Result object that `CheckVisitor` builds for one inner group of AND'd
    together expression branches, collecting the dimensions and tables they
    reference so the group can be checked for consistency and completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Value of each dimension that this expression branch equates with a
    literal, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Governor dimensions that the query needs a value for, that the query
    itself does not provide, and that are present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Result object that `CheckVisitor` builds for the expression as a
    whole, collecting referenced dimensions, tables, and governor dimension
    values.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """Every value that appears in this expression for each dimension
    relevant to the query, keyed by dimension name.

    A dimension that has no entry in this dict is not constrained by this
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Governor dimensions that the query needs a value for, that the query
    itself does not provide, and that are present in the default data ID.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """A `NormalFormVisitor` that determines which dimensions and tables a
    query must include, checking the expression for completeness and
    consistency along the way.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    dimensions : `DimensionGroup`
        The dimensions the query would include in the absence of this
        expression.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.
    allow_orphans : `bool`, optional
        If `True`, permit expressions to refer to dimensions without providing
        a value for their governor dimensions (e.g. referring to a visit
        without an instrument).  Should be left to default to `False` in
        essentially all new code.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        dimensions: DimensionGroup,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
        allow_orphans: bool = False,
    ):
        self.dataId = dataId
        self.dimensions = dimensions
        self.defaults = defaults
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)
        self._allow_orphans = allow_orphans

    @property
    def universe(self) -> DimensionUniverse:
        """Object that defines all dimensions."""
        return self.dimensions.universe

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the inner branches are AND'd together,
        # so every branch visited below constrains all of the others and they
        # must be mutually consistent.  The outer branches are OR'd together,
        # which also means that anything missing from this group (such as the
        # instrument or skymap governor value needed to interpret a visit or
        # tract number) really is missing: no other inner group can constrain
        # it.
        assert form is NormalForm.DISJUNCTIVE
        summary = InnerSummary()
        # The one exception is the data ID passed at construction.  It is
        # AND'd to the entire expression later and so applies to every
        # branch; seed the summary with its values up front.
        summary.dimension_values.update(self.dataId.mapping)

        for branch in branches:
            # Record every dimension and column seen anywhere in the
            # expression, regardless of context.
            summary.update(branch)
            # Only branches of the form '<dimension>=<value>' (or equivalent;
            # the original notes that identifier categorization treats e.g.
            # 'detector.id=4' like 'detector=4') add a constraint to check.
            if branch.dataIdKey is None or branch.dataIdValue is None:
                continue
            key_name = branch.dataIdKey.name
            # Coerce the literal to the Python type of the dimension's
            # primary key before comparing it with anything else.
            new_value = branch.dataIdKey.primaryKey.getPythonType()(branch.dataIdValue)
            # Remember the constraint so later branches it's AND'd with can
            # be checked against it; an earlier value wins the setdefault.
            value = summary.dimension_values.setdefault(key_name, new_value)
            if value != new_value:
                # Expression says something like "instrument='HSC' AND
                # instrument='DECam'", or data ID has one and expression has
                # the other.
                if key_name in self.dataId:
                    raise UserExpressionError(
                        f"Conflict between expression containing {key_name}={new_value!r} "
                        f"and data ID with {key_name}={value!r}."
                    )
                raise UserExpressionError(
                    f"Conflicting literal values for {key_name} in expression: "
                    f"{value!r} != {branch.dataIdValue!r}."
                )

        # With the constrained governor values known, look for missing ones,
        # i.e. an expression containing something like "visit=X" that never
        # says which instrument the visit belongs to.  This catches many
        # accidents, but it also rejects possibly-legitimate multi-instrument
        # queries like "visit.seeing < 0.7".  Asking the user to name the
        # instruments explicitly is a reasonable workaround.  Note that
        #
        #     (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # becomes, in disjunctive normal form,
        #
        #     (instrument='HSC' AND visit.seeing < 0.7)
        #     OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # so each instrument gets its own outer branch and the logic here
        # still works (which is why we convert to normal form at all).
        governors_needed: set[str] = {
            governor
            for dimension_name in summary.dimensions
            for governor in self.universe.dimensions[dimension_name].minimal_group.governors
        }
        missing = governors_needed - summary.dimension_values.keys()
        if missing:
            if missing <= self.defaults.dimensions.required:
                # The default data ID can supply everything that is missing.
                summary.defaultsNeeded.update(missing)
            elif not self._allow_orphans:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # First pass: gather every dimension and column referenced.  This
            # aggregate describes the full query, so it does not matter
            # whether things are joined by AND or OR (or + or -, etc).  At
            # the same time, narrow down the dimensions that are directly
            # constrained or pulled from defaults in _all_ branches.  Those
            # are the ones that can be bounded overall; a dimension missing
            # from even one branch could be unbounded.
            bounded_everywhere = set(self.universe.dimensions.names)
            for inner in branches:
                summary.update(inner)
                summary.defaultsNeeded.update(inner.defaultsNeeded)
                bounded_everywhere.intersection_update(inner.dimension_values)
            # Second pass: collect the value each branch assigns to every
            # dimension that is bounded in all of them.
            for dimension_name in bounded_everywhere:
                summary.dimension_constraints[dimension_name] = {
                    inner.dimension_values[dimension_name] for inner in branches
                }

        # If the expression references dimensions that were not in the
        # original query graph, extend the graph to include them.  This is
        # what lets a user say "tract=X" on the command line (well, "skymap=Y
        # AND tract=X" - logic in visitInner checks for that) when running a
        # task like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.dimensions.names):
            self.dimensions = self.universe.conform(summary.dimensions | self.dimensions.names)

        for constrained_name, values in summary.dimension_constraints.items():
            if constrained_name in summary.defaultsNeeded:
                # One branch contained an explicit value for this dimension
                # while another needed to refer to the default data ID.  Even
                # if these refer to the same value, that inconsistency
                # probably indicates user error.
                raise UserExpressionError(
                    f"Governor dimension {constrained_name} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[constrained_name]!r} in another branch. "
                    "Defaults and explicit constraints cannot be mixed."
                )

        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaults_group = self.universe.conform(summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaults_group))
            for defaulted_name in summary.defaultsNeeded:
                summary.dimension_constraints[defaulted_name] = {self.defaults[defaulted_name]}

        return summary