Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 28%
169 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = (
30 "CheckVisitor",
31 "InspectionVisitor",
32 "InspectionSummary",
33)
35import dataclasses
36from collections.abc import Mapping, Sequence, Set
37from typing import TYPE_CHECKING, Any
39from ....core import (
40 DataCoordinate,
41 DataIdValue,
42 DatasetColumnTag,
43 Dimension,
44 DimensionElement,
45 DimensionGraph,
46 DimensionKeyColumnTag,
47 DimensionRecordColumnTag,
48 DimensionUniverse,
49 NamedKeyDict,
50 NamedValueSet,
51)
52from ..._exceptions import UserExpressionError
53from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId
54from .normalForm import NormalForm, NormalFormVisitor
55from .parser import Node, TreeVisitor
57if TYPE_CHECKING:
58 import astropy.time
59 from lsst.daf.relation import ColumnTag
@dataclasses.dataclass
class InspectionSummary:
    """Common base for the summary objects that `CheckVisitor` and
    `InspectionVisitor` use to record what a parsed expression references.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies are referenced somewhere
    in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Column names referenced somewhere in this branch, grouped by the
    dimension element table that holds them
    (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """Whether the expression uses the special dataset ingest date identifier
    (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns, and ingest-date flag of ``other``
        into ``self``.

        Parameters
        ----------
        other : `InspectionSummary`
            Summary whose contents are merged into this one.
        """
        self.dimensions.update(other.dimensions)
        for element, column_names in other.columns.items():
            known_columns = self.columns.setdefault(element, set())
            known_columns.update(column_names)
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

    def make_column_tag_set(self, dataset_type_name: str | None) -> set[ColumnTag]:
        """Convert everything captured by this summary into `ColumnTag`
        objects.

        Parameters
        ----------
        dataset_type_name : `str` or `None`
            Dataset type name to attach to unqualified dataset columns, or
            `None` if such identifiers should be rejected.

        Returns
        -------
        tag_set : `set` [ `ColumnTag` ]
            Column tags for the ingest date (if used), the dimension keys,
            and the dimension record columns.

        Raises
        ------
        UserExpressionError
            Raised if the expression uses the ingest date but
            ``dataset_type_name`` is `None`.
        """
        tags: set[ColumnTag] = set()
        if self.hasIngestDate:
            if dataset_type_name is None:
                raise UserExpressionError(
                    "Expression requires an ingest date, which requires exactly one dataset type."
                )
            tags.add(DatasetColumnTag(dataset_type_name, "ingest_date"))
        tags.update(DimensionKeyColumnTag.generate(self.dimensions.names))
        for element, column_names in self.columns.items():
            tags.update(DimensionRecordColumnTag.generate(element.name, column_names))
        return tags
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Summary produced by `InspectionVisitor` for one parsed expression
    (sub)tree.

    Notes
    -----
    On top of the base summary, this class tracks enough state to recognize
    dimension equivalence expressions (e.g. "tract=4") when they appear in
    simple contexts (surrounded only by ANDs and ORs). When
    `InspectionVisitor` is used on its own (i.e. when ``check=False`` in the
    query code) that state is unused but cheap to maintain. `CheckVisitor`
    relies on it, via `InspectionVisitor`, to learn which governor dimension
    values a branch of the normal-form expression sets.
    """

    dataIdKey: Dimension | None = None
    """A `Dimension` that is fully identified by a literal value in this
    branch (when `dataIdValue` is not `None`), or that may turn out to be
    (when `dataIdValue` is `None`).
    """

    dataIdValue: str | None = None
    """A literal value that constrains a dimension in this branch (when
    `dataIdKey` is not `None`), or that may turn out to (when `dataIdKey` is
    `None`).

    Literals from the expression text arrive as `str` and may need to be
    coerced (e.g. to `int`) to reflect the user's intent; values injected by
    `InspectionVisitor` from its ``bind`` mapping are stored unchanged.
    """

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both expression tree
        branches.

        Parameters
        ----------
        other : `TreeSummary`
            Summary of the other branch.
        isEq : `bool`, optional
            `True` when the two branches are being joined by the equality
            operator; defaults to `False`.

        Returns
        -------
        self : `TreeSummary`
            This object, modified in place.
        """
        self.update(other)
        if isEq:
            # Only "<key> = <value>" or "<value> = <key>" keeps a data ID
            # key/value pair alive.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple data ID constraint.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

    def isDataIdKeyOnly(self) -> bool:
        """Return `True` if this branch holds a data ID key identifier and
        no value.
        """
        has_key = self.dataIdKey is not None
        return has_key and self.dataIdValue is None

    def isDataIdValueOnly(self) -> bool:
        """Return `True` if this branch holds a literal usable as the value
        of a data ID key-value pair, and no key.
        """
        has_value = self.dataIdValue is not None
        return self.dataIdKey is None and has_value
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """A `TreeVisitor` that works out which dimension elements a query must
    include, before any SQLAlchemy WHERE clause is built from the expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Literal values to inject into the query expression, keyed by the
        identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.universe = universe
        self.bind = bind

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        if name in self.bind:
            bound = self.bind[name]
            if not isinstance(bound, (list, tuple, Set)):
                return TreeSummary(dataIdValue=bound)
            # A collection can show up on the rhs of an IN operator; when it
            # holds exactly one element, use that element as the value.
            if len(bound) == 1:
                return TreeSummary(dataIdValue=next(iter(bound)))
            return TreeSummary()
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            # A reference to a specific column of a dimension element table.
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions),
                columns=NamedKeyDict({element: {column}}),
            )
        # No column: the identifier names the dimension key itself.
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        is_equality = operator == "="
        return lhs.merge(rhs, isEq=is_equality)

    def visitIsIn(self, lhs: TreeSummary, values: list[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        for value_summary in values:
            lhs.merge(value_summary)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        return expression

    def visitTupleNode(self, items: tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        combined = TreeSummary()
        for item_summary in items:
            combined.merge(item_summary)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: int | None, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for one inner group of expression
    branches that are AND'd together, recording the dimensions and tables
    they reference so they can be checked for consistency and completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of every dimension that this expression branch equates with a
    literal value, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that are present in the default data ID.

    These should be added to the query's data ID when the WHERE clause is
    finalized.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary built by `CheckVisitor` for the expression as a whole,
    recording the dimensions, tables, and governor dimension values it
    references.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values this expression gives for dimensions relevant to the query,
    keyed by dimension name.

    A dimension that has no entry here is not constrained by the expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that are present in the default data ID.

    These should be added to the query's data ID when the WHERE clause is
    finalized.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """An implementation of `NormalFormVisitor` that identifies the dimensions
    and tables that need to be included in a query while performing some checks
    for completeness and consistency.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing defaults for governor dimensions.
    allow_orphans : `bool`, optional
        If `True`, permit expressions to refer to dimensions without providing
        a value for their governor dimensions (e.g. referring to a visit
        without an instrument).  Should be left to default to `False` in
        essentially all new code.

    Notes
    -----
    `visitOuter` may replace the ``graph`` and ``dataId`` attributes: the
    graph is expanded to cover every dimension the expression references,
    and the data ID is extended with any default governor values the
    expression was found to need.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
        allow_orphans: bool = False,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        # Individual branches are summarized by a plain InspectionVisitor.
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)
        self._allow_orphans = allow_orphans

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means inner branches are AND'd together...
        assert form is NormalForm.DISJUNCTIVE
        # ...and that means each branch we iterate over together below
        # constrains the others, and they all need to be consistent.  Moreover,
        # because outer branches are OR'd together, we also know that if
        # something is missing from one of these branches (like a governor
        # dimension value like the instrument or skymap needed to interpret a
        # visit or tract number), it really is missing, because there's no way
        # some other inner branch can constrain it.
        #
        # That is, except the data ID the visitor was passed at construction;
        # that's AND'd to the entire expression later, and thus it affects all
        # branches.  To take care of that, we add any governor values it
        # contains to the summary in advance.
        summary = InnerSummary()
        summary.dimension_values.update(
            (k, self.dataId[k])
            for k in (self.dataId.graph.names if self.dataId.hasFull() else self.dataId.graph.required.names)
        )
        # Finally, we loop over those branches.
        for branch in branches:
            # Update the sets of dimensions and columns we've seen anywhere in
            # the expression in any context.
            summary.update(branch)
            # Test whether this branch has a form like '<dimension>=<value>'
            # (or equivalent; categorizeElementId is smart enough to see that
            # e.g. 'detector.id=4' is equivalent to 'detector=4').  If so,
            # remember that we've constrained it on this branch to later make
            # sure it's consistent with any other constraints on any other
            # branches it's AND'd with.
            if branch.dataIdKey is not None and branch.dataIdValue is not None:
                # Coerce the raw literal to the Python type of the dimension's
                # primary key so that comparisons below are like-for-like.
                new_value = branch.dataIdKey.primaryKey.getPythonType()(branch.dataIdValue)
                value = summary.dimension_values.setdefault(branch.dataIdKey.name, new_value)
                if value != new_value:
                    # Expression says something like "instrument='HSC' AND
                    # instrument='DECam'", or data ID has one and expression
                    # has the other.
                    if branch.dataIdKey in self.dataId:
                        raise UserExpressionError(
                            f"Conflict between expression containing {branch.dataIdKey.name}={new_value!r} "
                            f"and data ID with {branch.dataIdKey.name}={value!r}."
                        )
                    else:
                        # Report the coerced value: it is the one that was
                        # actually compared, and it keeps both sides of the
                        # message in the same type.
                        raise UserExpressionError(
                            f"Conflicting literal values for {branch.dataIdKey.name} in expression: "
                            f"{value!r} != {new_value!r}."
                        )
        # Now that we know which governor values we've constrained, see if any
        # are missing, i.e. if the expression contains something like "visit=X"
        # without saying what instrument that visit corresponds to.  This rules
        # out a lot of accidents, but it also rules out possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7".  But it's not
        # unreasonable to ask the user to be explicit about the instruments
        # they want to consider to work around this restriction, and that's
        # what we do.  Note that if someone does write an expression like
        #
        #  (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # then in disjunctive normal form that will become
        #
        #  (instrument='HSC' AND visit.seeing < 0.7)
        #    OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # i.e. each instrument will get its own outer branch and the logic here
        # still works (that sort of thing is why we convert to normal form,
        # after all).
        governorsNeededInBranch: set[str] = set()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors.names)
        missing = governorsNeededInBranch - summary.dimension_values.keys()
        if missing:
            if missing <= self.defaults.names:
                # Every missing governor can be filled in from the defaults.
                summary.defaultsNeeded.update(missing)
            elif not self._allow_orphans:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # Disjunctive normal form means outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # Iterate over branches in first pass to gather all dimensions and
            # columns referenced.  This aggregation is for the full query, so
            # we don't care whether things are joined by AND or OR (or + or -,
            # etc).  Also gather the set of dimensions directly constrained or
            # pulled from defaults in _all_ branches.  This is the set we will
            # be able to bound overall; any dimensions not referenced by even
            # one branch could be unbounded.
            dimensions_in_all_branches = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                dimensions_in_all_branches.intersection_update(branch.dimension_values)
            # Go back through and set up the dimension bounds: for each
            # dimension constrained in every branch, collect the values it
            # takes across the branches.
            summary.dimension_constraints.update(
                {
                    dimension: {branch.dimension_values[dimension] for branch in branches}
                    for dimension in dimensions_in_all_branches
                }
            )
        # See if we've referenced any dimensions that weren't in the original
        # query graph; if so, we update that to include them.  This is what
        # lets a user say "tract=X" on the command line (well, "skymap=Y AND
        # tract=X" - logic in visitInner checks for that) when running a task
        # like ISR that has nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        # NOTE(review): as long as every InnerSummary comes from visitInner,
        # a name in defaultsNeeded is absent from that branch's
        # dimension_values and so cannot be a key of dimension_constraints;
        # this check may therefore never fire - confirm.
        for dimension, values in summary.dimension_constraints.items():
            if dimension in summary.defaultsNeeded:
                # One branch contained an explicit value for this dimension
                # while another needed to refer to the default data ID.
                # Even if these refer to the same value, that inconsistency
                # probably indicates user error.
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch.  "
                    "Defaults and explicit constraints cannot be mixed."
                )
        # If any default data ID values were needed, update self.dataId with
        # them, and then update the governor restriction with them.
        if summary.defaultsNeeded:
            defaultsNeededGraph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
            self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
            for dimension in summary.defaultsNeeded:
                summary.dimension_constraints[dimension] = {self.defaults[dimension]}

        return summary