Coverage for python/lsst/daf/butler/registry/queries/expressions/check.py: 28%
171 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = (
30 "CheckVisitor",
31 "InspectionVisitor",
32 "InspectionSummary",
33)
35import dataclasses
36from collections.abc import Mapping, Sequence, Set
37from typing import TYPE_CHECKING, Any
39from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag
40from ...._named import NamedKeyDict, NamedValueSet
41from ....dimensions import (
42 DataCoordinate,
43 DataIdValue,
44 Dimension,
45 DimensionElement,
46 DimensionGraph,
47 DimensionUniverse,
48)
49from ..._exceptions import UserExpressionError
50from .categorize import ExpressionConstant, categorizeConstant, categorizeElementId
51from .normalForm import NormalForm, NormalFormVisitor
52from .parser import Node, TreeVisitor
54if TYPE_CHECKING:
55 import astropy.time
56 from lsst.daf.relation import ColumnTag
@dataclasses.dataclass
class InspectionSummary:
    """Common state accumulated by `CheckVisitor` and `InspectionVisitor`
    while they walk a parsed expression.
    """

    dimensions: NamedValueSet[Dimension] = dataclasses.field(default_factory=NamedValueSet)
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in this branch (`NamedValueSet` [ `Dimension` ]).
    """

    columns: NamedKeyDict[DimensionElement, set[str]] = dataclasses.field(default_factory=NamedKeyDict)
    """Dimension element tables whose columns were referenced anywhere in this
    branch (`NamedKeyDict` [ `DimensionElement`, `set` [ `str` ] ]).
    """

    hasIngestDate: bool = False
    """Whether this expression includes the special dataset ingest date
    identifier (`bool`).
    """

    def update(self, other: InspectionSummary) -> None:
        """Fold the dimensions, columns and ingest-date flag of ``other``
        into ``self``.

        Parameters
        ----------
        other : `InspectionSummary`
            The summary whose contents are absorbed; it is not modified.
        """
        self.dimensions.update(other.dimensions)
        for element, names in other.columns.items():
            # Create the per-element column set on first sight, then extend.
            known = self.columns.setdefault(element, set())
            known.update(names)
        # The flag is sticky: once set on either side it stays set.
        if not self.hasIngestDate:
            self.hasIngestDate = other.hasIngestDate

    def make_column_tag_set(self, dataset_type_name: str | None) -> set[ColumnTag]:
        """Convert everything recorded in this summary into `ColumnTag`
        objects.

        Parameters
        ----------
        dataset_type_name : `str` or `None`
            Name of the dataset type to assume for unqualified dataset columns,
            or `None` to reject any such identifiers.

        Returns
        -------
        tag_set : `set` [ `ColumnTag` ]
            Set of categorized column tags.

        Raises
        ------
        UserExpressionError
            Raised if the expression uses the ingest date but
            ``dataset_type_name`` is `None`.
        """
        tags: set[ColumnTag] = set()
        if self.hasIngestDate:
            if dataset_type_name is None:
                raise UserExpressionError(
                    "Expression requires an ingest date, which requires exactly one dataset type."
                )
            tags.add(DatasetColumnTag(dataset_type_name, "ingest_date"))
        # One key tag per referenced dimension...
        tags.update(DimensionKeyColumnTag.generate(self.dimensions.names))
        # ...and one record tag per referenced column of each element table.
        for dimension_element, columns in self.columns.items():
            tags.update(DimensionRecordColumnTag.generate(dimension_element.name, columns))
        return tags
@dataclasses.dataclass
class TreeSummary(InspectionSummary):
    """Result object used by `InspectionVisitor` to gather information about
    a parsed expression.

    Notes
    -----
    On top of the base summary this tracks a candidate data ID key and a
    candidate literal value, so that simple equivalence terms such as
    "tract=4" can be recognized. When `InspectionVisitor` is used on its own
    (i.e. when ``check=False`` in the query code) these extra attributes are
    not consulted; `CheckVisitor` reads them after delegating each branch to
    `InspectionVisitor`, to learn which dimension values a branch pins down.
    """

    dataIdKey: Dimension | None = None
    """A `Dimension` that is (if `dataIdValue` is not `None`) or may be
    (if `dataIdValue` is `None`) fully identified by a literal value in this
    branch.
    """

    dataIdValue: str | None = None
    """A literal value that constrains (if `dataIdKey` is not `None`) or may
    constrain (if `dataIdKey` is `None`) a dimension in this branch.

    This is always a `str` or `None`, but it may need to be coerced to `int`
    to reflect the actual user intent.
    """

    def merge(self, other: TreeSummary, isEq: bool = False) -> TreeSummary:
        """Absorb ``other`` so that ``self`` summarizes both expression tree
        branches.

        Parameters
        ----------
        other : `TreeSummary`
            The other summary object.
        isEq : `bool`, optional
            If `True` (`False` is default), these summaries are being combined
            via the equality operator.

        Returns
        -------
        self : `TreeSummary`
            The merged summary (updated in-place).
        """
        self.update(other)
        if isEq:
            # "<key> = <value>": complete the pair by taking the value.
            if self.isDataIdKeyOnly() and other.isDataIdValueOnly():
                self.dataIdValue = other.dataIdValue
                return self
            # "<value> = <key>": complete the pair by taking the key.
            if self.isDataIdValueOnly() and other.isDataIdKeyOnly():
                self.dataIdKey = other.dataIdKey
                return self
        # Any other combination is not a simple key/value equivalence, so
        # neither half of the pair survives the merge.
        self.dataIdKey = None
        self.dataIdValue = None
        return self

    def isDataIdKeyOnly(self) -> bool:
        """Test whether this branch is _just_ a data ID key identifier."""
        has_key = self.dataIdKey is not None
        return has_key and self.dataIdValue is None

    def isDataIdValueOnly(self) -> bool:
        """Test whether this branch is _just_ a literal value that may be
        used as the value in a data ID key-value pair.
        """
        lacks_key = self.dataIdKey is None
        return lacks_key and self.dataIdValue is not None
class InspectionVisitor(TreeVisitor[TreeSummary]):
    """A `TreeVisitor` that works out which dimension elements a query must
    include, before any SQLAlchemy WHERE clause is built from the expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, universe: DimensionUniverse, bind: Mapping[str, Any]):
        self.bind = bind
        self.universe = universe

    def visitNumericLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitNumericLiteral
        # A bare literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitStringLiteral(self, value: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitStringLiteral
        # A bare literal is a candidate data ID value.
        return TreeSummary(dataIdValue=value)

    def visitTimeLiteral(self, value: astropy.time.Time, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitTimeLiteral
        # Time literals contribute nothing to the summary.
        return TreeSummary()

    def visitIdentifier(self, name: str, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIdentifier
        # Bound identifiers take precedence over every other interpretation.
        if name in self.bind:
            bound = self.bind[name]
            if not isinstance(bound, list | tuple | Set):
                return TreeSummary(dataIdValue=bound)
            # A collection can appear on the rhs of an IN operator; only a
            # single-element collection yields a usable value.
            if len(bound) == 1:
                return TreeSummary(dataIdValue=next(iter(bound)))
            return TreeSummary()
        # Next, the special expression constants.
        constant = categorizeConstant(name)
        if constant is ExpressionConstant.INGEST_DATE:
            return TreeSummary(hasIngestDate=True)
        if constant is ExpressionConstant.NULL:
            return TreeSummary()
        assert constant is None, "Enum variant conditionals should be exhaustive."
        # Otherwise the identifier names a dimension or one of its columns.
        element, column = categorizeElementId(self.universe, name)
        if column is not None:
            return TreeSummary(
                dimensions=NamedValueSet(element.graph.dimensions), columns=NamedKeyDict({element: {column}})
            )
        assert isinstance(element, Dimension)
        return TreeSummary(
            dimensions=NamedValueSet(element.graph.dimensions),
            dataIdKey=element,
        )

    def visitUnaryOp(self, operator: str, operand: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitUnaryOp
        # The operand's summary is passed through unchanged.
        return operand

    def visitBinaryOp(self, operator: str, lhs: TreeSummary, rhs: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitBinaryOp
        # Only the equality operator may pair a data ID key with a value.
        is_equality = operator == "="
        return lhs.merge(rhs, isEq=is_equality)

    def visitIsIn(self, lhs: TreeSummary, values: list[TreeSummary], not_in: bool, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitIsIn
        # Each right-hand value is merged into the left-hand summary in turn.
        for candidate in values:
            lhs.merge(candidate)
        return lhs

    def visitParens(self, expression: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitParens
        # Parentheses do not alter the summary of what they enclose.
        return expression

    def visitTupleNode(self, items: tuple[TreeSummary, ...], node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # Start from an empty summary and merge in every tuple member.
        combined = TreeSummary()
        for item in items:
            combined.merge(item)
        return combined

    def visitRangeLiteral(self, start: int, stop: int, stride: int | None, node: Node) -> TreeSummary:
        # Docstring inherited from TreeVisitor.visitRangeLiteral
        # Range literals contribute nothing to the summary.
        return TreeSummary()

    def visitPointNode(self, ra: TreeSummary, dec: TreeSummary, node: Node) -> TreeSummary:
        # Docstring inherited from base class
        # The summaries of ``ra`` and ``dec`` are discarded here.
        return TreeSummary()
@dataclasses.dataclass
class InnerSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for one inner group of AND'd
    expression branches, recording the referenced dimensions and tables so
    the group can be checked for consistency and completeness.
    """

    dimension_values: dict[str, DataIdValue] = dataclasses.field(default_factory=dict)
    """Values of every dimension that this expression branch equates with a
    literal, keyed by dimension name.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that the default data ID does provide.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
@dataclasses.dataclass
class OuterSummary(InspectionSummary):
    """Summary that `CheckVisitor` builds for the expression as a whole,
    recording referenced dimensions, tables, and governor dimension values.
    """

    dimension_constraints: dict[str, set[DataIdValue]] = dataclasses.field(default_factory=dict)
    """All values that appear in this expression for dimensions relevant to
    the query, keyed by dimension name.

    A dimension with no entry in this dict is not constrained by this
    expression.
    """

    defaultsNeeded: set[str] = dataclasses.field(default_factory=set)
    """Names of governor dimensions that the query needs, that the query
    itself does not provide, and that the default data ID does provide.

    These should be added to the query's data ID when finalizing the WHERE
    clause.
    """
class CheckVisitor(NormalFormVisitor[TreeSummary, InnerSummary, OuterSummary]):
    """A `NormalFormVisitor` that works out which dimensions and tables a
    query must include, and checks the expression for completeness and
    consistency along the way.

    Parameters
    ----------
    dataId : `DataCoordinate`
        Dimension values that are fully known in advance.
    graph : `DimensionGraph`
        The dimensions the query would include in the absence of this
        expression.
    bind : `~collections.abc.Mapping` [ `str`, `object` ]
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`
        A data ID containing default for governor dimensions.
    allow_orphans : `bool`, optional
        If `True`, permit expressions to refer to dimensions without providing
        a value for their governor dimensions (e.g. referring to a visit
        without an instrument). Should be left to default to `False` in
        essentially all new code.
    """

    def __init__(
        self,
        dataId: DataCoordinate,
        graph: DimensionGraph,
        bind: Mapping[str, Any],
        defaults: DataCoordinate,
        allow_orphans: bool = False,
    ):
        self.dataId = dataId
        self.graph = graph
        self.defaults = defaults
        self._allow_orphans = allow_orphans
        # ``bind`` is only needed by the per-branch visitor, so it is handed
        # straight to it rather than kept on this object.
        self._branchVisitor = InspectionVisitor(dataId.universe, bind)

    def visitBranch(self, node: Node) -> TreeSummary:
        # Docstring inherited from NormalFormVisitor.
        return node.visit(self._branchVisitor)

    def visitInner(self, branches: Sequence[TreeSummary], form: NormalForm) -> InnerSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the inner branches are AND'd together.
        assert form is NormalForm.DISJUNCTIVE
        # Because they are AND'd, every branch visited below constrains the
        # others and all of them must agree. And because the outer branches
        # are OR'd, anything missing from this group (such as the instrument
        # or skymap needed to interpret a visit or tract number) is truly
        # missing: no other inner group can constrain it.
        #
        # The one exception is the data ID given at construction. It is
        # AND'd to the whole expression later and so applies to every
        # branch; its values are therefore recorded up front.
        summary = InnerSummary()
        if self.dataId.hasFull():
            known_names = self.dataId.graph.names
        else:
            known_names = self.dataId.graph.required.names
        for known in known_names:
            summary.dimension_values[known] = self.dataId[known]
        # Now walk the branches themselves.
        for branch in branches:
            # Record every dimension and column seen, whatever the context.
            summary.update(branch)
            # Only branches of the form '<dimension>=<value>' (or a form the
            # branch visitor recognized as equivalent) constrain a value.
            if branch.dataIdKey is None or branch.dataIdValue is None:
                continue
            # Coerce the literal to the dimension's primary key type, then
            # remember it unless a value is already on record, in which case
            # the two must match.
            new_value = branch.dataIdKey.primaryKey.getPythonType()(branch.dataIdValue)
            value = summary.dimension_values.setdefault(branch.dataIdKey.name, new_value)
            if value != new_value:
                # Something like "instrument='HSC' AND instrument='DECam'",
                # or the data ID says one thing and the expression another.
                if branch.dataIdKey in self.dataId:
                    raise UserExpressionError(
                        f"Conflict between expression containing {branch.dataIdKey.name}={new_value!r} "
                        f"and data ID with {branch.dataIdKey.name}={value!r}."
                    )
                raise UserExpressionError(
                    f"Conflicting literal values for {branch.dataIdKey.name} in expression: "
                    f"{value!r} != {branch.dataIdValue!r}."
                )
        # With the constrained values known, look for governors that are
        # required but absent, e.g. "visit=X" with no instrument given. This
        # catches many accidents but also rejects possibly-legitimate
        # multi-instrument queries like "visit.seeing < 0.7"; asking the user
        # to name the instruments explicitly is a reasonable workaround.
        # Note that an expression like
        #
        #     (instrument='HSC' OR instrument='DECam') AND visit.seeing < 0.7
        #
        # becomes, in disjunctive normal form,
        #
        #     (instrument='HSC' AND visit.seeing < 0.7)
        #     OR (instrument='DECam' AND visit.seeing < 0.7)
        #
        # so each instrument gets its own inner group and this logic still
        # applies to each of them.
        governorsNeededInBranch: set[str] = set()
        for dimension in summary.dimensions:
            governorsNeededInBranch.update(dimension.graph.governors.names)
        missing = governorsNeededInBranch - summary.dimension_values.keys()
        if missing:
            if missing <= self.defaults.names:
                # Every missing governor can be filled in from the defaults.
                summary.defaultsNeeded.update(missing)
            elif not self._allow_orphans:
                still_missing = missing - self.defaults.names
                raise UserExpressionError(
                    f"No value(s) for governor dimensions {still_missing} in expression "
                    "that references dependent dimensions. 'Governor' dimensions must always be specified "
                    "completely in either the query expression (via simple 'name=<value>' terms, not 'IN' "
                    "terms) or in a data ID passed to the query method."
                )
        return summary

    def visitOuter(self, branches: Sequence[InnerSummary], form: NormalForm) -> OuterSummary:
        # Docstring inherited from NormalFormVisitor.
        # In disjunctive normal form the outer branches are OR'd together.
        assert form is NormalForm.DISJUNCTIVE
        summary = OuterSummary()
        if branches:
            # First pass: collect every dimension and column referenced. This
            # is for the whole query, so how the terms are joined does not
            # matter. At the same time narrow down the dimensions that have a
            # value in _all_ branches; only those can be bounded overall,
            # since a dimension absent from even one branch could be
            # unbounded.
            dimensions_in_all_branches = set(self.graph.universe.getStaticDimensions().names)
            for branch in branches:
                summary.update(branch)
                summary.defaultsNeeded.update(branch.defaultsNeeded)
                dimensions_in_all_branches.intersection_update(branch.dimension_values)
            # Second pass: the bound for each such dimension is the set of
            # values it takes across the branches.
            for dim in dimensions_in_all_branches:
                summary.dimension_constraints[dim] = {branch.dimension_values[dim] for branch in branches}
        # If the expression references dimensions outside the original query
        # graph, widen the graph to include them. This is what lets a user
        # say "tract=X" on the command line (well, "skymap=Y AND tract=X";
        # visitInner checks for that) when running a task like ISR that has
        # nothing to do with skymaps.
        if not summary.dimensions.issubset(self.graph.dimensions):
            self.graph = DimensionGraph(
                self.graph.universe,
                dimensions=(summary.dimensions | self.graph.dimensions),
            )
        for dimension, values in summary.dimension_constraints.items():
            if dimension in summary.defaultsNeeded:
                # One branch gave an explicit value for this dimension while
                # another fell back on the default data ID. Even when the
                # values agree, that inconsistency probably indicates user
                # error.
                raise UserExpressionError(
                    f"Governor dimension {dimension} is explicitly "
                    f"constrained to {values} in one or more branches of "
                    "this query where expression, but is left to default "
                    f"to {self.defaults[dimension]!r} in another branch. "
                    "Defaults and explicit constraints cannot be mixed."
                )
        if not summary.defaultsNeeded:
            return summary
        # Defaults were needed: fold them into self.dataId, then record them
        # as the constraint for each affected dimension.
        defaultsNeededGraph = DimensionGraph(self.graph.universe, names=summary.defaultsNeeded)
        self.dataId = self.dataId.union(self.defaults.subset(defaultsNeededGraph))
        for dimension in summary.defaultsNeeded:
            summary.dimension_constraints[dimension] = {self.defaults[dimension]}
        return summary