Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 37% of 225 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

from collections.abc import Iterable, Iterator, Mapping, Set
from dataclasses import dataclass
from typing import Any, cast

from lsst.sphgeom import Region
from lsst.utils.classes import cached_getter, immutable
from sqlalchemy.sql import ColumnElement

from ...core import (
    DataCoordinate,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    NamedKeyMapping,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
    TimespanDatabaseRepresentation,
)
from .._exceptions import UserExpressionSyntaxError

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc  # type: ignore
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse. If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """
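    # A minimal usage sketch (hedged: the expression text and bind
    # identifier below are illustrative, not taken from this module):
    #
    #     expr = QueryWhereExpression(
    #         "instrument = my_instrument AND visit > 100",
    #         bind={"my_instrument": "HSC"},
    #     )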

    def __init__(self, expression: str | None = None, bind: Mapping[str, Any] | None = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise UserExpressionSyntaxError(f"Failed to parse user expression {expression!r}.") from exc
            assert self._tree is not None
        else:
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap. If `None` and
            ``dataId`` is an expanded data ID, ``dataId.region`` will be used
            to construct one.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions. Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed. This may reject some
            valid queries that resemble common mistakes (e.g. queries for
            visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            A struct combining this expression with the given data ID,
            region, and governor-dimension constraints.
        """
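        # A hedged usage sketch (``graph`` and ``data_id`` are assumed to be
        # provided by the caller):
        #
        #     where = expr.attach(graph, dataId=data_id, check=True)
        #     constraints = where.governor_constraints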
        if dataId is not None and dataId.hasRecords():
            if region is None and dataId.region is not None:
                region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition(".")
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        governor_constraints: dict[str, Set[str]] = {}
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs). That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs). That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
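                # For example, "(a OR b) AND c" normalizes to
                # "(a AND c) OR (b AND c)".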
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor

                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind, defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                for dimension_name, values in summary.dimension_constraints.items():
                    if dimension_name in graph.universe.getGovernorDimensions().names:
                        governor_constraints[dimension_name] = cast(Set[str], values)
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor

                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind))
        else:
            from .expressions import InspectionSummary

            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            governor_constraints=governor_constraints,
            region=region,
        )


@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Node | None
    """A parsed string expression tree (`Node`), or `None` if there was no
    string expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, Set[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at
    all.
    """

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        return NamedValueSet(
            e for e, c in self.columns.items() if TimespanDatabaseRepresentation.NAME in c
        ).freeze()


@dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""


@immutable
class OrderByClause:
    """Class for information about columns in an ORDER BY clause.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering, each with an optional "-"
        prefix for descending order.
    graph : `DimensionGraph`
        Dimensions used by a query.
    """
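    # Illustrative input (the dimension and column names are assumptions):
    # order_by=["visit", "-exposure.day_obs"] sorts by visit ascending,
    # then by exposure.day_obs descending.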

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

        self.elements = NamedValueSet(
            column.element for column in self.order_by_columns if column.column is not None
        )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """


@immutable
class ElementOrderByClause:
    """Class for information about columns in ORDER BY clause for one element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering, each with an optional "-"
        prefix for descending order.
    element : `DimensionElement`
        Dimension element whose columns are used for ordering.
    """
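    # Illustrative input (the column name is an assumption): for a temporal
    # element, order_by=["-timespan.begin"] sorts by the start of the
    # element's timespan, descending.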

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap. If `None` and
        ``dataId`` is an expanded data ID, ``dataId.region`` will be used to
        construct one.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ], optional
        Sequence of names to use for ordering, each with an optional "-"
        prefix for descending order.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency. This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """
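    # A hedged construction sketch (``universe`` is a `DimensionUniverse`
    # assumed to come from the caller; dimension names and the expression
    # text are illustrative):
    #
    #     summary = QuerySummary(
    #         universe.extract(["visit", "detector"]),
    #         expression="instrument = 'HSC' AND visit > 100",
    #         order_by=["visit"],
    #         limit=(10, None),
    #     )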

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: DataCoordinate | None = None,
        expression: str | QueryWhereExpression | None = None,
        whereRegion: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = None if order_by is None else OrderByClause(order_by, requested)
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix
                # dimension to the query; the element we have will be joined
                # to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query. Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join. Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in the table of a dependent dimension
        element or dataset.
        """
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            result.update(self.order_by.elements)
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()


@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: ColumnElement | None
    """Column containing the ingest timestamp; this is not part of
    `DatasetRef`, but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield self.id
        yield self.runKey


@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
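    # A minimal population sketch (``visit`` and ``visit_table`` are
    # hypothetical `Dimension` and SQLAlchemy table objects, not taken from
    # this module):
    #
    #     columns = QueryColumns()
    #     columns.keys.setdefault(visit, []).append(visit_table.columns["id"])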

    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, list[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `sqlalchemy.sql.ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: DatasetQueryColumns | None
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        return not (self.keys or self.timespans or self.regions or self.datasets is not None)

    def getKeyColumn(self, dimension: Dimension | str) -> ColumnElement:
        """Return one of the columns in ``self.keys`` for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery. From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        return self.keys[dimension][-1]
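
    # A hedged usage sketch (``columns`` is a populated `QueryColumns`;
    # "visit" is an illustrative dimension name):
    #
    #     key_column = columns.getKeyColumn("visit")
    #     # e.g. use it in a SQLAlchemy WHERE clause: key_column == 42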