Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 37%
225 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
23__all__ = ["QuerySummary"] # other classes here are local to subpackage
25from collections.abc import Iterable, Iterator, Mapping, Set
26from dataclasses import dataclass
27from typing import Any, cast
29from lsst.sphgeom import Region
30from lsst.utils.classes import cached_getter, immutable
31from sqlalchemy.sql import ColumnElement
33from ...core import (
34 DataCoordinate,
35 DatasetType,
36 Dimension,
37 DimensionElement,
38 DimensionGraph,
39 DimensionUniverse,
40 NamedKeyDict,
41 NamedKeyMapping,
42 NamedValueAbstractSet,
43 NamedValueSet,
44 SkyPixDimension,
45 TimespanDatabaseRepresentation,
46)
47from .._exceptions import UserExpressionSyntaxError
49# We're not trying to add typing to the lex/yacc parser code, so MyPy
50# doesn't know about some of these imports.
51from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore
52from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse. If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, expression: str | None = None, bind: Mapping[str, Any] | None = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise UserExpressionSyntaxError(f"Failed to parse user expression {expression!r}.") from exc
            assert self._tree is not None
        else:
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind
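
    # Illustrative usage sketch (identifier names hypothetical): the bind
    # mapping injects literal values without formatting them into the
    # expression string itself, e.g.
    #
    #     expr = QueryWhereExpression(
    #         "instrument = 'HSC' AND visit IN (my_visits)",
    #         bind={"my_visits": [903334, 903336]},
    #     )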

    def attach(
        self,
        graph: DimensionGraph,
        dataId: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap. If `None` and
            ``dataId`` is an expanded data ID, ``dataId.region`` will be used
            to construct one.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions. Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed. This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            A struct representing the query's WHERE clause.
        """
        if dataId is not None and dataId.hasRecords():
            if region is None and dataId.region is not None:
                region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition(".")
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        governor_constraints: dict[str, Set[str]] = {}
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs). That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs). That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor

                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind, defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                for dimension_name, values in summary.dimension_constraints.items():
                    if dimension_name in graph.universe.getGovernorDimensions().names:
                        governor_constraints[dimension_name] = cast(Set[str], values)
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor

                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind))
        else:
            from .expressions import InspectionSummary

            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            governor_constraints=governor_constraints,
            region=region,
        )
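
# ``attach`` is normally called for you by `QuerySummary`; a direct-use
# sketch (assuming ``graph`` is a `DimensionGraph` from a loaded dimension
# universe; names hypothetical) might look like:
#
#     clause = QueryWhereExpression("instrument = 'HSC'").attach(graph)
#     clause.governor_constraints  # e.g. {"instrument": {"HSC"}}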


@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Node | None
    """A parsed string expression tree, or `None` if there was no string
    expression (`Node` or `None`).
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, Set[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at
    all.
    """

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        return NamedValueSet(
            e for e, c in self.columns.items() if TimespanDatabaseRepresentation.NAME in c
        ).freeze()


@dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""


@immutable
class OrderByClause:
    """Class holding information about columns in an ORDER BY clause.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering, each with an optional "-"
        prefix for descending order.
    graph : `DimensionGraph`
        Dimensions used by a query.
    """

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):

        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

        self.elements = NamedValueSet(
            column.element for column in self.order_by_columns if column.column is not None
        )
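
    # For example (names hypothetical), ``order_by=["visit", "-day_obs"]``
    # sorts ascending by visit and descending by day_obs; the leading "-" is
    # stripped above and recorded as ``ordering=False``.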

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """


@immutable
class ElementOrderByClause:
    """Class holding information about columns in an ORDER BY clause for one
    element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering, each with an optional "-"
        prefix for descending order.
    element : `DimensionElement`
        The dimension element whose columns are used for ordering.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):

        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial constraint that all result rows must overlap. If `None` and
        ``dataId`` is an expanded data ID, ``dataId.region`` will be used to
        construct one.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ], optional
        Sequence of names to use for ordering, each with an optional "-"
        prefix for descending order.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency. This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: DataCoordinate | None = None,
        expression: str | QueryWhereExpression | None = None,
        whereRegion: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = None if order_by is None else OrderByClause(order_by, requested)
        self.limit = limit
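
    # Construction sketch (assuming ``universe`` is the repository's
    # `DimensionUniverse`; names and values hypothetical):
    #
    #     summary = QuerySummary(
    #         universe.extract(["instrument", "visit", "detector"]),
    #         expression="instrument = 'HSC' AND visit > 100",
    #         limit=(1000, None),
    #     )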

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query. Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join. Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()
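
    # For example (dimension names from the default universe): requesting
    # both {visit, detector} and {tract, patch} implies a spatial join, so
    # ``spatial`` would contain visit_detector_region, patch, and the common
    # skypix dimension (``htm7`` in the default configuration) that connects
    # them.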

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in the table of a dependent dimension
        element or dataset.
        """
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)
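
    # For example (names hypothetical): if ``requested`` is {detector} and
    # the WHERE expression references ``visit``, the returned graph is the
    # union {instrument, visit, detector} after `DimensionGraph` expands
    # required dependencies.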

    @property
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            result.update(self.order_by.elements)
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()


@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: ColumnElement | None
    """Column containing the ingest timestamp; this is not part of
    `DatasetRef`, but it comes from the same table.
    """
    def __iter__(self) -> Iterator[ColumnElement]:
        yield self.id
        yield self.runKey


@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """

    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, list[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and
    `QuerySummary.where.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `sqlalchemy.sql.ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: DatasetQueryColumns | None
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        return not (self.keys or self.timespans or self.regions or self.datasets is not None)

    def getKeyColumn(self, dimension: Dimension | str) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery. From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        return self.keys[dimension][-1]
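
    # Usage sketch (hypothetical): once tables have been joined and ``keys``
    # is populated, repeated lookups return the same column deterministically:
    #
    #     visit_col = columns.getKeyColumn("visit")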