# Coverage-report artifact (coverage.py v6.4.4, 2022-09-22): 40% of 234
# statements covered in python/lsst/daf/butler/registry/queries/_structs.py.
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary", "RegistryManagers"]  # other classes here are local to subpackage

from dataclasses import dataclass
from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Tuple, Type, Union, cast

from lsst.sphgeom import Region
from lsst.utils.classes import cached_getter, immutable
from sqlalchemy.sql import ColumnElement

from ...core import (
    DataCoordinate,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    NamedKeyMapping,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
    TimespanDatabaseRepresentation,
)
from .._exceptions import UserExpressionSyntaxError
from ..interfaces import CollectionManager, DatasetRecordStorageManager, DimensionRecordStorageManager

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc  # type: ignore
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName
@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Raises
    ------
    UserExpressionSyntaxError
        Raised if ``expression`` cannot be parsed.
    """

    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                # Wrap any parser failure in a registry-specific exception so
                # callers can distinguish bad user input from internal bugs.
                raise UserExpressionSyntaxError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            # No expression: treat as an always-true WHERE clause.
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            Struct bundling the parsed tree, (possibly expanded) data ID,
            referenced dimensions/columns, bind values, governor-dimension
            constraints, and spatial region for this query.
        """
        # Fall back to the data ID's region when no explicit region is given.
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # Bind identifiers must not shadow dimension elements or look
            # like dimension-table column references, or the expression
            # would become ambiguous.
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition(".")
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        governor_constraints: dict[str, AbstractSet[str]] = {}
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor

                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind, defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Re-raise with the original (and, if different, the
                    # normalized) expression text for a clearer error message;
                    # `from None` suppresses the redundant chained traceback.
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                # Extract constraints on governor dimensions found by the
                # visitor (e.g. instrument='X'), keyed by dimension name.
                for dimension_name, values in summary.dimension_constraints.items():
                    if dimension_name in graph.universe.getGovernorDimensions().names:
                        governor_constraints[dimension_name] = cast(AbstractSet[str], values)
                # The visitor may have expanded the data ID with defaults.
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor

                # No consistency checking: just inspect which dimensions and
                # columns the expression references.
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind))
        else:
            from .expressions import InspectionSummary

            # No expression at all: empty summary.
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            governor_constraints=governor_constraints,
            region=region,
        )
@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, AbstractSet[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at all.
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        # An element participates temporally iff the expression referenced
        # its timespan pseudo-column.
        return NamedValueSet(
            e for e, c in self.columns.items() if TimespanDatabaseRepresentation.NAME in c
        ).freeze()
@dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: Optional[str]
    """Name of the column or `None` for primary key (`str` or `None`)."""

    ordering: bool
    """True for ascending order, False for descending (`bool`)."""
@immutable
class OrderByClause:
    """Information about the columns that make up an ORDER BY clause.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix
        (a leading "-" requests descending order).
    graph : `DimensionGraph`
        Dimensions used by a query.

    Raises
    ------
    ValueError
        Raised if a name is empty (or consists only of the "-" prefix).
    """

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):
        columns: List[OrderByClauseColumn] = []
        for raw_name in order_by:
            if not raw_name or raw_name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            descending = raw_name.startswith("-")
            name = raw_name[1:] if descending else raw_name
            element, column = categorizeOrderByName(graph, name)
            columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=not descending)
            )
        self.order_by_columns = columns
        # Only entries that reference a non-key column require the element's
        # own table to be joined into the query.
        self.elements = NamedValueSet(
            entry.element for entry in columns if entry.column is not None
        )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """
@immutable
class ElementOrderByClause:
    """Information about ORDER BY columns restricted to a single element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix
        (a leading "-" requests descending order).
    element : `DimensionElement`
        The single dimension element the query returns records for.

    Raises
    ------
    ValueError
        Raised if a name is empty (or consists only of the "-" prefix).
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        columns: List[OrderByClauseColumn] = []
        for raw_name in order_by:
            if not raw_name or raw_name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            descending = raw_name.startswith("-")
            name = raw_name[1:] if descending else raw_name
            column = categorizeElementOrderByName(element, name)
            columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=not descending)
            )
        self.order_by_columns = columns

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """
@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.  May only be
        given when ``expression`` is a `str` or `None`.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `Tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).

    Raises
    ------
    TypeError
        Raised if ``bind`` is given together with an already-constructed
        `QueryWhereExpression`.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: Optional[DataCoordinate] = None,
        expression: Optional[Union[str, QueryWhereExpression]] = None,
        whereRegion: Optional[Region] = None,
        bind: Optional[Mapping[str, Any]] = None,
        defaults: Optional[DataCoordinate] = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        check: bool = True,
    ):
        self.requested = requested
        # Normalize ``expression`` into a QueryWhereExpression; ``bind`` is
        # only meaningful when we construct that object here.
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = None if order_by is None else OrderByClause(order_by, requested)
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in table of a dependent dimension
        element or dataset.
        """
        # Union of the requested dimensions, anything the WHERE expression
        # references, and the dimensions of every declared dataset type.
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        # Tables are needed for spatial/temporal participation and for any
        # non-key columns referenced by the WHERE expression.
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            result.update(self.order_by.elements)
        # Implied dimensions have no foreign keys pointing at them from
        # dependents, so their own tables must be joined in.
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        # Elements flagged alwaysJoin (e.g. join tables) must be present
        # whenever their dimensions are.
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()
@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`)."""

    id: ColumnElement
    """Column containing the unique integer ID for this dataset."""

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp; this is not a part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        # Iteration yields exactly the two columns needed to reconstruct a
        # `DatasetRef`; ``ingestDate`` is deliberately excluded.
        yield from (self.id, self.runKey)
@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """

    def __init__(self) -> None:
        # Start fully empty; attributes are populated incrementally as the
        # query is built.
        self.datasets = None
        self.keys = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.timespans = NamedKeyDict()

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `sqlalchemy.sql.ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        if self.datasets is not None:
            return False
        return not (self.keys or self.timespans or self.regions)

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        columns_for_dimension = self.keys[dimension]
        return columns_for_dimension[-1]
@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`)."""

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`)."""

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """