Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 41%
233 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-24 23:50 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage
25from dataclasses import dataclass
26from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Tuple, Type, Union
28from lsst.sphgeom import Region
29from lsst.utils.classes import cached_getter, immutable
30from sqlalchemy.sql import ColumnElement
32from ...core import (
33 DataCoordinate,
34 DatasetType,
35 Dimension,
36 DimensionElement,
37 DimensionGraph,
38 DimensionUniverse,
39 NamedKeyDict,
40 NamedKeyMapping,
41 NamedValueAbstractSet,
42 NamedValueSet,
43 SkyPixDimension,
44 SpatialRegionDatabaseRepresentation,
45 TimespanDatabaseRepresentation,
46)
47from .._exceptions import UserExpressionSyntaxError
48from ..interfaces import CollectionManager, DatasetRecordStorageManager, DimensionRecordStorageManager
49from ..summaries import GovernorDimensionRestriction
51# We're not trying to add typing to the lex/yacc parser code, so MyPy
52# doesn't know about some of these imports.
53from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore
54from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName
@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                # Parse eagerly so syntax errors surface at construction
                # time rather than when the query is attached/executed.
                self._tree = parser.parse(expression)
            except Exception as exc:
                # Wrap any parser failure in our own exception type so
                # callers only need to catch UserExpressionSyntaxError.
                raise UserExpressionSyntaxError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            # No expression: equivalent to a WHERE clause that is always
            # true; represented by a None tree.
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            Struct capturing the WHERE-clause contributions of this
            expression together with the given data ID and region.
        """
        # Prefer an explicitly-given region, but fall back to the region
        # attached to the data ID (may itself be None).
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # Reject bind keys that shadow dimension elements or look like
            # dimension-element columns, as those would silently change the
            # meaning of the expression.
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition(".")
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        # Start with an unconstrained governor-dimension restriction; the
        # CheckVisitor below may tighten it.
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                # Imported here (not at module scope) to keep the lex/yacc
                # machinery out of the import path until actually needed.
                from .expressions import CheckVisitor

                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind, defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Re-raise with the (possibly normalized) expression text
                    # included so users can see what was actually checked.
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                # The visitor may have injected default governor values.
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor

                # No consistency checking: just inspect which dimensions and
                # columns the expression references.
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind))
        else:
            from .expressions import InspectionSummary

            # No expression at all: an empty summary.
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )
@dataclass(frozen=True)
class QueryWhereClause:
    """Struct aggregating everything that contributes to a query's WHERE
    clause.

    Instances should be created only by `QueryWhereExpression.attach`, which
    guarantees that the attributes here are mutually consistent.
    """

    tree: Optional[Node]
    """Parsed expression tree for the string expression, or `None` if no
    string expression was given.
    """

    dataId: DataCoordinate
    """Data ID identifying dimensions known before query construction
    (`DataCoordinate`); ``dataId.hasRecords()`` is guaranteed to return
    `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies appear anywhere in the
    string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Mapping from dimension element to the names of its non-key columns
    referenced anywhere in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Literal values to be injected into the query expression, keyed by the
    identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """Spatial region that every result row must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Constraints on the values governor dimensions may take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ])
        """
        # An element participates temporally if its timespan column was
        # mentioned in the expression.
        referenced: NamedValueSet[DimensionElement] = NamedValueSet()
        for element, column_names in self.columns.items():
            if TimespanDatabaseRepresentation.NAME in column_names:
                referenced.add(element)
        return referenced.freeze()
@dataclass(frozen=True)
class OrderByClauseColumn:
    """Descriptor for one column appearing in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element providing the data for this column
    (`DimensionElement`)."""

    column: Optional[str]
    """Column name, or `None` when ordering on the primary key
    (`str` or `None`)"""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""
@immutable
class OrderByClause:
    """Parsed representation of the columns named in a query's ORDER BY
    clause.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    graph : `DimensionGraph`
        Dimensions used by a query.
    """

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):
        parsed: List[OrderByClauseColumn] = []
        for raw in order_by:
            if not raw or raw == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" requests descending order for this column.
            descending = raw.startswith("-")
            name = raw[1:] if descending else raw
            element, column = categorizeOrderByName(graph, name)
            parsed.append(OrderByClauseColumn(element=element, column=column, ordering=not descending))
        self.order_by_columns = parsed
        # Only entries with a real (non-key) column force their element's
        # table into the query; primary keys (column is None) do not.
        self.elements = NamedValueSet(
            item.element for item in parsed if item.column is not None
        )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """
@immutable
class ElementOrderByClause:
    """Parsed representation of the ORDER BY columns for a single dimension
    element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Dimensions used by a query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        parsed: List[OrderByClauseColumn] = []
        for raw in order_by:
            if not raw or raw == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" requests descending order for this column.
            descending = raw.startswith("-")
            name = raw[1:] if descending else raw
            column = categorizeElementOrderByName(element, name)
            parsed.append(OrderByClauseColumn(element=element, column=column, ordering=not descending))
        self.order_by_columns = parsed

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """
@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result rows
        of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `Tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).

    Raises
    ------
    TypeError
        Raised if ``bind`` is passed alongside an already-constructed
        `QueryWhereExpression` (bind values must go into the expression).
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: Optional[DataCoordinate] = None,
        expression: Optional[Union[str, QueryWhereExpression]] = None,
        whereRegion: Optional[Region] = None,
        bind: Optional[Mapping[str, Any]] = None,
        defaults: Optional[DataCoordinate] = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        check: bool = True,
    ):
        self.requested = requested
        # Normalize `expression` to a QueryWhereExpression; `bind` is only
        # legal when we are the ones constructing that expression.
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        # Validate the expression against the requested dimensions and fold
        # in the data ID, region, and governor defaults.
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = None if order_by is None else OrderByClause(order_by, requested)
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    # Parsed ORDER BY clause, or None when no ordering was requested.
    order_by: Optional[OrderByClause]

    # (limit, offset) pair, or None when no limit was requested.
    limit: Optional[Tuple[int, Optional[int]]]

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Union of the requested dimensions, everything referenced by the
        # WHERE expression, and the dimensions of all declared dataset types.
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        # Tables are needed for spatial/temporal joins, any non-key columns
        # referenced by the WHERE expression, and ORDER BY columns.
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            result.update(self.order_by.elements)
        # Implied dimensions have no foreign key elsewhere, so their own
        # table is the only place their key can come from.
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        # Elements flagged alwaysJoin (e.g. visit_detector_region) must be
        # present whenever their dimensions are.
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()
@dataclass
class DatasetQueryColumns:
    """Struct of the columns needed to rebuild `DatasetRef` instances from
    query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column holding the dataset's unique integer ID.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection containing
    the dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column holding the ingest timestamp; not part of `DatasetRef` itself,
    but stored in the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        # Only the columns required to reconstruct a DatasetRef.
        yield from (self.id, self.runKey)
@dataclass
class QueryColumns:
    """Struct organizing the columns of a query that is being built or is
    currently executing.

    Takes no parameters at construction; expected usage is to fill the
    container attributes incrementally.
    """

    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results.
    (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        return not self.keys and not self.timespans and not self.regions and self.datasets is None

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        candidates = self.keys[dimension]
        return candidates[-1]
@dataclass
class RegistryManagers:
    """Struct bundling the manager objects that back a `Registry` and are
    used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type encapsulating how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """