Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 36%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage
25from dataclasses import dataclass
26from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Tuple, Type, Union
28from lsst.sphgeom import Region
29from lsst.utils.classes import cached_getter, immutable
30from sqlalchemy.sql import ColumnElement
32from ...core import (
33 DataCoordinate,
34 DatasetType,
35 Dimension,
36 DimensionElement,
37 DimensionGraph,
38 DimensionUniverse,
39 NamedKeyDict,
40 NamedKeyMapping,
41 NamedValueAbstractSet,
42 NamedValueSet,
43 SkyPixDimension,
44 SpatialRegionDatabaseRepresentation,
45 TimespanDatabaseRepresentation,
46)
47from ..interfaces import CollectionManager, DatasetRecordStorageManager, DimensionRecordStorageManager
48from ..summaries import GovernorDimensionRestriction
50# We're not trying to add typing to the lex/yacc parser code, so MyPy
51# doesn't know about some of these imports.
52from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore
53from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName
@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Raises
    ------
    RuntimeError
        Raised if ``expression`` is provided but cannot be parsed; the
        original parser error is attached as the cause.
    """

    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                # Wrap any parser failure in a RuntimeError, chaining the
                # original exception so the syntax error remains visible.
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            # No expression given: a `None` tree means "always true" and is
            # handled specially by `attach`.
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing default values for governor dimensions.
            Ignored unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            Struct bundling the expression tree with the (possibly updated)
            data ID, region, and governor-dimension restriction extracted
            from the expression.
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # Reject bind keys that shadow dimension elements or look like
            # "element.column" references, since those would be ambiguous
            # inside the expression.
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition(".")
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor

                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind.keys(), defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Report the error against the user's original spelling
                    # of the expression; mention the normalized form only if
                    # it actually differs.
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                # The visitor may have injected defaults into the data ID.
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor

                # No consistency checking: just inspect which dimensions and
                # columns the expression references.
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            from .expressions import InspectionSummary

            # No expression at all: an empty summary.
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )
@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ])
        """
        # An element participates temporally iff its timespan column was
        # referenced in the string expression.
        return NamedValueSet(
            e for e, c in self.columns.items() if TimespanDatabaseRepresentation.NAME in c
        ).freeze()
@dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: Optional[str]
    """Name of the column or `None` for primary key (`str` or `None`)"""

    ordering: bool
    """True for ascending order, False for descending (`bool`)."""
@immutable
class OrderByClause:
    """Parsed representation of the columns named in a query's ORDER BY
    clause.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering, each with an optional "-"
        prefix requesting descending order.
    graph : `DimensionGraph`
        Dimensions used by a query.

    Raises
    ------
    ValueError
        Raised if a name is empty (or consists of just the "-" prefix).
    """

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):

        parsed: List[OrderByClauseColumn] = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" means descending order; strip it before lookup.
            descending = name.startswith("-")
            if descending:
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            parsed.append(
                OrderByClauseColumn(element=element, column=column, ordering=not descending)
            )
        self.order_by_columns = parsed

        # Only entries with a real (non-primary-key) column pull their
        # element's table into the query.
        self.elements = NamedValueSet(
            entry.element for entry in self.order_by_columns if entry.column is not None
        )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """
@immutable
class ElementOrderByClause:
    """Parsed representation of an ORDER BY clause restricted to a single
    dimension element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering, each with an optional "-"
        prefix requesting descending order.
    element : `DimensionElement`
        The dimension element the named columns must belong to.

    Raises
    ------
    ValueError
        Raised if a name is empty (or consists of just the "-" prefix).
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):

        parsed: List[OrderByClauseColumn] = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" means descending order; strip it before lookup.
            descending = name.startswith("-")
            if descending:
                name = name[1:]
            parsed.append(
                OrderByClauseColumn(
                    element=element,
                    column=categorizeElementOrderByName(element, name),
                    ordering=not descending,
                )
            )
        self.order_by_columns = parsed

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """
@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result rows
        of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.  May only be
        provided when ``expression`` is a `str` or `None`.
    defaults : `DataCoordinate`, optional
        A data ID containing default values for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `Tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).

    Raises
    ------
    TypeError
        Raised if ``bind`` is provided alongside an ``expression`` that is
        already a `QueryWhereExpression`.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: Optional[DataCoordinate] = None,
        expression: Optional[Union[str, QueryWhereExpression]] = None,
        whereRegion: Optional[Region] = None,
        bind: Optional[Mapping[str, Any]] = None,
        defaults: Optional[DataCoordinate] = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        check: bool = True,
    ):
        self.requested = requested
        # Normalize `expression` to a QueryWhereExpression; `bind` is only
        # meaningful when we construct that object ourselves.
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = None if order_by is None else OrderByClause(order_by, requested)
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Union of the explicitly requested dimensions, anything referenced
        # by the WHERE expression, and the dimensions of any dataset types
        # that may be joined in.
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            result.update(self.order_by.elements)
        # NOTE(review): implied dimensions presumably can only contribute
        # their keys via their own table, hence the explicit join — confirm
        # against the dimension schema docs.
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        # Elements flagged `alwaysJoin` must be present whenever their
        # dimensions are, per the flag's name; include those from the given
        # data ID's graph as well.
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()
@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp, this is not a part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        # Iteration yields only the columns needed to reconstruct a
        # `DatasetRef`; ``ingestDate`` is deliberately excluded.
        yield from (self.id, self.runKey)
@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """

    def __init__(self) -> None:
        # Start with empty containers; callers populate these as tables are
        # joined into the query.
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results.
    (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        if self.keys or self.timespans or self.regions:
            return False
        return self.datasets is None

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last column is entirely for human readers of the
        # emitted SQL (e.g. developers debugging things): it makes it more
        # likely the key comes from the dimension's own table or a closely
        # related one.  The query constrains all of these columns to be
        # equal, so from the database's perspective the choice is arbitrary.
        candidates = self.keys[dimension]
        return candidates[-1]
@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """