Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 36%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage
25from dataclasses import dataclass
26from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Tuple, Type, Union
28from lsst.sphgeom import Region
29from lsst.utils.classes import cached_getter, immutable
30from sqlalchemy.sql import ColumnElement
32from ...core import (
33 DataCoordinate,
34 DatasetType,
35 Dimension,
36 DimensionElement,
37 DimensionGraph,
38 DimensionUniverse,
39 NamedKeyDict,
40 NamedKeyMapping,
41 NamedValueAbstractSet,
42 NamedValueSet,
43 SkyPixDimension,
44 SpatialRegionDatabaseRepresentation,
45 TimespanDatabaseRepresentation,
46)
47from ..interfaces import CollectionManager, DatasetRecordStorageManager, DimensionRecordStorageManager
48from ..summaries import GovernorDimensionRestriction
50# We're not trying to add typing to the lex/yacc parser code, so MyPy
51# doesn't know about some of these imports.
52from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore
53from .expressions.categorize import categorizeOrderByName
@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Raises
    ------
    RuntimeError
        Raised (chained from the parser's own exception) if ``expression``
        cannot be parsed.
    """

    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                # Re-raise as RuntimeError but chain the parser's exception so
                # the original syntax-error details remain available.
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            # No expression given; equivalent to a WHERE clause that is
            # always true.
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            Struct combining this expression with the given data ID and
            region, plus the dimensions and columns the expression was found
            to reference.

        Raises
        ------
        RuntimeError
            Raised if a bind-parameter key shadows a dimension element or
            column, or if the expression fails the consistency check.
        """
        # Fall back to the data ID's region when no explicit region is given.
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # Guard against bind keys that shadow dimension elements or look
            # like "<element>.<column>" references; either would make the
            # expression ambiguous.
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition(".")
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                # NOTE(review): imported locally rather than at module scope —
                # presumably to avoid an import cycle with .expressions;
                # confirm before moving.
                from .expressions import CheckVisitor

                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind.keys(), defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Report errors in terms of the user's original expression;
                    # include the normalized form only when it differs.
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    # Suppress the chained traceback; the message already
                    # carries the useful context.
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                # The visitor may have injected defaults into the data ID.
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor

                # No consistency checking; just gather the dimensions and
                # columns the expression references.
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            from .expressions import InspectionSummary

            # No string expression at all; use an empty summary.
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )
@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """Parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """Data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies appear anywhere in the
    string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Mapping from dimension element to the names of its non-key columns
    referenced anywhere in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Literal values to be injected into the query expression, keyed by the
    identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """Spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ])
        """
        referenced = [
            element
            for element, column_names in self.columns.items()
            if TimespanDatabaseRepresentation.NAME in column_names
        ]
        return NamedValueSet(referenced).freeze()
@dataclass(frozen=True)
class OrderByClauseColumn:
    """Description of a single column appearing in an ORDER BY clause."""

    element: DimensionElement
    """The dimension element providing this column's data
    (`DimensionElement`).
    """

    column: Optional[str]
    """Column name, or `None` to indicate the element's primary key
    (`str` or `None`).
    """

    ordering: bool
    """`True` if the sort is ascending, `False` if descending (`bool`)."""
@immutable
class OrderByClause:
    """Class for information about columns in ORDER BY clause

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    graph : `DimensionGraph`
        Dimensions used by a query.

    Raises
    ------
    ValueError
        Raised if a name is empty (or consists of only the "-" prefix).
    """

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):

        columns: List[OrderByClauseColumn] = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" requests descending order for this column.
            descending = name.startswith("-")
            if descending:
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=not descending)
            )
        self.order_by_columns = columns

        self.elements = NamedValueSet(
            item.element for item in self.order_by_columns if item.column is not None
        )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """
@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result rows
        of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.  May only be
        passed when ``expression`` is a `str` or `None`.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `Tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).

    Raises
    ------
    TypeError
        Raised if ``bind`` is given but ``expression`` is already a
        `QueryWhereExpression` (which carries its own bind parameters).
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: Optional[DataCoordinate] = None,
        expression: Optional[Union[str, QueryWhereExpression]] = None,
        whereRegion: Optional[Region] = None,
        bind: Optional[Mapping[str, Any]] = None,
        defaults: Optional[DataCoordinate] = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        check: bool = True,
    ):
        self.requested = requested
        # Normalize ``expression`` into a QueryWhereExpression; ``bind`` is
        # only meaningful when we construct that object ourselves.
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = None if order_by is None else OrderByClause(order_by, requested)
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Union of the requested dimensions, everything referenced by the
        # WHERE expression, and the dimensions of any declared dataset types.
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        # Start with everything needed for spatial/temporal joins and any
        # element whose non-key columns the WHERE expression references.
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            # ORDER BY on non-key columns needs those elements' tables too.
            result.update(self.order_by.elements)
        for dimension in self.mustHaveKeysJoined:
            # NOTE(review): implied dimensions appear to require their own
            # table to obtain their keys — confirm against the dimension
            # system's foreign-key conventions.
            if dimension.implied:
                result.add(dimension)
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()
@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp, this is not a part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        # Iteration deliberately covers only the columns that make up a
        # DatasetRef; ingestDate is excluded.
        return iter((self.id, self.runKey))
@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """

    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results.
    (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        if self.keys or self.timespans or self.regions:
            return False
        return self.datasets is None

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        candidates = self.keys[dimension]
        return candidates[-1]
@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """