Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 36%
Shortcuts on this page:
r m x p : toggle line displays
j k : next/prev highlighted chunk
0 (zero) : top of page
1 (one) : first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage
25from dataclasses import dataclass
26from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Tuple, Type, Union
28from sqlalchemy.sql import ColumnElement
30from lsst.utils.classes import cached_getter, immutable
31from lsst.sphgeom import Region
32from ...core import (
33 DataCoordinate,
34 DatasetType,
35 Dimension,
36 DimensionElement,
37 DimensionGraph,
38 DimensionUniverse,
39 NamedKeyDict,
40 NamedKeyMapping,
41 NamedValueAbstractSet,
42 NamedValueSet,
43 SkyPixDimension,
44 SpatialRegionDatabaseRepresentation,
45 TimespanDatabaseRepresentation,
46)
47from ..interfaces import (
48 CollectionManager,
49 DatasetRecordStorageManager,
50 DimensionRecordStorageManager,
51)
52from ..summaries import GovernorDimensionRestriction
53# We're not trying to add typing to the lex/yacc parser code, so MyPy
54# doesn't know about some of these imports.
55from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore
56from .expressions.categorize import categorizeOrderByName
@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """
    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                # ``_tree`` holds the parsed expression tree; it is consumed
                # later by `attach`.
                self._tree = parser.parse(expression)
            except Exception as exc:
                # Chain the original lex/yacc error so the user sees both the
                # generic message and the underlying parser diagnostic.
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            # No expression given: equivalent to a WHERE clause that is
            # always true.
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing default for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        where : `QueryWhereClause`
            Struct bundling the expression tree, (possibly updated) data ID,
            referenced dimensions/columns, bind values, governor-dimension
            restriction, and spatial region.
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # Reject bind keys that shadow dimension-element names or look
            # like "element.column" references; those would be ambiguous in
            # the expression.
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition('.')
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} looks like a dimension column."
                    )
        # Start with an unconstrained governor-dimension restriction; it is
        # replaced below if checking the expression yields a tighter one.
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor
                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind.keys(), defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Report the error against the user's original expression;
                    # only mention the normalized form if it differs.
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    # ``from None`` hides the normal-form traceback, which is
                    # noise from the user's perspective.
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                # CheckVisitor may have injected defaults into the data ID.
                dataId = visitor.dataId
            else:
                # No consistency checking: just inspect which dimensions and
                # columns the expression references.
                from .expressions import InspectionVisitor
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            # No expression at all; an empty summary references nothing.
            from .expressions import InspectionSummary
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )
@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ])
        """
        # An element participates temporally iff its timespan column was
        # mentioned in the expression.
        referenced: NamedValueSet[DimensionElement] = NamedValueSet()
        for element, column_names in self.columns.items():
            if TimespanDatabaseRepresentation.NAME in column_names:
                referenced.add(element)
        return referenced.freeze()
@dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column appearing in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element providing the data for this column
    (`DimensionElement`)."""

    column: Optional[str]
    """Column name, or `None` to order by the element's primary key
    (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""
@immutable
class OrderByClause:
    """Parsed information about the columns of an ORDER BY clause.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    graph : `DimensionGraph`
        Dimensions used by a query.
    """
    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):
        parsed: List[OrderByClauseColumn] = []
        for item in order_by:
            # Reject empty names, including a bare "-" prefix with nothing
            # after it.
            if not item or item == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            descending = item.startswith("-")
            stripped = item[1:] if descending else item
            element, column = categorizeOrderByName(graph, stripped)
            parsed.append(
                OrderByClauseColumn(element=element, column=column, ordering=not descending)
            )
        self.order_by_columns = parsed
        # Only columns that reference a non-key column require the element's
        # table to be joined into the query.
        self.elements = NamedValueSet(
            col.element for col in parsed if col.column is not None
        )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """
@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing default for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `Tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None,
                 bind: Optional[Mapping[str, Any]] = None,
                 defaults: Optional[DataCoordinate] = None,
                 datasets: Iterable[DatasetType] = (),
                 order_by: Optional[Iterable[str]] = None,
                 limit: Optional[Tuple[int, Optional[int]]] = None,
                 check: bool = True):
        self.requested = requested
        # Normalize ``expression`` to a QueryWhereExpression; ``bind`` is only
        # meaningful when we construct that object here.
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion, defaults=defaults,
                                       check=check)
        self.datasets = NamedValueSet(datasets).freeze()
        # Parsed ORDER BY specification, or `None` when no ordering was
        # requested (`OrderByClause` or `None`).
        self.order_by = None if order_by is None else OrderByClause(order_by, requested)
        # Row-count limit and optional offset, passed through unmodified
        # (`tuple` [ `int`, `int` or `None` ] or `None`).
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Union of the requested dimensions, those referenced by the WHERE
        # expression, and those required by any declared dataset types.
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        # Tables are needed for spatial/temporal participation and for any
        # non-key columns referenced by WHERE or ORDER BY.
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            result.update(self.order_by.elements)
        # Implied dimensions have no foreign keys pointing at them from
        # dependents, so their own tables must be joined to constrain them.
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        # Elements flagged alwaysJoin must be present whenever their required
        # dimensions are.
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()
@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp; this is not a part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        # Only the columns needed for a DatasetRef are iterated;
        # ``ingestDate`` is intentionally excluded.
        return iter((self.id, self.runKey))
@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results.
    (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all.
        """
        # Any populated container, or a dataset-columns struct, means the
        # query is non-empty.
        if self.keys or self.timespans or self.regions:
            return False
        return self.datasets is None

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        candidates = self.keys[dimension]
        return candidates[-1]
@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """