Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 41%
233 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 14:18 -0800
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 14:18 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage
25from dataclasses import dataclass
26from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Tuple, Type, Union
28from lsst.sphgeom import Region
29from lsst.utils.classes import cached_getter, immutable
30from sqlalchemy.sql import ColumnElement
32from ...core import (
33 DataCoordinate,
34 DatasetType,
35 Dimension,
36 DimensionElement,
37 DimensionGraph,
38 DimensionUniverse,
39 NamedKeyDict,
40 NamedKeyMapping,
41 NamedValueAbstractSet,
42 NamedValueSet,
43 SkyPixDimension,
44 SpatialRegionDatabaseRepresentation,
45 TimespanDatabaseRepresentation,
46)
47from .._exceptions import UserExpressionSyntaxError
48from ..interfaces import CollectionManager, DatasetRecordStorageManager, DimensionRecordStorageManager
49from ..summaries import GovernorDimensionRestriction
51# We're not trying to add typing to the lex/yacc parser code, so MyPy
52# doesn't know about some of these imports.
53from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc # type: ignore
54from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName
@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse.  If `None` (or an empty string), a
        where expression that always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.

    Raises
    ------
    UserExpressionSyntaxError
        Raised if ``expression`` cannot be parsed.
    """

    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise UserExpressionSyntaxError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap.  If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions.  Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed.  This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).

        Returns
        -------
        clause : `QueryWhereClause`
            Structure combining the parsed tree, the (possibly updated) data
            ID, the bind mapping, the region, and the dimensions, columns and
            governor restriction gathered from the expression.

        Raises
        ------
        RuntimeError
            Raised (only when ``check`` is `True`) if a bind key collides
            with a dimension element name or looks like an element column, or
            if the consistency check of the expression fails.
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            # The static element names do not depend on the bind key being
            # examined, so look them up once rather than twice per key.
            elementNames = graph.universe.getStaticElements().names
            for identifier in self._bind:
                if identifier in elementNames:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, _, column = identifier.partition(".")
                if column and table in elementNames:
                    raise RuntimeError(f"Bind parameter key {identifier!r} looks like a dimension column.")
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs).  That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs).  That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor

                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind, defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    # Re-raise with the user's expression (and its normalized
                    # form, when that differs) so the message is actionable.
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                # Use the visitor's data ID from here on, not the caller's.
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor

                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind))
        else:
            from .expressions import InspectionSummary

            # No expression at all: nothing was referenced.
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )
@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """The parsed string expression tree, or `None` if there was no string
    expression (`Node` or `None`).
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        # An element counts as temporal when the timespan column name is
        # among the columns the expression referenced for it.
        timespanName = TimespanDatabaseRepresentation.NAME
        referenced = NamedValueSet(
            element for element, names in self.columns.items() if timespanName in names
        )
        return referenced.freeze()
@dataclass(frozen=True)
class OrderByClauseColumn:
    """Description of one column appearing in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element that provides the data in this column
    (`DimensionElement`).
    """

    column: Optional[str]
    """Column name, or `None` to order by the primary key
    (`str` or `None`).
    """

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""
@immutable
class OrderByClause:
    """Class for information about columns in ORDER BY clause.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    graph : `DimensionGraph`
        Dimensions used by a query.

    Raises
    ------
    ValueError
        Raised if a name is empty or consists only of "-".
    """

    def __init__(self, order_by: Iterable[str], graph: DimensionGraph):
        columns: List[OrderByClauseColumn] = []
        for spec in order_by:
            if not spec or spec == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" requests descending order and is not part of the
            # name that gets categorized.
            descending = spec[0] == "-"
            bare = spec[1:] if descending else spec
            element, column = categorizeOrderByName(graph, bare)
            columns.append(OrderByClauseColumn(element=element, column=column, ordering=not descending))
        self.order_by_columns = columns

        # Only entries that name an explicit column contribute an element.
        self.elements = NamedValueSet(
            entry.element for entry in self.order_by_columns if entry.column is not None
        )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """

    elements: NamedValueSet[DimensionElement]
    """Dimension elements whose non-key columns were referenced by order_by
    (`NamedValueSet` [ `DimensionElement` ]).
    """
@immutable
class ElementOrderByClause:
    """Class for information about columns in ORDER BY clause for one element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Dimension element that every name in ``order_by`` is resolved
        against.

    Raises
    ------
    ValueError
        Raised if a name is empty or consists only of "-".
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        columns: List[OrderByClauseColumn] = []
        for spec in order_by:
            if not spec or spec == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            # A leading "-" requests descending order and is not part of the
            # name that gets categorized.
            descending = spec[0] == "-"
            bare = spec[1:] if descending else spec
            column = categorizeElementOrderByName(element, bare)
            columns.append(OrderByClauseColumn(element=element, column=column, ordering=not descending))
        self.order_by_columns = columns

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """
@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result rows
        of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.  ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap.  If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.  Must be
        `None` when ``expression`` is already a `QueryWhereExpression`.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query.  Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `Tuple`, optional
        Limit on the number of returned rows and optional offset.
    check : `bool`
        If `True` (default) check the query for consistency.  This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).

    Raises
    ------
    TypeError
        Raised if ``bind`` is given together with an ``expression`` that is
        neither `None` nor a `str`.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        dataId: Optional[DataCoordinate] = None,
        expression: Optional[Union[str, QueryWhereExpression]] = None,
        whereRegion: Optional[Region] = None,
        bind: Optional[Mapping[str, Any]] = None,
        defaults: Optional[DataCoordinate] = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Optional[Iterable[str]] = None,
        limit: Optional[Tuple[int, Optional[int]]] = None,
        check: bool = True,
    ):
        self.requested = requested
        if expression is None or isinstance(expression, str):
            # Raw strings (and a missing expression) are wrapped here,
            # together with the bind mapping.
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(
            self.requested, dataId=dataId, region=whereRegion, defaults=defaults, check=check
        )
        self.datasets = NamedValueSet(datasets).freeze()
        self.order_by = OrderByClause(order_by, requested) if order_by is not None else None
        self.limit = limit

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: Optional[OrderByClause]
    """Parsed ORDER BY information, or `None` if no ordering was requested
    (`OrderByClause` or `None`).
    """

    limit: Optional[Tuple[int, Optional[int]]]
    """Limit on the number of returned rows and optional offset, exactly as
    passed at construction (`tuple` or `None`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        #  - it is the element its spatial family chooses from the elements
        #    of `self.mustHaveKeysJoined`;
        #  - it isn't also given at query construction time (i.e. it is not
        #    part of the data ID's graph).
        joined = self.mustHaveKeysJoined
        chosen: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in joined.spatial:
            candidate = family.choose(joined.elements)
            assert isinstance(candidate, DimensionElement)
            if candidate not in self.where.dataId.graph.elements:
                chosen.add(candidate)
        if len(chosen) > 1:
            # There's a spatial join.  Those require the common SkyPix
            # system to be included in the query in order to connect them.
            chosen.add(self.universe.commonSkyPix)
        elif len(chosen) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if not self.where.dataId.graph.spatial:
                # There is no spatial join or filter in this query.  Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
            # We can only perform those filters against SkyPix dimensions,
            # so if what we have isn't one, add the common SkyPix dimension
            # to the query; the element we have will be joined to that.
            (only,) = chosen
            if not isinstance(only, SkyPixDimension):
                chosen.add(self.universe.commonSkyPix)
        return chosen.freeze()

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        temporalFamilies = self.mustHaveKeysJoined.temporal
        if len(temporalFamilies) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in table of a dependent dimension element or
        dataset.
        """
        # Start from the requested and WHERE-referenced dimensions, then fold
        # in the dimensions of every declared dataset type.
        names = set(self.requested.names | self.where.dimensions.names)
        names.update(*(datasetType.dimensions.names for datasetType in self.datasets))
        return DimensionGraph(self.universe, names=names)

    @property
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        tables = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        if self.order_by is not None:
            tables.update(self.order_by.elements)
        keyDimensions = self.mustHaveKeysJoined
        for dimension in keyDimensions:
            if dimension.implied:
                tables.add(dimension)
        for element in keyDimensions.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                tables.add(element)
        return tables.freeze()
@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.

    Iterating over an instance yields ``id`` and then ``runKey``;
    ``ingestDate`` is not included in the iteration.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`)."""

    id: ColumnElement
    """Column containing the unique integer ID for this dataset."""

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp; this is not a part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield from (self.id, self.runKey)
@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """

    def __init__(self) -> None:
        # Start with empty containers and no dataset columns.
        self.datasets = None
        self.regions = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.keys = NamedKeyDict()

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and
    `QuerySummary.where.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all."""
        return not self.keys and not self.timespans and not self.regions and self.datasets is None

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery.  From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        candidates = self.keys[dimension]
        return candidates[-1]
@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`)."""

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types
    (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`)."""

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """