Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 36%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary", "RegistryManagers"]  # other classes here are local to subpackage

from dataclasses import dataclass
from typing import AbstractSet, Any, Iterator, List, Mapping, Optional, Type, Union

from sqlalchemy.sql import ColumnElement

from lsst.sphgeom import Region
from ...core import (
    DataCoordinate,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    NamedKeyMapping,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
    SpatialRegionDatabaseRepresentation,
    TimespanDatabaseRepresentation,
)
from ...core.utils import cached_getter, immutable
from ..interfaces import (
    CollectionManager,
    DatasetRecordStorageManager,
    DimensionRecordStorageManager,
)
from ..summaries import GovernorDimensionRestriction
# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc  # type: ignore


@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse. If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """
    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap. If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing default values for governor dimensions.
            Ignored unless ``check=True``.
        check : `bool`
            If `True` (default) check the query for consistency and inject
            default values into the data ID when needed. This may reject
            some valid queries that resemble common mistakes (e.g. queries
            for visits without specifying an instrument).
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition('.')
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} looks like a dimension column."
                    )
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs). That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs). That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal form instead of conjunctive
                # (i.e. ANDs of ORs) because I think the worst case is a long
                # list of OR'd-together data IDs, which is already in or very
                # close to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor
                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind.keys(), defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            from .expressions import InspectionSummary
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )


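# The sketch below is illustrative only and is not used anywhere in this
# module: it shows the intended construct-then-attach sequence for
# `QueryWhereExpression`. The expression string and bind key are hypothetical,
# and ``graph`` is assumed to be a `DimensionGraph` obtained from an existing
# `Registry` that already includes the dimensions the expression references.
def _example_attach_where_expression(graph: DimensionGraph) -> QueryWhereClause:
    # Parse once up front; the bind value is substituted for the bare
    # ``my_visit`` identifier when the expression is turned into SQL.
    expression = QueryWhereExpression(
        "instrument = 'HSC' AND visit = my_visit",
        bind={"my_visit": 903334},
    )
    # ``attach`` checks the expression against the query's dimensions and
    # returns the frozen `QueryWhereClause` consumed by `QuerySummary`.
    return expression.attach(graph)

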
@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        return NamedValueSet(
            e for e, c in self.columns.items() if TimespanDatabaseRepresentation.NAME in c
        ).freeze()


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID. ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap. If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing default values for governor dimensions.
    check : `bool`
        If `True` (default) check the query for consistency. This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None,
                 bind: Optional[Mapping[str, Any]] = None,
                 defaults: Optional[DataCoordinate] = None,
                 check: bool = True):
        self.requested = requested
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion,
                                       defaults=defaults, check=check)

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query. Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join. Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in the table of a dependent dimension
        element or dataset.
        """
        names = set(self.requested.names | self.where.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()


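# The sketch below is illustrative only and is not used anywhere in this
# module: it shows how a `QuerySummary` is typically assembled before being
# handed to a `QueryBuilder`. ``universe`` is assumed to come from an existing
# `Registry`, and the dimension names and expression are hypothetical.
def _example_query_summary(universe: DimensionUniverse) -> QuerySummary:
    # The requested dimensions determine the primary keys present in result
    # rows; the string expression (and any bind mapping) is forwarded to
    # `QueryWhereExpression` and attached by the constructor.
    requested = DimensionGraph(universe, names={"instrument", "visit", "detector"})
    return QuerySummary(
        requested,
        expression="instrument = 'HSC' AND detector = 42",
    )

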
@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp; this is not part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield self.id
        yield self.runKey


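# The sketch below is illustrative only and is not used anywhere in this
# module: it gathers the columns needed to reconstruct a `DatasetRef` from a
# `DatasetQueryColumns` struct. Iterating the struct yields only the id and
# RUN-key columns; the ingest date, when present, must be added explicitly.
def _example_dataset_ref_columns(columns: DatasetQueryColumns) -> List[ColumnElement]:
    result = list(columns)  # [columns.id, columns.runKey]
    if columns.ingestDate is not None:
        result.append(columns.ingestDate)
    return result

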
@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all.
        """
        return not (self.keys or self.timespans or self.regions or self.datasets is not None)

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in ``self.keys`` for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery. From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        return self.keys[dimension][-1]


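# The sketch below is illustrative only and is not used anywhere in this
# module: it shows the incremental way `QueryColumns` is meant to be populated
# and what `getKeyColumn` returns. ``dimension`` is assumed to come from a
# real `DimensionUniverse`, the column names are hypothetical, and
# ``sqlalchemy.sql.column`` is plain SQLAlchemy.
def _example_query_columns(dimension: Dimension) -> ColumnElement:
    from sqlalchemy.sql import column

    columns = QueryColumns()
    # The same dimension key may be provided by several tables; the query is
    # expected to constrain all of these columns to be equal.
    columns.keys[dimension] = [column("visit"), column("visit_alias")]
    # getKeyColumn deterministically returns the last column in the list.
    return columns.getKeyColumn(dimension)

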
@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """