Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 35%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary", "RegistryManagers"]  # other classes here are local to subpackage

from dataclasses import dataclass
from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Type, Union

from sqlalchemy.sql import ColumnElement

from lsst.utils.classes import cached_getter, immutable
from lsst.sphgeom import Region
from ...core import (
    DataCoordinate,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    NamedKeyMapping,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
    SpatialRegionDatabaseRepresentation,
    TimespanDatabaseRepresentation,
)
from ..interfaces import (
    CollectionManager,
    DatasetRecordStorageManager,
    DimensionRecordStorageManager,
)
from ..summaries import GovernorDimensionRestriction
# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc  # type: ignore


@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse. If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """
    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind
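
    # Hedged example (an illustrative sketch, not executed by this module): a
    # bind mapping lets a caller reference literal values by name instead of
    # formatting them into the expression string, e.g.
    #
    #     where = QueryWhereExpression(
    #         "instrument = 'HSC' AND visit = my_visit", bind={"my_visit": 12345}
    #     )
    #
    # A malformed expression raises `RuntimeError` at construction time.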

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        defaults: Optional[DataCoordinate] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap. If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions. Ignored
            unless ``check=True``.
        check : `bool`
            If `True` (default), check the query for consistency and inject
            default values into the data ID when needed. This may reject some
            valid queries that resemble common mistakes (e.g. queries for
            visits without specifying an instrument).
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition('.')
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} looks like a dimension column."
                    )
        restriction = GovernorDimensionRestriction(NamedKeyDict())
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs). That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs). That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor
                # Check the expression for consistency and completeness.
                visitor = CheckVisitor(dataId, graph, self._bind.keys(), defaults)
                try:
                    summary = expr.visit(visitor)
                except RuntimeError as err:
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = summary.governors
                dataId = visitor.dataId
            else:
                from .expressions import InspectionVisitor
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            from .expressions import InspectionSummary
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )
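
    # Hedged usage sketch (assumes ``graph`` is a `DimensionGraph` covering at
    # least the instrument and visit dimensions): attaching the expression
    # built in the example above yields a `QueryWhereClause` whose attributes
    # summarize what the string referenced, e.g.
    #
    #     clause = where.attach(graph)
    #     clause.dimensions.names   # expected to include "instrument", "visit"
    #     clause.restriction        # governor values implied by the expression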


@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        return NamedValueSet(
            e for e, c in self.columns.items() if TimespanDatabaseRepresentation.NAME in c
        ).freeze()
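
    # Hedged illustration (a sketch rather than a guarantee of the parser's
    # behaviour): ``temporal`` is derived from ``columns``, so an expression
    # that references ``exposure.timespan`` is expected to yield a clause
    # whose ``temporal`` set contains the ``exposure`` element, while purely
    # spatial or scalar column references contribute nothing here.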


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
        ``dataId.hasRecords()`` must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap. If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    check : `bool`
        If `True` (default), check the query for consistency. This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """
    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None,
                 bind: Optional[Mapping[str, Any]] = None,
                 defaults: Optional[DataCoordinate] = None,
                 datasets: Iterable[DatasetType] = (),
                 check: bool = True):
        self.requested = requested
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion, defaults=defaults,
                                       check=check)
        self.datasets = NamedValueSet(datasets).freeze()
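
    # Hedged construction sketch (assumes ``universe`` is a configured
    # `DimensionUniverse`): a summary for a query over visits, constrained by
    # a user expression, might look like
    #
    #     requested = DimensionGraph(universe, names={"instrument", "visit"})
    #     summary = QuerySummary(
    #         requested,
    #         expression="instrument = 'HSC' AND visit > 1000",
    #         check=True,
    #     )
    #
    # ``summary.where`` then holds the attached `QueryWhereClause`.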

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query. Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join. Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in the table of a dependent dimension
        element or dataset.
        """
        names = set(self.requested.names | self.where.dimensions.names)
        for dataset_type in self.datasets:
            names.update(dataset_type.dimensions.names)
        return DimensionGraph(self.universe, names=names)
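
    # Hedged sketch: the key set is the union of the requested dimensions, the
    # dimensions referenced by the WHERE expression, and the dimensions of any
    # declared dataset types (plus required dependencies pulled in by
    # `DimensionGraph`). For example, with hypothetical inputs:
    #
    #     requested = {"visit"}, a WHERE expression referencing "instrument",
    #     and a dataset type with dimensions {"instrument", "visit",
    #     "detector"} would yield a graph over all three dimensions.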

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()


@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp; this is not a part of
    `DatasetRef`, but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield self.id
        yield self.runKey


@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """
    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all.
        """
        return not (self.keys or self.timespans or self.regions or self.datasets is not None)

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in self.keys for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery. From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        return self.keys[dimension][-1]
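
    # Hedged usage sketch (``visit_dim`` and the column objects are assumed to
    # be created elsewhere during query construction):
    #
    #     columns = QueryColumns()
    #     columns.keys[visit_dim] = [visit_table_column, join_table_column]
    #     columns.getKeyColumn(visit_dim)  # returns join_table_column, the
    #                                      # last column appended
    #
    # Lookup by name (``columns.getKeyColumn("visit")``) also works because
    # ``keys`` is a `NamedKeyDict`, which accepts key names as well as keys.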


@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """