Coverage for python/lsst/daf/butler/registry/queries/_structs.py : 33%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary", "RegistryManagers"]  # other classes here are local to subpackage

from dataclasses import dataclass
from typing import AbstractSet, Any, Iterator, List, Mapping, Optional, Type, Union

from sqlalchemy.sql import ColumnElement

from lsst.sphgeom import Region
from ...core import (
    DataCoordinate,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    NamedKeyMapping,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
    SpatialRegionDatabaseRepresentation,
    TimespanDatabaseRepresentation,
)
from ...core.utils import cached_getter, immutable
from ..interfaces import (
    CollectionManager,
    DatasetRecordStorageManager,
    DimensionRecordStorageManager,
)
from ..wildcards import GovernorDimensionRestriction

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import Node, NormalForm, NormalFormExpression, ParserYacc  # type: ignore


@immutable
class QueryWhereExpression:
    """A struct representing a parsed user-provided WHERE expression.

    Parameters
    ----------
    expression : `str`, optional
        The string expression to parse. If `None`, a where expression that
        always evaluates to `True` is implied.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    """

    def __init__(self, expression: Optional[str] = None, bind: Optional[Mapping[str, Any]] = None):
        if expression:
            try:
                parser = ParserYacc()
                self._tree = parser.parse(expression)
            except Exception as exc:
                raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
            assert self._tree is not None
        else:
            self._tree = None
        if bind is None:
            bind = {}
        self._bind = bind

    def attach(
        self,
        graph: DimensionGraph,
        dataId: Optional[DataCoordinate] = None,
        region: Optional[Region] = None,
        check: bool = True,
    ) -> QueryWhereClause:
        """Allow this expression to be attached to a `QuerySummary` by
        transforming it into a `QueryWhereClause`, while checking it for both
        internal consistency and consistency with the rest of the query.

        Parameters
        ----------
        graph : `DimensionGraph`
            The dimensions the query would include in the absence of this
            WHERE expression.
        dataId : `DataCoordinate`, optional
            A fully-expanded data ID identifying dimensions known in advance.
            If not provided, will be set to an empty data ID.
            ``dataId.hasRecords()`` must return `True`.
        region : `lsst.sphgeom.Region`, optional
            A spatial region that all rows must overlap. If `None` and
            ``dataId`` is not `None`, ``dataId.region`` will be used.
        check : `bool`
            If `True` (default) check the query for consistency. This may
            reject some valid queries that resemble common mistakes (e.g.
            queries for visits without specifying an instrument).
        """
        if region is None and dataId is not None:
            region = dataId.region
        if dataId is None:
            dataId = DataCoordinate.makeEmpty(graph.universe)
        if self._bind and check:
            for identifier in self._bind:
                if identifier in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} conflicts with a dimension element."
                    )
                table, sep, column = identifier.partition('.')
                if column and table in graph.universe.getStaticElements().names:
                    raise RuntimeError(
                        f"Bind parameter key {identifier!r} looks like a dimension column."
                    )
        restriction = GovernorDimensionRestriction(graph.universe)
        summary: InspectionSummary
        if self._tree is not None:
            if check:
                # Convert the expression to disjunctive normal form (ORs of
                # ANDs). That's potentially super expensive in the general
                # case (where there's a ton of nesting of ANDs and ORs). That
                # won't be the case for the expressions we expect, and we
                # actually use disjunctive normal instead of conjunctive (i.e.
                # ANDs of ORs) because I think the worst-case is a long list
                # of OR'd-together data IDs, which is already in or very close
                # to disjunctive normal form.
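                # For example, an expression like
                # "instrument = 'HSC' AND (visit = 3 OR visit = 4)" normalizes
                # to "(instrument = 'HSC' AND visit = 3) OR
                # (instrument = 'HSC' AND visit = 4)".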
                expr = NormalFormExpression.fromTree(self._tree, NormalForm.DISJUNCTIVE)
                from .expressions import CheckVisitor
                # Check the expression for consistency and completeness.
                try:
                    summary = expr.visit(CheckVisitor(dataId, graph, self._bind.keys()))
                except RuntimeError as err:
                    exprOriginal = str(self._tree)
                    exprNormal = str(expr.toTree())
                    if exprNormal == exprOriginal:
                        msg = f'Error in query expression "{exprOriginal}": {err}'
                    else:
                        msg = (
                            f'Error in query expression "{exprOriginal}" '
                            f'(normalized to "{exprNormal}"): {err}'
                        )
                    raise RuntimeError(msg) from None
                restriction = GovernorDimensionRestriction(
                    graph.universe,
                    **summary.governors.byName(),
                )
            else:
                from .expressions import InspectionVisitor
                summary = self._tree.visit(InspectionVisitor(graph.universe, self._bind.keys()))
        else:
            from .expressions import InspectionSummary
            summary = InspectionSummary()
        return QueryWhereClause(
            self._tree,
            dataId,
            dimensions=summary.dimensions,
            columns=summary.columns,
            bind=self._bind,
            restriction=restriction,
            region=region,
        )
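
# A minimal usage sketch for QueryWhereExpression (hypothetical values; a
# ``DimensionGraph`` instance named ``graph`` is assumed to be in hand):
#
#     expr = QueryWhereExpression("instrument = 'HSC' AND visit = my_visit",
#                                 bind={"my_visit": 903334})
#     where = expr.attach(graph)  # returns a QueryWhereClause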


@dataclass(frozen=True)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereExpression.attach`, which guarantees the consistency of its
    attributes.
    """

    tree: Optional[Node]
    """A parsed string expression tree, or `None` if there was no string
    expression.
    """

    dataId: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).

    ``dataId.hasRecords()`` is guaranteed to return `True`.
    """

    dimensions: NamedValueAbstractSet[Dimension]
    """Dimensions whose primary keys or dependencies were referenced anywhere
    in the string expression (`NamedValueAbstractSet` [ `Dimension` ]).
    """

    columns: NamedKeyMapping[DimensionElement, AbstractSet[str]]
    """Dimension element tables whose non-key columns were referenced anywhere
    in the string expression
    (`NamedKeyMapping` [ `DimensionElement`, `Set` [ `str` ] ]).
    """

    bind: Mapping[str, Any]
    """Mapping containing literal values that should be injected into the
    query expression, keyed by the identifiers they replace (`Mapping`).
    """

    region: Optional[Region]
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    restriction: GovernorDimensionRestriction
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression or data ID
    (`GovernorDimensionRestriction`).
    """

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans are referenced by this
        expression (`NamedValueAbstractSet` [ `DimensionElement` ]).
        """
        return NamedValueSet(
            e for e, c in self.columns.items() if TimespanDatabaseRepresentation.NAME in c
        ).freeze()


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID. ``dataId.hasRecords()``
        must return `True`.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression.
    whereRegion : `lsst.sphgeom.Region`, optional
        A spatial region that all rows must overlap. If `None` and ``dataId``
        is not `None`, ``dataId.region`` will be used.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    check : `bool`
        If `True` (default) check the query for consistency. This may reject
        some valid queries that resemble common mistakes (e.g. queries for
        visits without specifying an instrument).
    """

    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[DataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 whereRegion: Optional[Region] = None,
                 bind: Optional[Mapping[str, Any]] = None,
                 check: bool = True):
        self.requested = requested
        if expression is None:
            expression = QueryWhereExpression(None, bind)
        elif isinstance(expression, str):
            expression = QueryWhereExpression(expression, bind)
        elif bind is not None:
            raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
        self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion, check=check)

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property  # type: ignore
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.mustHaveKeysJoined.spatial:
            element = family.choose(self.mustHaveKeysJoined.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.dataId.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.dataId.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                element, = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query. Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        elif len(result) > 1:
            # There's a spatial join. Those require the common SkyPix
            # system to be included in the query in order to connect them.
            result.add(self.universe.commonSkyPix)
        return result.freeze()

    @property  # type: ignore
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        if len(self.mustHaveKeysJoined.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        return self.where.temporal

    @property  # type: ignore
    @cached_getter
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its table
        via a foreign key column in the table of a dependent dimension element
        or dataset.
        """
        names = set(self.requested.names | self.where.dimensions.names)
        return DimensionGraph(self.universe, names=names)

    @property  # type: ignore
    @cached_getter
    def mustHaveTableJoined(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        result = NamedValueSet(self.spatial | self.temporal | self.where.columns.keys())
        for dimension in self.mustHaveKeysJoined:
            if dimension.implied:
                result.add(dimension)
        for element in self.mustHaveKeysJoined.union(self.where.dataId.graph).elements:
            if element.alwaysJoin:
                result.add(element)
        return result.freeze()
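
# A minimal usage sketch for QuerySummary (hypothetical values; ``universe`` is
# assumed to be an existing `DimensionUniverse`):
#
#     graph = DimensionGraph(universe, names={"instrument", "visit", "detector"})
#     summary = QuerySummary(graph, expression="instrument = 'HSC' AND visit > 100")
#     elements = summary.mustHaveTableJoined  # tables the query builder must join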


@dataclass
class DatasetQueryColumns:
    """A struct containing the columns used to reconstruct `DatasetRef`
    instances from query results.
    """

    datasetType: DatasetType
    """The dataset type being queried (`DatasetType`).
    """

    id: ColumnElement
    """Column containing the unique integer ID for this dataset.
    """

    runKey: ColumnElement
    """Foreign key column to the `~CollectionType.RUN` collection that holds
    this dataset.
    """

    ingestDate: Optional[ColumnElement]
    """Column containing the ingest timestamp. This is not a part of
    `DatasetRef` but it comes from the same table.
    """

    def __iter__(self) -> Iterator[ColumnElement]:
        yield self.id
        yield self.runKey
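
# Note: because ``__iter__`` yields the ``id`` and ``runKey`` columns (in that
# order), an instance can be unpacked directly, e.g.
# ``id_col, run_key_col = dataset_columns`` (illustrative names).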


@dataclass
class QueryColumns:
    """A struct organizing the columns in an under-construction or currently-
    executing query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """

    def __init__(self) -> None:
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = None

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns that correspond to the primary key values of dimensions
    (`NamedKeyDict` mapping `Dimension` to a `list` of `ColumnElement`).

    Each value list contains columns from multiple tables corresponding to the
    same dimension, and the query should constrain the values of those columns
    to be the same.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.dataId.graph`.
    """

    timespans: NamedKeyDict[DimensionElement, TimespanDatabaseRepresentation]
    """Columns that correspond to timespans for elements that participate in a
    temporal join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `TimespanDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, SpatialRegionDatabaseRepresentation]
    """Columns that correspond to regions for elements that participate in a
    spatial join or filter in the query (`NamedKeyDict` mapping
    `DimensionElement` to `SpatialRegionDatabaseRepresentation`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: Optional[DatasetQueryColumns]
    """Columns that can be used to construct `DatasetRef` instances from query
    results (`DatasetQueryColumns` or `None`).
    """

    def isEmpty(self) -> bool:
        """Return `True` if this query has no columns at all.
        """
        return not (self.keys or self.timespans or self.regions or self.datasets is not None)

    def getKeyColumn(self, dimension: Union[Dimension, str]) -> ColumnElement:
        """Return one of the columns in ``self.keys`` for the given dimension.

        The column selected is an implementation detail but is guaranteed to
        be deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension` or `str`
            Dimension for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # Choosing the last element here is entirely for human readers of the
        # query (e.g. developers debugging things); it makes it more likely a
        # dimension key will be provided by the dimension's own table, or
        # failing that, some closely related dimension, which might be less
        # surprising to see than e.g. some dataset subquery. From the
        # database's perspective this is entirely arbitrary, because the query
        # guarantees they all have equal values.
        return self.keys[dimension][-1]
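
# A minimal sketch of how a query builder might populate a ``QueryColumns``
# instance (``universe`` and ``visit_table`` are hypothetical, pre-existing
# objects; the column name is illustrative):
#
#     columns = QueryColumns()
#     columns.keys[universe["visit"]] = [visit_table.columns["id"]]
#     visit_key = columns.getKeyColumn("visit")  # returns the last column added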


@dataclass
class RegistryManagers:
    """Struct used to pass around the manager objects that back a `Registry`
    and are used internally by the query system.
    """

    collections: CollectionManager
    """Manager for collections (`CollectionManager`).
    """

    datasets: DatasetRecordStorageManager
    """Manager for datasets and dataset types (`DatasetRecordStorageManager`).
    """

    dimensions: DimensionRecordStorageManager
    """Manager for dimensions (`DimensionRecordStorageManager`).
    """

    TimespanReprClass: Type[TimespanDatabaseRepresentation]
    """Type that encapsulates how timespans are represented in this database
    (`type`; subclass of `TimespanDatabaseRepresentation`).
    """