Coverage for python/lsst/daf/butler/registry/queries/_structs.py : 38%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["QuerySummary"] # other classes here are local to subpackage
25import enum
26from dataclasses import dataclass
27from typing import Optional, Tuple, List, Set, Union
29from sqlalchemy.sql import ColumnElement, bindparam
31from ...core import (
32 DatasetType,
33 Dimension,
34 DimensionElement,
35 DimensionGraph,
36 DimensionUniverse,
37 ExpandedDataCoordinate,
38 SkyPixDimension,
39 Timespan,
40)
41from ...core.utils import NamedValueSet, NamedKeyDict
42from .exprParser import Node, ParserYacc
class GivenTime(enum.Enum):
    """Enumeration of the points in a query's lifetime at which a data ID
    value may be supplied as a constraint (if it is supplied at all).
    """

    NOT_GIVEN = 0
    """The value is never supplied as a constraint on the query.
    """

    AT_CONSTRUCTION = 1
    """The value is supplied when the query is constructed, and can hence be
    obtained from `QuerySummary.dataId`.
    """

    AT_EXECUTION = 2
    """The value is supplied only when the query is executed, and must be
    included in the data ID passed to `Query.execute` or `Query.bind`.
    """
@dataclass
class QueryWhereExpression:
    """Parsed form of a user-supplied WHERE expression.

    Parameters
    ----------
    universe : `DimensionUniverse`
        All known dimensions.
    expression : `str`, optional
        The string expression to parse.  If `None` or empty, nothing is
        parsed and the struct describes an expression that references no
        dimensions.

    Raises
    ------
    RuntimeError
        Raised (chained to the underlying exception) if the parser cannot
        be created or the expression cannot be parsed.
    """

    tree: Optional[Node]
    """Root of the parsed expression tree, or `None` when no expression was
    given (`Node` or `None`).
    """

    keys: NamedValueSet[Dimension]
    """Every dimension whose key is referenced by the expression
    (`NamedValueSet` of `Dimension`).
    """

    metadata: NamedKeyDict[DimensionElement, Set[str]]
    """Metadata fields referenced by the expression, grouped by the dimension
    element they belong to (`NamedKeyDict` mapping `DimensionElement` to a
    `set` of field names).
    """

    def __init__(self, universe: DimensionUniverse, expression: Optional[str] = None):
        if not expression:
            # Nothing to parse: record an empty expression and stop here.
            self.tree = None
            self.keys = NamedValueSet()
            self.metadata = NamedKeyDict()
            return
        # Imported lazily, only when there is something to parse (presumably
        # to sidestep an import cycle - confirm before hoisting this to
        # module scope).
        from .expressions import InspectionVisitor
        try:
            self.tree = ParserYacc().parse(expression)
        except Exception as exc:
            raise RuntimeError(f"Failed to parse user expression `{expression}'.") from exc
        # Walk the tree once to collect the dimensions and metadata fields
        # that the expression mentions.
        inspector = InspectionVisitor(universe)
        self.tree.visit(inspector)
        self.keys = inspector.keys
        self.metadata = inspector.metadata
@dataclass
class QuerySummary:
    """A struct that records and classifies the dimensions taking part in a
    query.

    A `QuerySummary` instance is required in order to construct a
    `QueryBuilder`, and it has to cover every dimension that will appear in
    the query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    dataId : `ExpandedDataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance.  If
        not provided, will be set to an empty data ID.
    expression : `str` or `QueryWhereExpression`, optional
        A user-provided string WHERE expression; an already-parsed
        `QueryWhereExpression` is used as-is.
    given : `DimensionGraph`, optional
        Dimensions that will be fully identified before the query is
        executed, if not necessarily provided (in ``dataId``) now.  If
        provided, must be a superset of ``dataId.graph``; if not provided,
        will be set to ``dataId.graph``.
    entire : `NamedValueSet` of `DimensionElement`, optional
        Dimension elements that should be fully included in any spatial or
        temporal join, including child elements that would not otherwise be
        included in that join.  For example, passing "visit" here in a query
        constrained to a single tract would include all visit+detector
        combinations in any visit that overlaps that tract, not just the
        visit+detector combinations that directly overlap the tract.
    """

    requested: DimensionGraph
    """Dimensions whose primary keys should appear in the result rows of the
    query (`DimensionGraph`).
    """

    dataId: ExpandedDataCoordinate
    """Data ID identifying the dimensions already known when the query is
    constructed (`ExpandedDataCoordinate`).
    """

    expression: QueryWhereExpression
    """Information about the parsed user WHERE expression, if any
    (`QueryWhereExpression`).
    """

    given: DimensionGraph
    """All dimensions whose primary keys are fully identified before the query
    is executed (`DimensionGraph`).
    """

    entire: NamedValueSet[DimensionElement]
    """Dimension elements that should be fully included when they overlap other
    elements spatially or temporally (`NamedValueSet` of `DimensionElement`).

    For example, including the visit dimension here in a query that also
    requests the detector dimension and has a user expression on tract will
    result in all visit+detector combinations being returned for any visits
    that overlap the tract, rather than just the visit+detector combinations
    that directly overlap the tract.
    """

    def __init__(self, requested: DimensionGraph, *,
                 dataId: Optional[ExpandedDataCoordinate] = None,
                 expression: Optional[Union[str, QueryWhereExpression]] = None,
                 given: Optional[DimensionGraph] = None,
                 entire: Optional[NamedValueSet[DimensionElement]] = None):
        # Fill in defaults first, then validate, then parse the expression,
        # so that a bad ``given`` is reported before any parsing is attempted.
        if dataId is None:
            dataId = ExpandedDataCoordinate(requested.universe.empty, ())
        if given is None:
            given = dataId.graph
        # NOTE(review): this consistency check is an ``assert`` and so is
        # skipped entirely when Python runs with ``-O``.
        assert given.issuperset(dataId.graph)
        if not isinstance(expression, QueryWhereExpression):
            expression = QueryWhereExpression(requested.universe, expression)
        if entire is None:
            entire = NamedValueSet()
        self.requested = requested
        self.dataId = dataId
        self.given = given
        self.expression = expression
        self.entire = entire

    def whenIsDimensionGiven(self, dimension: Dimension) -> GivenTime:
        """Report when (if ever) the given dimension is identified in the
        WHERE clause.

        Parameters
        ----------
        dimension : `Dimension`
            The dimension to look up.

        Returns
        -------
        when : `GivenTime`
            `GivenTime.AT_CONSTRUCTION` if the dimension is in
            ``dataId.graph``, `GivenTime.AT_EXECUTION` if it is only in
            ``given``, and `GivenTime.NOT_GIVEN` otherwise.
        """
        if dimension in self.dataId.graph:
            return GivenTime.AT_CONSTRUCTION
        if dimension in self.given:
            return GivenTime.AT_EXECUTION
        return GivenTime.NOT_GIVEN

    def whenIsRegionGiven(self) -> GivenTime:
        """Report when (if ever) a region is provided in the WHERE clause.

        Returns
        -------
        when : `GivenTime`
            `GivenTime.NOT_GIVEN` if ``given`` has no spatial dimensions,
            `GivenTime.AT_CONSTRUCTION` if its spatial dimensions equal those
            of ``dataId.graph``, and `GivenTime.AT_EXECUTION` otherwise.
        """
        givenSpatial = self.given.spatial
        if not givenSpatial:
            return GivenTime.NOT_GIVEN
        if givenSpatial == self.dataId.graph.spatial:
            return GivenTime.AT_CONSTRUCTION
        return GivenTime.AT_EXECUTION

    def whenIsTimespanGiven(self) -> GivenTime:
        """Report when (if ever) a timespan is provided in the WHERE clause.

        Returns
        -------
        when : `GivenTime`
            `GivenTime.NOT_GIVEN` if ``given`` has no temporal dimensions,
            `GivenTime.AT_CONSTRUCTION` if its temporal dimensions equal those
            of ``dataId.graph``, and `GivenTime.AT_EXECUTION` otherwise.
        """
        givenTemporal = self.given.temporal
        if not givenTemporal:
            return GivenTime.NOT_GIVEN
        if givenTemporal == self.dataId.graph.temporal:
            return GivenTime.AT_CONSTRUCTION
        return GivenTime.AT_EXECUTION

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`).
        """
        return self.requested.universe

    @property
    def spatial(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueSet` of `DimensionElement`).
        """
        # Start from the preferred spatial elements of the dimensions whose
        # keys are joined, and drop anything that is already given at query
        # construction or execution time.
        candidates = self.mustHaveKeysJoined.getSpatial(prefer=self.entire) - self.given.elements
        # The count is taken once, before any element is added below.
        count = len(candidates)
        if count > 1:
            # A spatial join: the common SkyPix system has to be part of the
            # query in order to connect the elements to each other.
            candidates.add(self.universe.commonSkyPix)
        elif count == 1:
            if not self.given.spatial:
                # Neither a spatial join nor a spatial filter.  Even if the
                # lone element carries spatial information, this query does
                # not need it.
                return NamedValueSet()
            # No join, but there may be a WHERE filter on a given region.
            # Such filters can only be applied to SkyPix dimensions, so when
            # the lone element is not one, bring in the common SkyPix
            # dimension; the element will be joined to that.
            (only,) = candidates
            if not isinstance(only, SkyPixDimension):
                candidates.add(self.universe.commonSkyPix)
        return candidates

    @property
    def temporal(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueSet` of `DimensionElement`).
        """
        # Start from the preferred temporal elements of the dimensions whose
        # keys are joined, and drop anything that is already given at query
        # construction or execution time.
        candidates = self.mustHaveKeysJoined.getTemporal(prefer=self.entire) - self.given.elements
        # A lone element with no given temporal dimension means there is no
        # temporal join or filter, so its timespan is not needed.
        needed = len(candidates) != 1 or self.given.getTemporal()
        return candidates if needed else NamedValueSet()

    @property
    def mustHaveKeysJoined(self) -> DimensionGraph:
        """Dimensions whose primary keys must be used in the JOIN ON clauses
        of the query, even if their tables do not appear (`DimensionGraph`).

        A `Dimension` primary key can appear in a join clause without its
        table via a foreign key column in the table of a dependent dimension
        element or dataset.
        """
        # Union of the requested dimensions and those the user expression
        # refers to by key.
        return DimensionGraph(
            self.universe,
            names=set(self.requested.names | self.expression.keys.names),
        )

    @property
    def mustHaveTableJoined(self) -> NamedValueSet[DimensionElement]:
        """Dimension elements whose associated tables must appear in the
        query's FROM clause (`NamedValueSet` of `DimensionElement`).
        """
        tables = self.spatial | self.temporal | self.expression.metadata.keys()
        # Also include every key-joined dimension that has implied
        # dimensions.
        for candidate in self.mustHaveKeysJoined:
            if candidate.implied:
                tables.add(candidate)
        return tables
@dataclass
class QueryColumns:
    """A struct that organizes the columns of a query that is being built or
    executed.

    Takes no parameters at construction; the expected usage is to fill its
    container attributes incrementally.
    """

    keys: NamedKeyDict[Dimension, List[ColumnElement]]
    """Columns holding the primary key values of dimensions (`NamedKeyDict`
    mapping `Dimension` to a `list` of `ColumnElement`).

    Each list gathers the columns from different tables that refer to the
    same dimension; the query should constrain all of them to have the same
    value.

    In a `Query`, the keys of this dictionary must include at least the
    dimensions in `QuerySummary.requested` and `QuerySummary.given`.
    """

    timespans: NamedKeyDict[DimensionElement, Timespan[ColumnElement]]
    """Columns holding the timespans of elements that take part in a temporal
    join or filter in the query (`NamedKeyDict` mapping `DimensionElement` to
    `Timespan` of `ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.temporal`.
    """

    regions: NamedKeyDict[DimensionElement, ColumnElement]
    """Columns holding the regions of elements that take part in a spatial
    join or filter in the query (`NamedKeyDict` mapping `DimensionElement` to
    `ColumnElement`).

    In a `Query`, the keys of this dictionary must be exactly the elements
    in `QuerySummary.spatial`.
    """

    datasets: NamedKeyDict[DatasetType, Tuple[ColumnElement, Optional[ColumnElement]]]
    """Columns holding the ``dataset_id`` and, optionally, the collection rank
    of a dataset in the query (`NamedKeyDict` mapping `DatasetType` to `tuple`
    of `ColumnElement`).

    "Collection rank" here is the index of the collection in which this
    dataset was found in the list of collections to search; a lower rank
    corresponds to a collection that appears earlier in the search path.
    """

    def __init__(self):
        self.keys = NamedKeyDict()
        self.timespans = NamedKeyDict()
        self.regions = NamedKeyDict()
        self.datasets = NamedKeyDict()

    def getKeyColumn(self, dimension: Dimension) -> ColumnElement:
        """Return one of the columns in ``self.keys`` for the given dimension.

        Which column is selected is an implementation detail, but the choice
        is deterministic and consistent across multiple calls.

        Parameters
        ----------
        dimension : `Dimension`
            Element for which to obtain a key column.

        Returns
        -------
        column : `sqlalchemy.sql.ColumnElement`
            SQLAlchemy column object.
        """
        # The last column is picked purely for the benefit of people reading
        # the generated query (e.g. developers debugging it): it is more
        # likely to come from the dimension's own table or, failing that, a
        # closely related dimension, which is less surprising than e.g. some
        # dataset subquery.  To the database the choice is arbitrary, because
        # the query guarantees that all of these columns have equal values.
        candidates = self.keys[dimension]
        return candidates[-1]
@dataclass
class QueryParameters:
    """A struct managing deferred bind parameters in a query.

    Takes no parameters at construction, as expected usage is to add elements
    to its container attributes incrementally.
    """

    def __init__(self):
        self.keys = NamedKeyDict()
        self.timespan = None
        self.skypix = NamedKeyDict()

    keys: NamedKeyDict[Dimension, bindparam]
    """Bind parameters that correspond to dimension primary key values
    (`NamedKeyDict` mapping `Dimension` to `sqlalchemy.sql.bindparam`).

    In a `Query`, the keys of this dictionary are the subset of
    `QuerySummary.given` for which `QuerySummary.whenIsDimensionGiven`
    returns `GivenTime.AT_EXECUTION`.
    """

    timespan: Optional[Timespan[bindparam]]
    """Bind parameters that correspond to timespans (`Timespan` of
    `sqlalchemy.sql.bindparam`), or `None`.

    In a `Query`, this is not `None` if and only if
    `QuerySummary.whenIsTimespanGiven` returns `GivenTime.AT_EXECUTION`.
    """

    skypix: NamedKeyDict[SkyPixDimension, bindparam]
    """Bind parameters that correspond to skypix IDs (`NamedKeyDict` mapping
    `SkyPixDimension` to `sqlalchemy.sql.bindparam`).

    This mapping is created empty and is never `None`; in a `Query` it is
    expected to be populated if and only if `QuerySummary.whenIsRegionGiven`
    returns `GivenTime.AT_EXECUTION`.
    """