Coverage for python/lsst/daf/butler/registry/queries/_structs.py: 35%
179 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

import dataclasses
from collections.abc import Iterable, Mapping, Set
from typing import Any

import astropy.time
from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm
from lsst.sphgeom import Region
from lsst.utils.classes import cached_getter, immutable

from ...core import (
    DataCoordinate,
    DatasetType,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
    DimensionUniverse,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
    Timespan,
)

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import make_string_expression_predicate
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereClause.combine`, which guarantees the consistency of its
    attributes.
    """

    @classmethod
    def combine(
        cls,
        dimensions: DimensionGraph,
        expression: str = "",
        *,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        timespan: Timespan | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        bind : `Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into the
            query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance. If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap. If `None` and
            ``data_id`` is an expanded data ID, ``data_id.region`` will be used
            to construct one.
        timespan : `Timespan`, optional
            A temporal constraint that all rows must overlap. If `None` and
            ``data_id`` is an expanded data ID, ``data_id.timespan`` will be
            used to construct one.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring to
            a visit without an instrument). Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
        """
        if data_id is not None and data_id.hasRecords():
            if region is None and data_id.region is not None:
                region = data_id.region
            if timespan is None and data_id.timespan is not None:
                timespan = data_id.timespan
        if data_id is None:
            data_id = DataCoordinate.makeEmpty(dimensions.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(dimensions.universe)
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            timespan=timespan,
            governor_constraints=governor_constraints,
        )
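
    # A minimal usage sketch (hypothetical identifiers; real dimension and
    # governor names come from the configured DimensionUniverse):
    #
    #     where = QueryWhereClause.combine(
    #         dimensions,
    #         expression="instrument = 'HSC' AND visit > 100",
    #         data_id=data_id,  # an expanded DataCoordinate, if available
    #     )
    #
    # When ``data_id`` has records, its region and timespan (if any) are
    # promoted to the ``region`` and ``timespan`` attributes of the result.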

    expression_predicate: Predicate | None
    """A predicate that evaluates a string expression from the user
    (`lsst.daf.relation.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    timespan: Timespan | None
    """A temporal constraint that all result rows must overlap
    (`Timespan` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at
    all.
    """


@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""


@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Class for information about columns in an ORDER BY clause."""

    @classmethod
    def parse_general(cls, order_by: Iterable[str], graph: DimensionGraph) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        graph : `DimensionGraph`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)
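
    # For example (hypothetical dimension names, assuming a universe with
    # ``visit`` and ``band`` dimensions):
    #
    #     clause = OrderByClause.parse_general(["visit", "-band"], graph)
    #
    # sorts ascending on the visit key, then descending on the band key.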

    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            DimensionRecord field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)
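
    # For instance, an order-by string such as "-exposure.timespan.begin"
    # (assuming an ``exposure`` dimension exists in the universe) parses to a
    # SortTerm whose expression is the lower bound of the exposure timespan,
    # sorted descending.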

    terms: Iterable[SortTerm]
    """Terms that appear in the ORDER BY
    (`Iterable` [ `lsst.daf.relation.SortTerm` ]).
    """

    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY
        clause (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags


@immutable
class ElementOrderByClause:
    """Class for information about columns in an ORDER BY clause for one
    element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Single dimension element in the query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`Iterable` [ `OrderByClauseColumn` ]).
    """


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result rows
        of the query.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap. If `None` and
        ``data_id`` is an expanded data ID, ``data_id.region`` will be used
        to construct one.
    timespan : `Timespan`, optional
        A temporal constraint that all rows must overlap. If `None` and
        ``data_id`` is an expanded data ID, ``data_id.timespan`` will be used
        to construct one.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ], optional
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without providing
        a value for their governor dimensions (e.g. referring to a visit
        without an instrument). Should be left to default to `True` in
        essentially all new code.
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        timespan: Timespan | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            bind=bind,
            data_id=data_id,
            region=region,
            timespan=timespan,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        self.columns_required, self.dimensions = self._compute_columns_required()
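
    # A minimal construction sketch (hypothetical names; ``universe`` would
    # come from a real `Registry`):
    #
    #     summary = QuerySummary(
    #         requested=universe.extract(["detector", "visit"]),
    #         expression="instrument = 'HSC'",
    #         order_by=["-visit"],
    #         limit=(10, None),
    #     )
    #
    # ``summary.dimensions`` is then the full set of dimensions needed to
    # join the requested dimensions with everything the WHERE and ORDER BY
    # clauses reference.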

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Maximum number of rows to return and optional integer offset,
    respectively, applied prior to postprocessing filters
    (`tuple` [ `int`, `int` or `None` ] or `None`).
    """

    dimensions: DimensionGraph
    """All dimensions in the query in any form (`DimensionGraph`).
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query.

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.dimensions.spatial:
            element = family.choose(self.dimensions.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.data_id.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter based
            # on a given region.
            if self.where.data_id.graph.spatial:
                # We can only perform those filters against SkyPix dimensions,
                # so if what we have isn't one, add the common SkyPix dimension
                # to the query; the element we have will be joined to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query. Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        return result.freeze()
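
    # Illustrative example (assuming the default dimension configuration):
    # with both skymap dimensions {tract, patch} and observation dimensions
    # {visit, detector} in the query, the most precise element of each
    # system (patch and visit_detector_region, respectively) is returned,
    # and the query performs a spatial join between them.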

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        if len(self.dimensions.temporal) > 1:
            # We don't actually have multiple temporal families in our current
            # dimension configuration, so this limitation should be harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        result = NamedValueSet[DimensionElement]()
        if self.where.expression_predicate is not None:
            for tag in DimensionRecordColumnTag.filter_from(self.where.expression_predicate.columns_required):
                if tag.column == "timespan":
                    result.add(self.requested.universe[tag.element])
        return result.freeze()
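
    # For example, an expression string that references "visit.timespan"
    # (assuming a ``visit`` dimension with a timespan in the universe) puts
    # ``visit`` in this set, so its timespan column is joined into the query.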

    def _compute_columns_required(self) -> tuple[set[ColumnTag], DimensionGraph]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.
        """
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        tags.update(DimensionKeyColumnTag.generate(self.where.data_id.graph.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGraph.
        dimensions = DimensionGraph(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions)
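
    # Illustrative sketch (hypothetical names): with requested={detector,
    # visit} and expression="visit.exposure_time > 30", the result includes
    # the detector and visit key columns plus
    # DimensionRecordColumnTag("visit", "exposure_time") from the predicate,
    # and ``dimensions`` is expanded to include implied/required dimensions
    # such as instrument.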