# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

import dataclasses
from collections.abc import Iterable, Mapping, Set
from typing import Any

import astropy.time
from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm
from lsst.sphgeom import Region
from lsst.utils.classes import cached_getter, immutable

from ...core import (
    DataCoordinate,
    DatasetType,
    DimensionElement,
    DimensionGraph,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
    DimensionUniverse,
    NamedValueAbstractSet,
    NamedValueSet,
    SkyPixDimension,
    Timespan,
)

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import make_string_expression_predicate
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName


@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereClause.combine`, which guarantees the consistency of its
    attributes.
    """

    @classmethod
    def combine(
        cls,
        dimensions: DimensionGraph,
        expression: str = "",
        *,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        timespan: Timespan | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGraph`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        bind : `Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into
            the query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance. If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap. If `None` and
            ``data_id`` is an expanded data ID, ``data_id.region`` will be
            used to construct one.
        timespan : `Timespan`, optional
            A temporal constraint that all rows must overlap. If `None` and
            ``data_id`` is an expanded data ID, ``data_id.timespan`` will be
            used to construct one.
        defaults : `DataCoordinate`, optional
            A data ID containing default values for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring
            to a visit without an instrument). Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
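
        Examples
        --------
        A minimal, illustrative sketch (not a tested doctest; it assumes the
        default dimension configuration, in which ``visit`` requires the
        ``instrument`` governor dimension)::

            universe = DimensionUniverse()
            where = QueryWhereClause.combine(
                DimensionGraph(universe, names={"visit"}),
                expression="instrument = 'HSC' AND visit > 100",
            )
            # The expression's governor constraint is extracted, so
            # where.governor_constraints == {"instrument": {"HSC"}}.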
        """
        if data_id is not None and data_id.hasRecords():
            if region is None and data_id.region is not None:
                region = data_id.region
            if timespan is None and data_id.timespan is not None:
                timespan = data_id.timespan
        if data_id is None:
            data_id = DataCoordinate.makeEmpty(dimensions.universe)
        if defaults is None:
            defaults = DataCoordinate.makeEmpty(dimensions.universe)
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            timespan=timespan,
            governor_constraints=governor_constraints,
        )

    expression_predicate: Predicate | None
    """A predicate that evaluates a string expression from the user
    (`lsst.daf.relation.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    timespan: Timespan | None
    """A temporal constraint that all result rows must overlap
    (`Timespan` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at
    all.
    """


@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """True for ascending order, False for descending (`bool`)."""


@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Class for information about columns in an ORDER BY clause."""

    @classmethod
    def parse_general(cls, order_by: Iterable[str], graph: DimensionGraph) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        graph : `DimensionGraph`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
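
        Examples
        --------
        A minimal, illustrative sketch (not a tested doctest; it assumes a
        ``universe`` with the default dimension configuration)::

            graph = DimensionGraph(universe, names={"visit"})
            # Sort by instrument name, then by descending visit ID.
            clause = OrderByClause.parse_general(["instrument", "-visit"], graph)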
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(graph, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
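
        Examples
        --------
        A minimal, illustrative sketch (not a tested doctest; it assumes a
        ``universe`` with the default dimension configuration, in which
        ``exposure`` records carry a ``day_obs`` field)::

            clause = OrderByClause.parse_element(["-day_obs"], universe["exposure"])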
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            DimensionRecord field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
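
        Examples
        --------
        A sketch of the timespan handling (not a tested doctest; it assumes a
        ``universe`` with the default dimension configuration)::

            # "timespan.begin" is mapped to the lower bound of the element's
            # timespan column, sorted ascending.
            term = OrderByClause._make_term(universe["visit"], "timespan.begin", True)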
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)

    terms: Iterable[SortTerm]
    """Sort terms that appear in the ORDER BY clause
    (`Iterable` [ `SortTerm` ]).
    """

    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY
        clause (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags


@immutable
class ElementOrderByClause:
    """Class for information about columns in an ORDER BY clause for one
    element.

    Parameters
    ----------
    order_by : `Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Single or primary dimension element in the query.
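
    Examples
    --------
    A minimal, illustrative sketch (not a tested doctest; it assumes a
    ``universe`` with the default dimension configuration, in which
    ``detector`` records carry a ``full_name`` field)::

        clause = ElementOrderByClause(["full_name"], universe["detector"])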
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY clause
    (`Iterable` [ `OrderByClauseColumn` ]).
    """


@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGraph`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap. If `None` and
        ``data_id`` is an expanded data ID, ``data_id.region`` will be used
        to construct one.
    timespan : `Timespan`, optional
        A temporal constraint that all rows must overlap. If `None` and
        ``data_id`` is an expanded data ID, ``data_id.timespan`` will be used
        to construct one.
    bind : `Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing default values for governor dimensions.
    datasets : `Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `Iterable` [ `str` ], optional
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Limit on the number of returned rows and optional offset.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without
        providing a value for their governor dimensions (e.g. referring to a
        visit without an instrument). Should be left to default to `True` in
        essentially all new code.
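
    Examples
    --------
    A minimal, illustrative sketch (not a tested doctest; it assumes a
    ``universe`` with the default dimension configuration)::

        summary = QuerySummary(
            DimensionGraph(universe, names={"visit", "detector"}),
            expression="instrument = 'HSC'",
            order_by=["-visit"],
            limit=(10, None),
        )
        # summary.dimensions includes all dimensions the WHERE expression
        # and ORDER BY terms require, not just the requested ones.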
    """

    def __init__(
        self,
        requested: DimensionGraph,
        *,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        timespan: Timespan | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            bind=bind,
            data_id=data_id,
            region=region,
            timespan=timespan,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        self.columns_required, self.dimensions = self._compute_columns_required()

    requested: DimensionGraph
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGraph`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of
    the query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Maximum number of rows returned and optional integer offset,
    respectively (prior to postprocessing filters).
    """

    dimensions: DimensionGraph
    """All dimensions in the query in any form (`DimensionGraph`).
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query.

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe

    @property
    @cached_getter
    def spatial(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose regions and skypix IDs should be included
        in the query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        # An element may participate spatially in the query if:
        # - it's the most precise spatial element for its system in the
        #   requested dimensions (i.e. in `self.requested.spatial`);
        # - it isn't also given at query construction time.
        result: NamedValueSet[DimensionElement] = NamedValueSet()
        for family in self.dimensions.spatial:
            element = family.choose(self.dimensions.elements)
            assert isinstance(element, DimensionElement)
            if element not in self.where.data_id.graph.elements:
                result.add(element)
        if len(result) == 1:
            # There's no spatial join, but there might be a WHERE filter
            # based on a given region.
            if self.where.data_id.graph.spatial:
                # We can only perform those filters against SkyPix
                # dimensions, so if what we have isn't one, add the common
                # SkyPix dimension to the query; the element we have will be
                # joined to that.
                (element,) = result
                if not isinstance(element, SkyPixDimension):
                    result.add(self.universe.commonSkyPix)
            else:
                # There is no spatial join or filter in this query. Even
                # if this element might be associated with spatial
                # information, we don't need it for this query.
                return NamedValueSet().freeze()
        return result.freeze()

    @property
    @cached_getter
    def temporal(self) -> NamedValueAbstractSet[DimensionElement]:
        """Dimension elements whose timespans should be included in the
        query (`NamedValueAbstractSet` of `DimensionElement`).
        """
        if len(self.dimensions.temporal) > 1:
            # We don't actually have multiple temporal families in our
            # current dimension configuration, so this limitation should be
            # harmless.
            raise NotImplementedError("Queries that should involve temporal joins are not yet supported.")
        result = NamedValueSet[DimensionElement]()
        if self.where.expression_predicate is not None:
            for tag in DimensionRecordColumnTag.filter_from(
                self.where.expression_predicate.columns_required
            ):
                if tag.column == "timespan":
                    result.add(self.requested.universe[tag.element])
        return result.freeze()

    def _compute_columns_required(self) -> tuple[set[ColumnTag], DimensionGraph]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.
        """
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        tags.update(DimensionKeyColumnTag.generate(self.where.data_id.graph.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGraph.
        dimensions = DimensionGraph(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions)