# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["QuerySummary"]  # other classes here are local to subpackage

import dataclasses
from collections.abc import Iterable, Mapping, Set
from typing import Any

import astropy.time
from lsst.daf.relation import ColumnExpression, ColumnTag, Predicate, SortTerm
from lsst.sphgeom import IntersectionRegion, Region
from lsst.utils.classes import cached_getter, immutable

from ..._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag
from ..._column_type_info import ColumnTypeInfo
from ..._dataset_type import DatasetType
from ..._named import NamedValueAbstractSet, NamedValueSet
from ...dimensions import DataCoordinate, DimensionElement, DimensionGroup, DimensionUniverse, SkyPixDimension

# We're not trying to add typing to the lex/yacc parser code, so MyPy
# doesn't know about some of these imports.
from .expressions import make_string_expression_predicate
from .expressions.categorize import categorizeElementOrderByName, categorizeOrderByName

@dataclasses.dataclass(frozen=True, eq=False)
class QueryWhereClause:
    """Structure holding various contributions to a query's WHERE clause.

    Instances of this class should only be created by
    `QueryWhereClause.combine`, which guarantees the consistency of its
    attributes.
    """

    @classmethod
    def combine(
        cls,
        dimensions: DimensionGroup,
        expression: str = "",
        *,
        column_types: ColumnTypeInfo,
        bind: Mapping[str, Any] | None = None,
        data_id: DataCoordinate | None = None,
        region: Region | None = None,
        defaults: DataCoordinate | None = None,
        dataset_type_name: str | None = None,
        allow_orphans: bool = False,
    ) -> QueryWhereClause:
        """Construct from various components.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            The dimensions that would be included in the query in the absence
            of the WHERE clause.
        expression : `str`, optional
            A user-provided string expression.
        column_types : `ColumnTypeInfo`
            Information about column types.
        bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
            Mapping containing literal values that should be injected into the
            query expression, keyed by the identifiers they replace.
        data_id : `DataCoordinate`, optional
            A data ID identifying dimensions known in advance. If not
            provided, will be set to an empty data ID.
        region : `lsst.sphgeom.Region`, optional
            A spatial constraint that all rows must overlap.
        defaults : `DataCoordinate`, optional
            A data ID containing defaults for governor dimensions.
        dataset_type_name : `str` or `None`, optional
            The name of the dataset type to assume for unqualified dataset
            columns, or `None` if there are no such identifiers.
        allow_orphans : `bool`, optional
            If `True`, permit expressions to refer to dimensions without
            providing a value for their governor dimensions (e.g. referring to
            a visit without an instrument). Should be left to default to
            `False` in essentially all new code.

        Returns
        -------
        where : `QueryWhereClause`
            An object representing the WHERE clause for a query.
        """
        if data_id is None:
            data_id = DataCoordinate.make_empty(dimensions.universe)
        if defaults is None:
            defaults = DataCoordinate.make_empty(dimensions.universe)
        expression_predicate, governor_constraints = make_string_expression_predicate(
            expression,
            dimensions,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=allow_orphans,
        )
        return QueryWhereClause(
            expression_predicate,
            data_id,
            region=region,
            governor_constraints=governor_constraints,
        )

    expression_predicate: Predicate | None
    """A predicate that evaluates a string expression from the user
    (`lsst.daf.relation.Predicate` or `None`).
    """

    data_id: DataCoordinate
    """A data ID identifying dimensions known before query construction
    (`DataCoordinate`).
    """

    region: Region | None
    """A spatial region that all result rows must overlap
    (`lsst.sphgeom.Region` or `None`).
    """

    governor_constraints: Mapping[str, Set[str]]
    """Restrictions on the values governor dimensions can take in this query,
    imposed by the string expression and/or data ID
    (`~collections.abc.Mapping` [ `str`, `~collections.abc.Set` [ `str` ] ]).

    Governor dimensions not present in this mapping are not constrained at all.
    """

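# A minimal usage sketch for `QueryWhereClause.combine` (hypothetical names:
# `universe` and `column_types` are assumed to come from an existing
# `Registry`; they are not defined in this module):
#
#     dimensions = universe.conform(["instrument", "visit"])
#     where = QueryWhereClause.combine(
#         dimensions,
#         expression="instrument = 'HSC' AND visit > cutoff",
#         column_types=column_types,
#         bind={"cutoff": 100},
#     )
#     # where.governor_constraints should now pin 'instrument' to {'HSC'}.
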

@dataclasses.dataclass(frozen=True)
class OrderByClauseColumn:
    """Information about a single column in an ORDER BY clause."""

    element: DimensionElement
    """Dimension element for data in this column (`DimensionElement`)."""

    column: str | None
    """Name of the column, or `None` for the primary key (`str` or `None`)."""

    ordering: bool
    """`True` for ascending order, `False` for descending (`bool`)."""

@dataclasses.dataclass(frozen=True, eq=False)
class OrderByClause:
    """Information about the columns in an ORDER BY clause."""

    @classmethod
    def parse_general(cls, order_by: Iterable[str], dimensions: DimensionGroup) -> OrderByClause:
        """Parse an iterable of strings in the context of a multi-dimension
        query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        dimensions : `DimensionGroup`
            Dimensions used by a query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            element, column = categorizeOrderByName(dimensions, name)
            term = cls._make_term(element, column, ascending)
            terms.append(term)
        return cls(terms)
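
    # A hypothetical illustration of the "-" prefix handling above, assuming
    # a `dimensions` group that includes 'visit':
    #
    #     clause = OrderByClause.parse_general(["visit", "-exposure_time"], dimensions)
    #
    # would sort ascending on the visit ID and descending on the visit's
    # exposure_time record field (name resolution is delegated to
    # `categorizeOrderByName`).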

    @classmethod
    def parse_element(cls, order_by: Iterable[str], element: DimensionElement) -> OrderByClause:
        """Parse an iterable of strings in the context of a single dimension
        element query.

        Parameters
        ----------
        order_by : `~collections.abc.Iterable` [ `str` ]
            Sequence of names to use for ordering with optional "-" prefix.
        element : `DimensionElement`
            Single or primary dimension element in the query.

        Returns
        -------
        clause : `OrderByClause`
            New order-by clause representing the given string columns.
        """
        terms = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            found_element, column = categorizeElementOrderByName(element, name)
            term = cls._make_term(found_element, column, ascending)
            terms.append(term)
        return cls(terms)

    @classmethod
    def _make_term(cls, element: DimensionElement, column: str | None, ascending: bool) -> SortTerm:
        """Make a single sort term from parsed user expression values.

        Parameters
        ----------
        element : `DimensionElement`
            Dimension element the sort term references.
        column : `str` or `None`
            DimensionRecord field name, or `None` if ``element`` is a
            `Dimension` and the sort term is on its key value.
        ascending : `bool`
            Whether to sort ascending (`True`) or descending (`False`).

        Returns
        -------
        term : `lsst.daf.relation.SortTerm`
            Sort term struct.
        """
        tag: ColumnTag
        expression: ColumnExpression
        if column is None:
            tag = DimensionKeyColumnTag(element.name)
            expression = ColumnExpression.reference(tag)
        elif column in ("timespan.begin", "timespan.end"):
            base_column, _, subfield = column.partition(".")
            tag = DimensionRecordColumnTag(element.name, base_column)
            expression = ColumnExpression.reference(tag).method(
                "lower" if subfield == "begin" else "upper", dtype=astropy.time.Time
            )
        else:
            tag = DimensionRecordColumnTag(element.name, column)
            expression = ColumnExpression.reference(tag)
        return SortTerm(expression, ascending)
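
    # For example, a parsed term with element 'exposure' (hypothetical) and
    # column "timespan.begin", descending, would be built here roughly as:
    #
    #     tag = DimensionRecordColumnTag("exposure", "timespan")
    #     expression = ColumnExpression.reference(tag).method("lower", dtype=astropy.time.Time)
    #     term = SortTerm(expression, ascending=False)
    #
    # i.e. the timespan column is referenced once and its lower bound is
    # extracted via the `lower` method.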

    terms: Iterable[SortTerm]
    """Terms that appear in the ORDER BY
    (`~collections.abc.Iterable` [ `lsst.daf.relation.SortTerm` ]).
    """

    @property
    @cached_getter
    def columns_required(self) -> Set[ColumnTag]:
        """Set of column tags for all columns referenced by the ORDER BY
        clause (`~collections.abc.Set` [ `ColumnTag` ]).
        """
        tags: set[ColumnTag] = set()
        for term in self.terms:
            tags.update(term.expression.columns_required)
        return tags

@immutable
class ElementOrderByClause:
    """Information about the columns in an ORDER BY clause for one element.

    Parameters
    ----------
    order_by : `~collections.abc.Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    element : `DimensionElement`
        Single or primary dimension element in the query.
    """

    def __init__(self, order_by: Iterable[str], element: DimensionElement):
        self.order_by_columns = []
        for name in order_by:
            if not name or name == "-":
                raise ValueError("Empty dimension name in ORDER BY")
            ascending = True
            if name[0] == "-":
                ascending = False
                name = name[1:]
            found_element, column = categorizeElementOrderByName(element, name)
            self.order_by_columns.append(
                OrderByClauseColumn(element=found_element, column=column, ordering=ascending)
            )

    order_by_columns: Iterable[OrderByClauseColumn]
    """Columns that appear in the ORDER BY
    (`~collections.abc.Iterable` [ `OrderByClauseColumn` ]).
    """

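# A hypothetical usage sketch (assuming `universe` comes from an existing
# `Registry` and uses the default dimension configuration):
#
#     element = universe["detector"]
#     clause = ElementOrderByClause(["-purpose", "full_name"], element)
#     # clause.order_by_columns holds one descending and one ascending
#     # OrderByClauseColumn.
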
@immutable
class QuerySummary:
    """A struct that holds and categorizes the dimensions involved in a query.

    A `QuerySummary` instance is necessary to construct a `QueryBuilder`, and
    it needs to include all of the dimensions that will be included in the
    query (including any needed for querying datasets).

    Parameters
    ----------
    requested : `DimensionGroup`
        The dimensions whose primary keys should be included in the result
        rows of the query.
    column_types : `ColumnTypeInfo`
        Information about column types.
    data_id : `DataCoordinate`, optional
        A fully-expanded data ID identifying dimensions known in advance. If
        not provided, will be set to an empty data ID.
    expression : `str`, optional
        A user-provided string WHERE expression.
    region : `lsst.sphgeom.Region`, optional
        A spatial constraint that all rows must overlap.
    bind : `~collections.abc.Mapping` [ `str`, `object` ], optional
        Mapping containing literal values that should be injected into the
        query expression, keyed by the identifiers they replace.
    defaults : `DataCoordinate`, optional
        A data ID containing defaults for governor dimensions.
    datasets : `~collections.abc.Iterable` [ `DatasetType` ], optional
        Dataset types whose searches may be joined into the query. Callers
        must still call `QueryBuilder.joinDataset` explicitly to control how
        that join happens (e.g. which collections are searched), but by
        declaring them here first we can ensure that the query includes the
        right dimensions for those joins.
    order_by : `~collections.abc.Iterable` [ `str` ]
        Sequence of names to use for ordering with optional "-" prefix.
    limit : `tuple` [ `int`, `int` or `None` ], optional
        Maximum number of returned rows and an optional offset, respectively.
    check : `bool`, optional
        If `False`, permit expressions to refer to dimensions without
        providing a value for their governor dimensions (e.g. referring to a
        visit without an instrument). Should be left to default to `True` in
        essentially all new code.
    """

    def __init__(
        self,
        requested: DimensionGroup,
        *,
        column_types: ColumnTypeInfo,
        data_id: DataCoordinate | None = None,
        expression: str = "",
        region: Region | None = None,
        bind: Mapping[str, Any] | None = None,
        defaults: DataCoordinate | None = None,
        datasets: Iterable[DatasetType] = (),
        order_by: Iterable[str] | None = None,
        limit: tuple[int, int | None] | None = None,
        check: bool = True,
    ):
        self.requested = requested
        self.datasets = NamedValueSet(datasets).freeze()
        if len(self.datasets) == 1:
            (dataset_type_name,) = self.datasets.names
        else:
            dataset_type_name = None
        self.where = QueryWhereClause.combine(
            self.requested,
            expression=expression,
            column_types=column_types,
            bind=bind,
            data_id=data_id,
            region=region,
            defaults=defaults,
            dataset_type_name=dataset_type_name,
            allow_orphans=not check,
        )
        self.order_by = None if order_by is None else OrderByClause.parse_general(order_by, requested)
        self.limit = limit
        self.columns_required, self.dimensions, self.region = self._compute_columns_required()

    requested: DimensionGroup
    """Dimensions whose primary keys should be included in the result rows of
    the query (`DimensionGroup`).
    """

    where: QueryWhereClause
    """Structure containing objects that contribute to the WHERE clause of the
    query (`QueryWhereClause`).
    """

    datasets: NamedValueAbstractSet[DatasetType]
    """Dataset types whose searches may be joined into the query
    (`NamedValueAbstractSet` [ `DatasetType` ]).
    """

    order_by: OrderByClause | None
    """Object that manages how the query results should be sorted
    (`OrderByClause` or `None`).
    """

    limit: tuple[int, int | None] | None
    """Maximum number of rows returned (prior to postprocessing filters) and
    integer offset, respectively (`tuple` [ `int`, `int` or `None` ] or
    `None`).
    """

    dimensions: DimensionGroup
    """All dimensions in the query in any form (`DimensionGroup`).
    """

    region: Region | None
    """Region that bounds all query results (`lsst.sphgeom.Region` or
    `None`).

    While `QueryWhereClause.region` and the ``region`` constructor argument
    represent an external region given directly by the caller, this represents
    the region actually used directly as a constraint on the query results,
    which can also come from the data ID passed by the caller.
    """

    columns_required: Set[ColumnTag]
    """All columns that must be included directly in the query.

    This does not include columns that only need to be included in the result
    rows, and hence could be provided by postprocessors.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """All known dimensions (`DimensionUniverse`)."""
        return self.requested.universe
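
    # A hypothetical construction sketch (assuming `universe`, `column_types`,
    # and a `raw` DatasetType obtained from an existing `Registry`):
    #
    #     summary = QuerySummary(
    #         universe.conform(["visit", "detector"]),
    #         column_types=column_types,
    #         expression="instrument = 'HSC'",
    #         datasets=[raw],
    #         order_by=["-visit"],
    #     )
    #
    # summary.dimensions then includes everything needed by the WHERE clause,
    # the ORDER BY terms, and the 'raw' dataset join, and summary.region holds
    # any spatial constraint derived from the data ID or `region` argument.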

    def _compute_columns_required(
        self,
    ) -> tuple[set[ColumnTag], DimensionGroup, Region | None]:
        """Compute the columns that must be provided by the relations joined
        into this query in order to obtain the right *set* of result rows in
        the right order.

        This does not include columns that only need to be included in the
        result rows, and hence could be provided by postprocessors.
        """
        tags: set[ColumnTag] = set(DimensionKeyColumnTag.generate(self.requested.names))
        for dataset_type in self.datasets:
            tags.update(DimensionKeyColumnTag.generate(dataset_type.dimensions.names))
        if self.where.expression_predicate is not None:
            tags.update(self.where.expression_predicate.columns_required)
        if self.order_by is not None:
            tags.update(self.order_by.columns_required)
        region = self.where.region
        for dimension_name in self.where.data_id.dimensions.names:
            dimension_tag = DimensionKeyColumnTag(dimension_name)
            if dimension_tag in tags:
                continue
            if skypix_dimension := self.universe.skypix_dimensions.get(dimension_name):
                if skypix_dimension == self.universe.commonSkyPix:
                    # The common skypix dimension should be available from
                    # spatial-join tables.
                    tags.add(dimension_tag)
                else:
                    # This is a SkyPixDimension other than the common one. If
                    # it's not already present in the query (e.g. from a
                    # dataset join), this is a pure spatial constraint, which
                    # we can only apply by modifying the 'region' for the
                    # query. That will also require that we join in the common
                    # skypix dimension.
                    pixel = skypix_dimension.pixelization.pixel(self.where.data_id[dimension_name])
                    if region is None:
                        region = pixel
                    else:
                        region = IntersectionRegion(region, pixel)
            else:
                # If a dimension in the data ID is available from dimension
                # tables or dimension spatial-join tables in the database,
                # include it in the set of dimensions whose tables should
                # be joined. This makes these data ID constraints work
                # just like simple 'where' constraints, which is good.
                tags.add(dimension_tag)
        # Make sure the dimension keys are expanded self-consistently in what
        # we return by passing them through DimensionGroup.
        dimensions = DimensionGroup(
            self.universe, names={tag.dimension for tag in DimensionKeyColumnTag.filter_from(tags)}
        )
        # If we have a region constraint, ensure region columns and the common
        # skypix dimension are included.
        missing_common_skypix = False
        if region is not None:
            for family in dimensions.spatial:
                element = family.choose(dimensions.elements.names, self.universe)
                tags.add(DimensionRecordColumnTag(element.name, "region"))
                if (
                    not isinstance(element, SkyPixDimension)
                    and self.universe.commonSkyPix.name not in dimensions
                ):
                    missing_common_skypix = True
        if missing_common_skypix:
            dimensions = dimensions.union(self.universe.commonSkyPix.minimal_group)
            tags.update(DimensionKeyColumnTag.generate(dimensions.names))
        return (tags, dimensions, region)
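
# A hypothetical illustration of the skypix handling in
# `_compute_columns_required` above: a data ID such as
#
#     {"htm11": 123456}
#
# (where 'htm11' is a skypix dimension other than the common one, and is not
# otherwise joined into the query) is turned into a region constraint by
# intersecting the pixel's region into `region`, rather than joining a table,
# and the common skypix dimension is then pulled in so the constraint can be
# applied through spatial-join tables.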