Coverage for python/lsst/daf/butler/direct_query_driver/_query_analysis.py: 67% of 80 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "QueryCollectionAnalysis",
    "QueryFindFirstAnalysis",
    "QueryJoinsAnalysis",
    "QueryTreeAnalysis",
    "ResolvedDatasetSearch",
)

import dataclasses
from collections.abc import Iterator, Mapping
from typing import TYPE_CHECKING, Generic, TypeVar

from ..dimensions import DimensionElement, DimensionGroup
from ..queries import tree as qt
from ..registry import CollectionSummary
from ..registry.interfaces import CollectionRecord

if TYPE_CHECKING:
    from ._postprocessing import Postprocessing
    from ._sql_builders import SqlSelectBuilder

_T = TypeVar("_T")


@dataclasses.dataclass
class ResolvedDatasetSearch(Generic[_T]):
    """A struct describing a dataset search joined into a query, after
    resolving its collection search path.
    """

    name: _T
    """Name or names of the dataset type(s)."""

    dimensions: DimensionGroup
    """Dimensions of the dataset type."""

    collection_records: list[CollectionRecord] = dataclasses.field(default_factory=list)
    """Records of the collections to search for this dataset, in order, after
    removing collections inconsistent with the dataset type or the query's
    data ID constraint.
    """

    messages: list[str] = dataclasses.field(default_factory=list)
    """Diagnostic messages about collections that were filtered out of the
    collection records.
    """

    is_calibration_search: bool = False
    """`True` if any of the collections to be searched is a
    `~CollectionType.CALIBRATION` collection, `False` otherwise.

    Since only calibration datasets can be present in
    `~CollectionType.CALIBRATION` collections, this also indicates that the
    dataset type is a calibration.
    """
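

# Hedged illustration, added here for exposition (not part of the original
# module): the type parameter _T is a single dataset type name for ordinary
# searches and a list of names for searches over a dataset type union,
# matching the annotations on `QueryJoinsAnalysis.datasets` and
# `QueryTreeAnalysis.union_datasets` below. The alias names are hypothetical.
if TYPE_CHECKING:
    _SingleTypeSearch = ResolvedDatasetSearch[str]
    _UnionTypeSearch = ResolvedDatasetSearch[list[str]]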


@dataclasses.dataclass
class QueryJoinsAnalysis:
    """A struct describing the "joins" section of a butler query.

    See `DirectQueryDriver.build_query` for an overview of how queries are
    transformed into SQL, and the role this object plays in that.
    """

    predicate: qt.Predicate
    """Boolean expression to apply to rows."""

    columns: qt.ColumnSet
    """All columns whose tables need to be joined into the query.

    This is updated after construction to include all columns required by
    `predicate`.
    """

    materializations: dict[qt.MaterializationKey, DimensionGroup] = dataclasses.field(default_factory=dict)
    """Materializations to join into the query."""

    datasets: dict[str, ResolvedDatasetSearch[str]] = dataclasses.field(default_factory=dict)
    """Dataset searches to join into the query."""

    data_coordinate_uploads: dict[qt.DataCoordinateUploadKey, DimensionGroup] = dataclasses.field(
        default_factory=dict
    )
    """Data coordinate uploads to join into the query."""

    messages: list[str] = dataclasses.field(default_factory=list)
    """Diagnostic messages that report reasons the query may not return any
    rows.
    """

    def __post_init__(self) -> None:
        self.predicate.gather_required_columns(self.columns)

    def iter_mandatory(self, union_dataset_dimensions: DimensionGroup | None) -> Iterator[DimensionElement]:
        """Return an iterator over the dimension elements that must be joined
        into the query.

        These elements either provide "field" (non-key) columns or define
        relationships that result rows must be consistent with. They do not
        necessarily include all dimension keys in `columns`, since each of
        those can typically be included in a query in multiple different ways.

        Parameters
        ----------
        union_dataset_dimensions : `DimensionGroup` or `None`
            Dimensions of the union dataset types, or `None` if this is not
            a union dataset query.

        Returns
        -------
        elements : `~collections.abc.Iterator` [ `DimensionElement` ]
            Iterator over the mandatory dimension elements.
        """
        for element_name in self.columns.dimensions.elements:
            element = self.columns.dimensions.universe[element_name]
            if self.columns.dimension_fields[element_name]:
                # We need to get dimension record fields for this element, and
                # its table is the only place to get those.
                yield element
            elif element.defines_relationships:
                # We also need to join in DimensionElement tables that define
                # one-to-many and many-to-many relationships, but data
                # coordinate uploads, materializations, and datasets can also
                # provide these relationships. Data coordinate uploads and
                # dataset tables only have required dimensions, and can hence
                # only provide relationships involving those.
                if any(
                    element.minimal_group.names <= upload_dimensions.required
                    for upload_dimensions in self.data_coordinate_uploads.values()
                ):
                    continue
                if any(
                    element.minimal_group.names <= dataset_spec.dimensions.required
                    for dataset_spec in self.datasets.values()
                ):
                    continue
                if (
                    union_dataset_dimensions is not None
                    and element.minimal_group.names <= union_dataset_dimensions.required
                ):
                    continue
                # Materializations have all key columns for their dimensions
                # and have already enforced all relationships among them.
                if any(
                    element.name in materialization_dimensions.elements
                    for materialization_dimensions in self.materializations.values()
                ):
                    continue
                yield element
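

# Hedged sketch, added for exposition (not part of the original module): the
# kind of loop a query driver might run over `iter_mandatory`, here just
# collecting element names. A real driver would join each element's table
# into its SQL builder instead; the helper name is hypothetical.
def _demo_mandatory_elements(
    joins: QueryJoinsAnalysis, union_dataset_dimensions: DimensionGroup | None = None
) -> list[str]:
    names: list[str] = []
    for element in joins.iter_mandatory(union_dataset_dimensions):
        names.append(element.name)  # Real code would join the element's table here.
    return names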


@dataclasses.dataclass
class QueryFindFirstAnalysis(Generic[_T]):
    """A struct describing the "find-first" stage of a butler query.

    See `DirectQueryDriver.build_query` for an overview of how queries are
    transformed into SQL, and the role this object plays in that.
    """

    search: ResolvedDatasetSearch[_T]
    """Information about the dataset type or types being searched for."""

    @property
    def dataset_type(self) -> _T:
        """Name(s) of the dataset type(s)."""
        return self.search.name

    def __bool__(self) -> bool:
        # Find-first deduplication is only needed when more than one
        # collection will be searched; with a single collection there is
        # nothing to deduplicate.
        return len(self.search.collection_records) > 1
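

# Hedged demonstration, added for exposition (not part of the original
# module): a QueryFindFirstAnalysis is truthy only when more than one
# collection will be searched. The dataset type name "raw" and the `cast`
# placeholders are purely illustrative, to keep the sketch self-contained.
def _demo_find_first_truthiness() -> None:
    from typing import cast

    search = ResolvedDatasetSearch[str](name="raw", dimensions=cast("DimensionGroup", None))
    assert not QueryFindFirstAnalysis(search)  # No collections: nothing to deduplicate.
    search.collection_records.extend(cast("list[CollectionRecord]", [object(), object()]))
    assert QueryFindFirstAnalysis(search)  # Two collections: find-first is needed.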


@dataclasses.dataclass
class QueryCollectionAnalysis:
    """A struct containing information about all of the collections that
    appear in a butler query.
    """

    collection_records: Mapping[str, CollectionRecord]
    """All collection records, keyed by collection name.

    This includes CHAINED collections.
    """

    calibration_dataset_types: set[str | qt.AnyDatasetType] = dataclasses.field(default_factory=set)
    """A set of the names of all calibration dataset types.

    If ``ANY_DATASET`` appears in the set, the dataset type union includes at
    least one calibration dataset type.
    """

    summaries_by_dataset_type: dict[
        str | qt.AnyDatasetType, list[tuple[CollectionRecord, CollectionSummary]]
    ] = dataclasses.field(default_factory=dict)
    """Collection records and summaries, in search order, keyed by dataset
    type name.

    CHAINED collections are flattened out in the nested lists. Lists have been
    filtered to be consistent with the dataset types in the summaries, but not
    necessarily the governor dimensions in the summaries.
    """
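

# Hedged sketch, added for exposition (not part of the original module):
# reading the per-dataset-type collection search order out of a
# QueryCollectionAnalysis. The helper name is hypothetical.
def _demo_search_order(analysis: QueryCollectionAnalysis, dataset_type_name: str) -> list[str]:
    # Records are already flattened and filtered, so their names can be
    # reported in search order directly.
    return [
        record.name
        for record, _summary in analysis.summaries_by_dataset_type.get(dataset_type_name, [])
    ]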


@dataclasses.dataclass
class QueryTreeAnalysis:
    """A struct aggregating all analysis results derived from the query tree.

    See `DirectQueryDriver.build_query` for an overview of how queries are
    transformed into SQL, and the role this object plays in that.
    """

    joins: QueryJoinsAnalysis
    """Analysis of the "joins" stage, including all joins and columns needed
    by ``tree``. Additional columns will be added to this plan later.
    """

    union_datasets: list[ResolvedDatasetSearch[list[str]]]
    """Resolved dataset searches that expand `QueryTree.any_dataset` out
    into groups of dataset types with the same collection search path.
    """

    initial_select_builder: SqlSelectBuilder
    """In-progress SQL query builder, initialized with just spatial and
    temporal overlaps.
    """

    postprocessing: Postprocessing
    """Struct representing post-query processing to be done in Python."""
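

# Hedged sketch, added for exposition (not part of the original module): how
# the find-first stage might be set up for each union dataset search in a
# QueryTreeAnalysis. Only truthy analyses (more than one collection) need
# find-first deduplication; everything else here is illustrative.
def _demo_union_find_first(
    analysis: QueryTreeAnalysis,
) -> list[QueryFindFirstAnalysis[list[str]]]:
    return [
        find_first
        for search in analysis.union_datasets
        if (find_first := QueryFindFirstAnalysis(search))
    ]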