Coverage for python/lsst/daf/butler/queries/tree/_query_tree.py: 48%
86 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-30 09:54 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = (
31 "QueryTree",
32 "make_identity_query_tree",
33 "DataCoordinateUploadKey",
34 "MaterializationKey",
35 "DatasetSearch",
36 "SerializedQueryTree",
37)
39import uuid
40from collections.abc import Mapping
41from typing import TypeAlias, final
43import pydantic
45from ..._exceptions import InvalidQueryError
46from ...dimensions import DimensionGroup, DimensionUniverse
47from ...pydantic_utils import DeferredValidation
48from ._base import QueryTreeBase
49from ._column_set import ColumnSet
50from ._predicate import Predicate
52DataCoordinateUploadKey: TypeAlias = uuid.UUID
54MaterializationKey: TypeAlias = uuid.UUID
def make_identity_query_tree(universe: DimensionUniverse) -> QueryTree:
    """Construct the initial query tree: empty dimensions and a single
    logical row.

    `Butler._query` should use this to build its starting tree.  It is a
    useful initial state because it is the identity element for joins:
    joining any other query tree to this one yields that other tree
    unchanged.

    Parameters
    ----------
    universe : `..DimensionUniverse`
        Definitions for all dimensions.

    Returns
    -------
    tree : `QueryTree`
        A tree with empty dimensions.
    """
    empty_dimensions = universe.empty.as_group()
    return QueryTree(dimensions=empty_dimensions)
@final
class DatasetSearch(QueryTreeBase):
    """Description of a dataset search that has been joined into a query
    tree.

    The dataset type name is not stored here; it is the key under which this
    object appears in the mapping held by `QueryTree`.
    """

    collections: tuple[str, ...]
    """The collections to search.

    Order matters if this dataset type is later referenced by a `FindFirst`
    operation.  Collection wildcards are always resolved before being
    included in a dataset search.
    """

    dimensions: DimensionGroup
    """The dimensions of the dataset type.

    This must match the dimensions of the dataset type as already defined in
    the butler database, but this cannot generally be verified when a
    relation tree is validated (since it requires a database query) and
    hence must be checked later.
    """
@final
class QueryTree(QueryTreeBase):
    """A declarative, serializable description of the row constraints and joins
    in a butler query.

    Notes
    -----
    A `QueryTree` is the struct that represents the serializable form of a
    `Query` object, or one piece (with `ResultSpec` the other) of the
    serializable form of a query results object.

    This class's attributes describe the columns that are "available" to be
    returned or used in ``where`` or ``order_by`` expressions, but it does not
    carry information about the columns that are actually included in result
    rows, or what kind of butler primitive (e.g. `DataCoordinate` or
    `DatasetRef`) those rows might be transformed into.
    """

    dimensions: DimensionGroup
    """The dimensions whose keys are joined into the query.
    """

    datasets: Mapping[str, DatasetSearch] = pydantic.Field(default_factory=dict)
    """Dataset searches that have been joined into the query."""

    data_coordinate_uploads: Mapping[DataCoordinateUploadKey, DimensionGroup] = pydantic.Field(
        default_factory=dict
    )
    """Uploaded tables of data ID values that have been joined into the query.
    """

    materializations: Mapping[MaterializationKey, DimensionGroup] = pydantic.Field(default_factory=dict)
    """Tables of result rows from other queries that have been stored
    temporarily on the server.
    """

    predicate: Predicate = Predicate.from_bool(True)
    """Boolean expression trees whose logical AND defines a row filter."""

    def get_joined_dimension_groups(self) -> frozenset[DimensionGroup]:
        """Return a set of the dimension groups of all data coordinate uploads,
        dataset searches, and materializations.
        """
        result: set[DimensionGroup] = set(self.data_coordinate_uploads.values())
        result.update(self.materializations.values())
        for dataset_spec in self.datasets.values():
            result.add(dataset_spec.dimensions)
        return frozenset(result)

    def join_dimensions(self, dimensions: DimensionGroup) -> QueryTree:
        """Return a new tree that includes additional dimensions.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Dimensions to include.

        Returns
        -------
        result : `QueryTree`
            A new tree with the additional dimensions.
        """
        return self.model_copy(update=dict(dimensions=self.dimensions | dimensions))

    def join_data_coordinate_upload(
        self, key: DataCoordinateUploadKey, dimensions: DimensionGroup
    ) -> QueryTree:
        """Return a new tree that joins in an uploaded table of data ID values.

        Parameters
        ----------
        key : `DataCoordinateUploadKey`
            Unique identifier for this upload, as assigned by a `QueryDriver`.
        dimensions : `DimensionGroup`
            Dimensions of the data IDs.

        Returns
        -------
        result : `QueryTree`
            A new tree that joins in the data ID table.
        """
        assert key not in self.data_coordinate_uploads, "Query should prevent doing the same upload twice."
        data_coordinate_uploads = dict(self.data_coordinate_uploads)
        data_coordinate_uploads[key] = dimensions
        return self.model_copy(
            update=dict(
                dimensions=self.dimensions | dimensions, data_coordinate_uploads=data_coordinate_uploads
            )
        )

    def join_materialization(self, key: MaterializationKey, dimensions: DimensionGroup) -> QueryTree:
        """Return a new tree that joins in temporarily stored results from
        another query.

        Parameters
        ----------
        key : `MaterializationKey`
            Unique identifier for this materialization, as assigned by a
            `QueryDriver`.
        dimensions : `DimensionGroup`
            The dimensions stored in the materialization.

        Returns
        -------
        result : `QueryTree`
            A new tree that joins in the materialization.
        """
        # BUGFIX: this previously checked ``self.data_coordinate_uploads``
        # (a copy-paste error from join_data_coordinate_upload), which could
        # never catch a duplicate materialization key.
        assert key not in self.materializations, "Query should prevent duplicate materialization."
        materializations = dict(self.materializations)
        materializations[key] = dimensions
        return self.model_copy(
            update=dict(dimensions=self.dimensions | dimensions, materializations=materializations)
        )

    def join_dataset(self, dataset_type: str, search: DatasetSearch) -> QueryTree:
        """Return a new tree that joins in a search for a dataset.

        Parameters
        ----------
        dataset_type : `str`
            Name of dataset type to join in.
        search : `DatasetSearch`
            Struct containing the collection search path and dataset type
            dimensions.

        Returns
        -------
        result : `QueryTree`
            A new tree that joins in the dataset search.

        Notes
        -----
        If this dataset type was already joined in with the same search, the
        tree is returned unchanged; joining the same dataset type with a
        *different* search is an error (asserted).
        """
        if existing := self.datasets.get(dataset_type):
            assert existing == search, "Dataset search should be new or the same."
            return self
        else:
            datasets = dict(self.datasets)
            datasets[dataset_type] = search
            return self.model_copy(
                update=dict(dimensions=self.dimensions | search.dimensions, datasets=datasets)
            )

    def where(self, *terms: Predicate) -> QueryTree:
        """Return a new tree that adds row filtering via a boolean column
        expression.

        Parameters
        ----------
        *terms : `Predicate`
            Boolean column expressions that filter rows.  Arguments are
            combined with logical AND.

        Returns
        -------
        result : `QueryTree`
            A new tree that with row filtering.

        Raises
        ------
        InvalidQueryError
            Raised if a column expression requires a dataset column that is
            not already present in the query tree.

        Notes
        -----
        If an expression references a dimension or dimension element that is
        not already present in the query tree, it will be joined in, but
        datasets must already be joined into a query tree in order to
        reference their fields in expressions.
        """
        predicate = self.predicate
        columns = ColumnSet(self.dimensions)
        for where_term in terms:
            # Track every column the term needs so missing dimensions can be
            # joined in and missing dataset columns can be rejected below.
            where_term.gather_required_columns(columns)
            predicate = predicate.logical_and(where_term)
        if not (columns.dataset_fields.keys() <= self.datasets.keys()):
            raise InvalidQueryError(
                f"Cannot reference dataset type(s) {columns.dataset_fields.keys() - self.datasets.keys()} "
                "that have not been joined."
            )
        return self.model_copy(update=dict(dimensions=columns.dimensions, predicate=predicate))

    @pydantic.model_validator(mode="after")
    def _validate_join_operands(self) -> QueryTree:
        # Every join operand's dimensions must already be in the tree's
        # dimensions; join methods maintain this invariant by unioning.
        for dimensions in self.get_joined_dimension_groups():
            if not dimensions.issubset(self.dimensions):
                raise InvalidQueryError(
                    f"Dimensions {dimensions} of join operand are not a "
                    f"subset of the query tree's dimensions {self.dimensions}."
                )
        return self

    @pydantic.model_validator(mode="after")
    def _validate_required_columns(self) -> QueryTree:
        # The predicate may only reference dimensions and dataset fields that
        # are actually available in this tree.
        columns = ColumnSet(self.dimensions)
        self.predicate.gather_required_columns(columns)
        if not columns.dimensions.issubset(self.dimensions):
            raise InvalidQueryError("Predicate requires dimensions beyond those in the query tree.")
        if not columns.dataset_fields.keys() <= self.datasets.keys():
            raise InvalidQueryError("Predicate requires dataset columns that are not in the query tree.")
        return self
class SerializedQueryTree(DeferredValidation[QueryTree]):
    """A Pydantic-serializable wrapper for `QueryTree` that defers validation
    to the `validated` method, allowing a `.DimensionUniverse` to be provided.
    """

    def to_query_tree(self, universe: DimensionUniverse) -> QueryTree:
        """Validate the deferred payload and return the resulting `QueryTree`.

        Parameters
        ----------
        universe : `..DimensionUniverse`
            Definitions for all dimensions, passed through to deferred
            validation.

        Returns
        -------
        tree : `QueryTree`
            The fully validated query tree.
        """
        # Validation was deferred at deserialization time; perform it now
        # that a dimension universe is available as validation context.
        return self.validated(universe=universe)