Coverage for python/lsst/daf/butler/queries/driver.py: 87%
81 statements
coverage.py v7.5.0, created at 2024-04-27 03:00 -0700
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "QueryDriver",
    "PageKey",
    "ResultPage",
    "DataCoordinateResultPage",
    "DimensionRecordResultPage",
    "DatasetRefResultPage",
    "GeneralResultPage",
)

import dataclasses
import uuid
from abc import abstractmethod
from collections.abc import Iterable
from contextlib import AbstractContextManager
from typing import Any, TypeAlias, Union, overload

from .._dataset_ref import DatasetRef
from .._dataset_type import DatasetType
from ..dimensions import (
    DataCoordinate,
    DataIdValue,
    DimensionGroup,
    DimensionRecord,
    DimensionRecordSet,
    DimensionRecordTable,
    DimensionUniverse,
)
from .result_specs import (
    DataCoordinateResultSpec,
    DatasetRefResultSpec,
    DimensionRecordResultSpec,
    GeneralResultSpec,
    ResultSpec,
)
from .tree import DataCoordinateUploadKey, MaterializationKey, QueryTree

PageKey: TypeAlias = uuid.UUID


# The Page types below could become Pydantic models instead of dataclasses if
# that makes them more directly usable by RemoteButler (at least once we have
# Pydantic-friendly containers for all of them). We may want to add a
# discriminator annotation to the ResultPage union if we do that.


@dataclasses.dataclass
class DataCoordinateResultPage:
    """A single page of results from a data coordinate query."""

    spec: DataCoordinateResultSpec
    next_key: PageKey | None

    # TODO: On DM-41114 this will become a custom container that normalizes out
    # attached DimensionRecords and is Pydantic-friendly.
    rows: list[DataCoordinate]


@dataclasses.dataclass
class DimensionRecordResultPage:
    """A single page of results from a dimension record query."""

    spec: DimensionRecordResultSpec
    next_key: PageKey | None
    rows: Iterable[DimensionRecord]

    def as_table(self) -> DimensionRecordTable:
        if isinstance(self.rows, DimensionRecordTable):
            return self.rows
        else:
            return DimensionRecordTable(self.spec.element, self.rows)

    def as_set(self) -> DimensionRecordSet:
        if isinstance(self.rows, DimensionRecordSet):
            return self.rows
        else:
            return DimensionRecordSet(self.spec.element, self.rows)


@dataclasses.dataclass
class DatasetRefResultPage:
    """A single page of results from a dataset query."""

    spec: DatasetRefResultSpec
    next_key: PageKey | None

    # TODO: On DM-41115 this will become a custom container that normalizes out
    # attached DimensionRecords and is Pydantic-friendly.
    rows: list[DatasetRef]


@dataclasses.dataclass
class GeneralResultPage:
    """A single page of results from a general query."""

    spec: GeneralResultSpec
    next_key: PageKey | None

    # Raw tabular data, with columns in the same order as spec.columns.
    rows: list[tuple[Any, ...]]


ResultPage: TypeAlias = Union[
    DataCoordinateResultPage, DimensionRecordResultPage, DatasetRefResultPage, GeneralResultPage
]
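

# The helper below is an illustrative sketch, not part of the module's API: it
# shows that all of the page types above share the ``spec`` / ``next_key`` /
# ``rows`` shape, so generic code only needs to special-case the lazily
# iterable rows of `DimensionRecordResultPage`.
def _example_page_size(page: ResultPage) -> int:
    """Return the number of rows in a single page (example only)."""
    if isinstance(page, DimensionRecordResultPage):
        # rows is only guaranteed to be an iterable of records here; force it
        # into a list (or use ``as_set``/``as_table``) before taking its size.
        return len(list(page.rows))
    return len(page.rows)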


class QueryDriver(AbstractContextManager[None]):
    """Base class for the implementation object inside `Query2` objects
    that is specialized for DirectButler vs. RemoteButler.

    Notes
    -----
    Implementations should be context managers. This allows them to manage the
    lifetime of server-side state, such as:

    - a SQL transaction, when necessary (DirectButler);
    - SQL cursors for queries that were not fully iterated over (DirectButler);
    - temporary database tables (DirectButler);
    - result-page Parquet files that were never fetched (RemoteButler);
    - uploaded Parquet files used to fill temporary database tables
      (RemoteButler);
    - cached content needed to construct query trees, like collection summaries
      (potentially all Butlers).

    When possible, these sorts of things should be cleaned up earlier when they
    are no longer needed, and the Butler server will still have to guard
    against the context manager's ``__exit__`` signal never reaching it, but a
    context manager will take care of these much more often than relying on
    garbage collection and ``__del__`` would.
    """

    @property
    @abstractmethod
    def universe(self) -> DimensionUniverse:
        """Object that defines all dimensions."""
        raise NotImplementedError()

    @overload
    def execute(self, result_spec: DataCoordinateResultSpec, tree: QueryTree) -> DataCoordinateResultPage: ...

    @overload
    def execute(
        self, result_spec: DimensionRecordResultSpec, tree: QueryTree
    ) -> DimensionRecordResultPage: ...

    @overload
    def execute(self, result_spec: DatasetRefResultSpec, tree: QueryTree) -> DatasetRefResultPage: ...

    @overload
    def execute(self, result_spec: GeneralResultSpec, tree: QueryTree) -> GeneralResultPage: ...

    @abstractmethod
    def execute(self, result_spec: ResultSpec, tree: QueryTree) -> ResultPage:
        """Execute a query and return the first result page.

        Parameters
        ----------
        result_spec : `ResultSpec`
            The kind of results the user wants from the query. This can affect
            the actual query (i.e. SQL and Python postprocessing) that is run,
            e.g. by changing what is in the SQL SELECT clause and even what
            tables are joined in, but it never changes the number or order of
            result rows.
        tree : `QueryTree`
            Query tree to evaluate.

        Returns
        -------
        first_page : `ResultPage`
            A page whose type corresponds to the type of ``result_spec``, with
            at least the initial rows from the query. This should have an
            empty ``rows`` attribute if the query returned no results, and a
            ``next_key`` attribute that is not `None` if there were more
            results than could be returned in a single page.
        """
        raise NotImplementedError()

    @overload
    def fetch_next_page(
        self, result_spec: DataCoordinateResultSpec, key: PageKey
    ) -> DataCoordinateResultPage: ...

    @overload
    def fetch_next_page(
        self, result_spec: DimensionRecordResultSpec, key: PageKey
    ) -> DimensionRecordResultPage: ...

    @overload
    def fetch_next_page(self, result_spec: DatasetRefResultSpec, key: PageKey) -> DatasetRefResultPage: ...

    @overload
    def fetch_next_page(self, result_spec: GeneralResultSpec, key: PageKey) -> GeneralResultPage: ...

    @abstractmethod
    def fetch_next_page(self, result_spec: ResultSpec, key: PageKey) -> ResultPage:
        """Fetch the next page of results from an already-executed query.

        Parameters
        ----------
        result_spec : `ResultSpec`
            The kind of results the user wants from the query. This must be
            identical to the ``result_spec`` passed to `execute`, but
            implementations are not *required* to check this.
        key : `PageKey`
            Key included in the previous page from this query. This key may
            become unusable or even be reused after this call.

        Returns
        -------
        next_page : `ResultPage`
            The next page of query results.
        """
        # We can put off dealing with pagination initially by just making an
        # implementation of this method raise.
        #
        # In RemoteButler I expect this to work by having the call to execute
        # continue to write Parquet files (or whatever) to some location until
        # its cursor is exhausted, and then delete those files as they are
        # fetched (or, failing that, when receiving a signal from
        # ``__exit__``).
        #
        # In DirectButler I expect to have a dict[PageKey, Cursor], fetch a
        # block of rows from it, and just reuse the page key for the next page
        # until the cursor is exhausted.
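        #
        # A rough sketch of that dict-of-cursors bookkeeping (an assumption
        # about a possible implementation, not code from DirectButler):
        #
        #     cursor = self._cursors[key]
        #     rows = cursor.fetchmany(page_size)
        #     if rows:
        #         next_key = key  # reuse the key while the cursor has rows
        #     else:
        #         del self._cursors[key]
        #         next_key = None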
        raise NotImplementedError()

    @abstractmethod
    def materialize(
        self,
        tree: QueryTree,
        dimensions: DimensionGroup,
        datasets: frozenset[str],
    ) -> MaterializationKey:
        """Execute a query tree, saving results to temporary storage for use
        in later queries.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        dimensions : `DimensionGroup`
            Dimensions whose key columns should be preserved.
        datasets : `frozenset` [ `str` ]
            Names of dataset types whose ID columns may be materialized. It
            is implementation-defined whether they actually are.

        Returns
        -------
        key : `MaterializationKey`
            Unique identifier for the result rows that allows them to be
            referenced in a `QueryTree`.
        """
        raise NotImplementedError()

    @abstractmethod
    def upload_data_coordinates(
        self, dimensions: DimensionGroup, rows: Iterable[tuple[DataIdValue, ...]]
    ) -> DataCoordinateUploadKey:
        """Upload a table of data coordinates for use in later queries.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Dimensions of the data coordinates.
        rows : `~collections.abc.Iterable` [ `tuple` ]
            Tuples of data coordinate values, covering just the "required"
            subset of ``dimensions``.

        Returns
        -------
        key : `DataCoordinateUploadKey`
            Unique identifier for the upload that allows it to be referenced
            in a `QueryTree`.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(
        self,
        tree: QueryTree,
        result_spec: ResultSpec,
        *,
        exact: bool,
        discard: bool,
    ) -> int:
        """Return the number of rows a query would return.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        result_spec : `ResultSpec`
            The kind of results the user wants to count.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, tree: QueryTree, *, execute: bool, exact: bool) -> bool:
        """Test whether the query would return any rows.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self, tree: QueryTree, execute: bool) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        tree : `QueryTree`
            Query tree to evaluate.
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the tree to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_default_collections(self) -> tuple[str, ...]:
        """Return the default collection search path.

        Returns
        -------
        collections : `tuple` [ `str`, ... ]
            The default collection search path as a tuple of `str`.

        Raises
        ------
        NoDefaultCollectionError
            Raised if there are no default collections.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_dataset_type(self, name: str) -> DatasetType:
        """Return the dataset type with the given name.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        dataset_type : `DatasetType`
            The registered dataset type, including its dimensions.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the dataset type is not registered.
        """
        raise NotImplementedError()
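

# The function below is an illustrative sketch, not part of the module's API.
# It shows the intended pagination contract of `QueryDriver.execute` and
# `QueryDriver.fetch_next_page`: keep fetching pages until ``next_key`` is
# `None`, passing the same result spec each time. It assumes the caller hands
# in a concrete driver that has not yet been entered as a context manager.
def _example_fetch_all_records(
    driver: QueryDriver, spec: DimensionRecordResultSpec, tree: QueryTree
) -> list[DimensionRecord]:
    """Collect every dimension record from a query by draining its pages."""
    records: list[DimensionRecord] = []
    with driver:
        page = driver.execute(spec, tree)
        records.extend(page.rows)
        while page.next_key is not None:
            page = driver.fetch_next_page(spec, page.next_key)
            records.extend(page.rows)
    return records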