Coverage for python/lsst/daf/butler/queries/driver.py: 88%
85 statements
coverage.py v7.4.3, created at 2024-03-05 11:36 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = (
31 "QueryDriver",
32 "PageKey",
33 "ResultPage",
34 "DataCoordinateResultPage",
35 "DimensionRecordResultPage",
36 "DatasetRefResultPage",
37 "GeneralResultPage",
38)
40import dataclasses
41import uuid
42from abc import abstractmethod
43from collections.abc import Iterable, Sequence
44from contextlib import AbstractContextManager
45from typing import Any, TypeAlias, Union, overload
47from .._dataset_ref import DatasetRef
48from .._dataset_type import DatasetType
49from ..dimensions import (
50 DataCoordinate,
51 DataIdValue,
52 DimensionGroup,
53 DimensionRecord,
54 DimensionRecordSet,
55 DimensionRecordTable,
56 DimensionUniverse,
57)
58from ..registry import CollectionSummary
59from ..registry.interfaces import CollectionRecord
60from .result_specs import (
61 DataCoordinateResultSpec,
62 DatasetRefResultSpec,
63 DimensionRecordResultSpec,
64 GeneralResultSpec,
65 ResultSpec,
66)
67from .tree import DataCoordinateUploadKey, MaterializationKey, QueryTree
69PageKey: TypeAlias = uuid.UUID
72# The Page types below could become Pydantic models instead of dataclasses if
73# that makes them more directly usable by RemoteButler (at least once we have
74# Pydantic-friendly containers for all of them). We may want to add a
75# discriminator annotation to the ResultPage union if we do that.
78@dataclasses.dataclass
79class DataCoordinateResultPage:
80 """A single page of results from a data coordinate query."""
82 spec: DataCoordinateResultSpec
83 next_key: PageKey | None
85 # TODO: On DM-41114 this will become a custom container that normalizes out
86 # attached DimensionRecords and is Pydantic-friendly.
87 rows: list[DataCoordinate]
90@dataclasses.dataclass
91class DimensionRecordResultPage:
92 """A single page of results from a dimension record query."""
94 spec: DimensionRecordResultSpec
95 next_key: PageKey | None
96 rows: Iterable[DimensionRecord]
98 def as_table(self) -> DimensionRecordTable:
99 if isinstance(self.rows, DimensionRecordTable):
100 return self.rows
101 else:
102 return DimensionRecordTable(self.spec.element, self.rows)
104 def as_set(self) -> DimensionRecordSet:
105 if isinstance(self.rows, DimensionRecordSet):
106 return self.rows
107 else:
108 return DimensionRecordSet(self.spec.element, self.rows)
111@dataclasses.dataclass
112class DatasetRefResultPage:
113 """A single page of results from a dataset query."""
115 spec: DatasetRefResultSpec
116 next_key: PageKey | None
118 # TODO: On DM-41115 this will become a custom container that normalizes out
119 # attached DimensionRecords and is Pydantic-friendly.
120 rows: list[DatasetRef]
123@dataclasses.dataclass
124class GeneralResultPage:
125 """A single page of results from a general query."""
127 spec: GeneralResultSpec
128 next_key: PageKey | None
130 # Raw tabular data, with columns in the same order as spec.columns.
131 rows: list[tuple[Any, ...]]
134ResultPage: TypeAlias = Union[
135 DataCoordinateResultPage, DimensionRecordResultPage, DatasetRefResultPage, GeneralResultPage
136]
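# A page is just a dataclass pairing result rows with the spec that produced
# them and the key for the next page (if any).  A minimal sketch, assuming
# hypothetical ``spec`` and ``records`` values, of how a
# DimensionRecordResultPage's rows can be viewed in either container form:
#
#     page = DimensionRecordResultPage(spec=spec, next_key=None, rows=records)
#     table = page.as_table()  # DimensionRecordTable built from (or reusing) rows
#     unique = page.as_set()   # DimensionRecordSet built from (or reusing) rows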
139class QueryDriver(AbstractContextManager[None]):
140 """Base class for the implementation object inside `Query2` objects
141 that is specialized for DirectButler vs. RemoteButler.
143 Notes
144 -----
145 Implementations should be context managers. This allows them to manage the
146 lifetime of server-side state, such as:
148 - a SQL transaction, when necessary (DirectButler);
149 - SQL cursors for queries that were not fully iterated over (DirectButler);
150 - temporary database tables (DirectButler);
151 - result-page Parquet files that were never fetched (RemoteButler);
152 - uploaded Parquet files used to fill temporary database tables
153 (RemoteButler);
154 - cached content needed to construct query trees, like collection summaries
155 (potentially all Butlers).
157 When possible, these sorts of things should be cleaned up as soon as they
158 are no longer needed, and the Butler server will still have to guard
159 against the context manager's ``__exit__`` signal never reaching it; even
160 so, a context manager will take care of this cleanup far more reliably
161 than relying on garbage collection and ``__del__`` would.
162 """
164 @property
165 @abstractmethod
166 def universe(self) -> DimensionUniverse:
167 """Object that defines all dimensions."""
168 raise NotImplementedError()
170 @overload
171 def execute(self, result_spec: DataCoordinateResultSpec, tree: QueryTree) -> DataCoordinateResultPage: ...
173 @overload
174 def execute(
175 self, result_spec: DimensionRecordResultSpec, tree: QueryTree
176 ) -> DimensionRecordResultPage: ...
178 @overload
179 def execute(self, result_spec: DatasetRefResultSpec, tree: QueryTree) -> DatasetRefResultPage: ...
181 @overload
182 def execute(self, result_spec: GeneralResultSpec, tree: QueryTree) -> GeneralResultPage: ...
184 @abstractmethod
185 def execute(self, result_spec: ResultSpec, tree: QueryTree) -> ResultPage:
186 """Execute a query and return the first result page.
188 Parameters
189 ----------
190 result_spec : `ResultSpec`
191 The kind of results the user wants from the query. This can affect
192 the actual query (i.e. SQL and Python postprocessing) that is run,
193 e.g. by changing what is in the SQL SELECT clause and even what
194 tables are joined in, but it never changes the number or order of
195 result rows.
196 tree : `QueryTree`
197 Query tree to evaluate.
199 Returns
200 -------
201 first_page : `ResultPage`
202 A page whose type corresponds to the type of ``result_spec``, with
203 at least the initial rows from the query. This should have an
204 empty ``rows`` attribute if the query returned no results, and a
205 ``next_key`` attribute that is not `None` if there were more
206 results than could be returned in a single page.
207 """
208 raise NotImplementedError()
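    # Illustrative call, assuming ``driver``, ``result_spec``, and ``tree``
    # come from the surrounding Query code; the overloads above mean the page
    # type tracks the spec type (e.g. a DimensionRecordResultSpec yields a
    # DimensionRecordResultPage):
    #
    #     first_page = driver.execute(result_spec, tree)
    #     rows = first_page.rows  # empty if the query matched nothing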
210 @overload
211 def fetch_next_page(
212 self, result_spec: DataCoordinateResultSpec, key: PageKey
213 ) -> DataCoordinateResultPage: ...
215 @overload
216 def fetch_next_page(
217 self, result_spec: DimensionRecordResultSpec, key: PageKey
218 ) -> DimensionRecordResultPage: ...
220 @overload
221 def fetch_next_page(self, result_spec: DatasetRefResultSpec, key: PageKey) -> DatasetRefResultPage: ...
223 @overload
224 def fetch_next_page(self, result_spec: GeneralResultSpec, key: PageKey) -> GeneralResultPage: ...
226 @abstractmethod
227 def fetch_next_page(self, result_spec: ResultSpec, key: PageKey) -> ResultPage:
228 """Fetch the next page of results from an already-executed query.
230 Parameters
231 ----------
232 result_spec : `ResultSpec`
233 The kind of results the user wants from the query. This must be
234 identical to the ``result_spec`` passed to `execute`, but
235 implementations are not *required* to check this.
236 key : `PageKey`
237 Key included in the previous page from this query. This key may
238 become unusable or even be reused after this call.
240 Returns
241 -------
242 next_page : `ResultPage`
243 The next page of query results.
244 """
245 # We can put off dealing with pagination initially by just making an
246 # implementation of this method raise.
247 #
248 # In RemoteButler I expect this to work by having the call to execute
249 # continue to write Parquet files (or whatever) to some location until
250 # its cursor is exhausted, and then delete those files as they are
251 # fetched (or, failing that, when receiving a signal from
252 # ``__exit__``).
253 #
254 # In DirectButler I expect to have a dict[PageKey, Cursor], fetch a
255 # block of rows from it, and just reuse the page key for the next page
256 # until the cursor is exhausted.
257 raise NotImplementedError()
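    # Pagination sketch (illustrative only; ``driver``, ``result_spec``, and
    # ``tree`` are assumed): callers keep requesting pages until ``next_key``
    # comes back as `None`.
    #
    #     with driver:
    #         page = driver.execute(result_spec, tree)
    #         while page.next_key is not None:
    #             # ... consume page.rows here ...
    #             page = driver.fetch_next_page(result_spec, page.next_key)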
259 @abstractmethod
260 def materialize(
261 self,
262 tree: QueryTree,
263 dimensions: DimensionGroup,
264 datasets: frozenset[str],
265 ) -> MaterializationKey:
266 """Execute a query tree, saving results to temporary storage for use
267 in later queries.
269 Parameters
270 ----------
271 tree : `QueryTree`
272 Query tree to evaluate.
273 dimensions : `DimensionGroup`
274 Dimensions whose key columns should be preserved.
275 datasets : `frozenset` [ `str` ]
276 Names of dataset types whose ID columns may be materialized. It
277 is implementation-defined whether they actually are.
279 Returns
280 -------
281 key : `MaterializationKey`
282 Unique identifier for the result rows that allows them to be
283 referenced in a `QueryTree`.
284 """
285 raise NotImplementedError()
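    # Illustrative call (hypothetical names): materialize the current tree so
    # that later query trees can reference the stored rows via the key.
    #
    #     key = driver.materialize(tree, dimensions, frozenset({"calexp"}))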
287 @abstractmethod
288 def upload_data_coordinates(
289 self, dimensions: DimensionGroup, rows: Iterable[tuple[DataIdValue, ...]]
290 ) -> DataCoordinateUploadKey:
291 """Upload a table of data coordinates for use in later queries.
293 Parameters
294 ----------
295 dimensions : `DimensionGroup`
296 Dimensions of the data coordinates.
297 rows : `Iterable` [ `tuple` ]
298 Tuples of data coordinate values, covering just the "required"
299 subset of ``dimensions``.
301 Returns
302 -------
303 key : `DataCoordinateUploadKey`
304 Unique identifier for the upload that allows it to be referenced in
305 a `QueryTree`.
306 """
307 raise NotImplementedError()
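    # Illustrative call (hypothetical values): for dimensions whose required
    # keys are ("instrument", "visit"), each row is a tuple in that order.
    #
    #     key = driver.upload_data_coordinates(
    #         dimensions, [("HSC", 903342), ("HSC", 903344)]
    #     )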
309 @abstractmethod
310 def count(
311 self,
312 tree: QueryTree,
313 result_spec: ResultSpec,
314 *,
315 exact: bool,
316 discard: bool,
317 ) -> int:
318 """Return the number of rows a query would return.
320 Parameters
321 ----------
322 tree : `QueryTree`
323 Query tree to evaluate.
324 result_spec : `ResultSpec`
325 The kind of results the user wants to count.
326 exact : `bool`, optional
327 If `True`, run the full query and perform post-query filtering if
328 needed to account for that filtering in the count. If `False`, the
329 result may be an upper bound.
330 discard : `bool`, optional
331 If `True`, compute the exact count even if it would require running
332 the full query and then throwing away the result rows after
333 counting them. If `False`, raise an error when obtaining an exact
334 count would require running the full query, as the user would usually
335 be better off executing the query first to fetch its rows into a new
336 query (or passing ``exact=False``). Ignored if ``exact=False``.
337 """
338 raise NotImplementedError()
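    # Illustrative calls (``driver``, ``tree``, and ``result_spec`` assumed):
    #
    #     n_fast = driver.count(tree, result_spec, exact=False, discard=False)
    #     n_exact = driver.count(tree, result_spec, exact=True, discard=True)
    #
    # The first may return an upper bound; the second may run the full query
    # and throw away its rows just to count them.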
340 @abstractmethod
341 def any(self, tree: QueryTree, *, execute: bool, exact: bool) -> bool:
342 """Test whether the query would return any rows.
344 Parameters
345 ----------
346 tree : `QueryTree`
347 Query tree to evaluate.
348 execute : `bool`, optional
349 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
350 determined prior to execution that the query would return no rows.
351 exact : `bool`, optional
352 If `True`, run the full query and perform post-query filtering if
353 needed, until at least one result row is found. If `False`, the
354 returned result does not account for post-query filtering, and
355 hence may be `True` even when all result rows would be filtered
356 out.
358 Returns
359 -------
360 any : `bool`
361 `True` if the query would (or might, depending on arguments) yield
362 result rows. `False` if it definitely would not.
363 """
364 raise NotImplementedError()
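    # Illustrative call (``driver`` and ``tree`` assumed): a cheap existence
    # check that tolerates false positives from post-query filtering.
    #
    #     might_have_rows = driver.any(tree, execute=True, exact=False)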
366 @abstractmethod
367 def explain_no_results(self, tree: QueryTree, execute: bool) -> Iterable[str]:
368 """Return human-readable messages that may help explain why the query
369 yields no results.
371 Parameters
372 ----------
373 tree : `QueryTree`
374 Query tree to evaluate.
375 execute : `bool`, optional
376 If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
377 of aspects of the tree to more precisely determine where rows were
378 filtered out.
380 Returns
381 -------
382 messages : `~collections.abc.Iterable` [ `str` ]
383 String messages that describe reasons the query might not yield any
384 results.
385 """
386 raise NotImplementedError()
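    # Illustrative use (``driver`` and ``tree`` assumed): surface the
    # diagnostics when a query unexpectedly comes back empty.
    #
    #     for message in driver.explain_no_results(tree, execute=True):
    #         print(message)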
388 @abstractmethod
389 def get_default_collections(self) -> tuple[str, ...]:
390 """Return the default collection search path.
392 Returns
393 -------
394 collections : `tuple` [ `str`, ... ]
395 The default collection search path as a tuple of `str`.
397 Raises
398 ------
399 NoDefaultCollectionError
400 Raised if there are no default collections.
401 """
402 raise NotImplementedError()
404 @abstractmethod
405 def resolve_collection_path(
406 self, collections: Sequence[str]
407 ) -> list[tuple[CollectionRecord, CollectionSummary]]:
408 """Process a collection search path argument into a `list` of
409 collection records and summaries.
411 Parameters
412 ----------
413 collections : `~collections.abc.Sequence` [ `str` ]
414 The collection or collections to search.
416 Returns
417 -------
418 collection_info : `list` [ `tuple` [ `CollectionRecord`, \
419 `CollectionSummary` ] ]
420 A `list` of pairs of `CollectionRecord` and `CollectionSummary`
421 that flattens out all `~CollectionType.CHAINED` collections into
422 their children while maintaining the same order and avoiding
423 duplicates.
425 Raises
426 ------
427 MissingCollectionError
428 Raised if any collection in ``collections`` does not exist.
430 Notes
431 -----
432 Implementations are generally expected to cache the collection records
433 and summaries they obtain (including the records for
434 `~CollectionType.CHAINED` collections that are not returned) in order
435 to optimize multiple calls with collections in common.
436 """
437 raise NotImplementedError()
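    # Illustrative call (hypothetical collection name): a CHAINED collection
    # is flattened into its children, in search order and without duplicates.
    #
    #     for record, summary in driver.resolve_collection_path(["HSC/defaults"]):
    #         ...  # e.g. skip collections whose summary rules out a dataset type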
439 @abstractmethod
440 def get_dataset_type(self, name: str) -> DatasetType:
441 """Return the dimensions for a dataset type.
443 Parameters
444 ----------
445 name : `str`
446 Name of the dataset type.
448 Returns
449 -------
450 dataset_type : `DatasetType`
451 The dataset type registered under ``name``.
453 Raises
454 ------
455 MissingDatasetTypeError
456 Raised if the dataset type is not registered.
457 """
458 raise NotImplementedError()