Coverage for python / lsst / daf / butler / registry / queries / _results.py: 77%
77 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-28 08:36 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-28 08:36 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = (
30 "ChainedDatasetQueryResults",
31 "DataCoordinateQueryResults",
32 "DatasetQueryResults",
33 "DimensionRecordQueryResults",
34 "ParentDatasetQueryResults",
35 "QueryResultsBase",
36)
38import itertools
39from abc import abstractmethod
40from collections.abc import Iterable, Iterator, Sequence
41from contextlib import AbstractContextManager
42from typing import Any, Self
44from ..._dataset_ref import DatasetRef
45from ..._dataset_type import DatasetType
46from ...dimensions import (
47 DataCoordinate,
48 DataCoordinateIterable,
49 DimensionElement,
50 DimensionGroup,
51 DimensionRecord,
52)
class LimitedQueryResultsBase:
    """Base class for the query-result interface shared by every
    QueryResults variant in this module.
    """

    @abstractmethod
    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True` (default), execute the full query and apply any
            post-query filtering so the count reflects it.  If `False`, the
            value returned may only be an upper bound.
        discard : `bool`, optional
            If `True`, obtain the exact count even when that requires
            running the complete query and throwing away its rows after
            counting them.  If `False` (default), that situation is an error
            instead, since the caller would usually be better served by
            executing the query to fetch its rows into a new query (or by
            passing ``exact=False``).  Ignored when ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would yield, or an upper bound on
            it when ``exact=False``.

        Notes
        -----
        The count is over rows returned, not unique rows, so even with
        ``exact=True`` this may only bound the number of *deduplicated*
        result rows from above.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default), run at least a ``LIMIT 1`` query when the
            answer cannot be determined without executing anything.
        exact : `bool`, optional
            If `True` (default), execute the full query with post-query
            filtering until at least one result row survives.  If `False`,
            post-query filtering is not accounted for, so the answer may be
            `True` even though every row would ultimately be filtered out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on the arguments)
            yield result rows; `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self, execute: bool = True) -> Iterable[str]:
        """Return human-readable messages that may help explain why the
        query yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default), run simplified versions (e.g. ``LIMIT 1``)
            of pieces of the query tree to determine more precisely where
            rows were filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            Messages describing reasons the query might not yield any
            results.
        """
        raise NotImplementedError()
class QueryResultsBase(LimitedQueryResultsBase):
    """Base class adding ordering and limiting to the interface shared by
    several of the QueryResults classes.
    """

    @abstractmethod
    def order_by(self, *args: str) -> Self:
        """Sort the results this iterator returns.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to sort by.  Prefix a column
            name with a minus sign (``-``) to sort it in descending order.

        Returns
        -------
        result : `typing.Self`
            ``self``, updated in place to return ordered results.

        Notes
        -----
        The iterator is modified in place and the same instance is returned,
        so calls can be chained.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int) -> Self:
        """Cap the number of records the iterator returns.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.

        Returns
        -------
        result : `typing.Self`
            ``self``, updated in place to return a limited set of records.

        Notes
        -----
        The iterator is modified in place and the same instance is returned,
        so calls can be chained.  This method is normally used together with
        `order_by`.
        """
        raise NotImplementedError()
class DataCoordinateQueryResults(QueryResultsBase, DataCoordinateIterable):
    """An enhanced implementation of `DataCoordinateIterable` that represents
    data IDs retrieved from a database query.
    """

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``.  If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow up
        queries.  It may also be less efficient, depending on how well the
        database engine's query optimizer can simplify those particular
        follow-up queries and how efficiently it caches query results even
        when they are not explicitly inserted into a temporary table.  See
        `expanded` and `subset` for examples.
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `hasRecords` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `hasRecords` returns `True`.  May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up
        queries that fetch dimension records.  For example::

            with registry.queryDataIds(...).materialize() as tempDataIds:
                dataIdsWithRecords = tempDataIds.expanded()
                for dataId in dataIdsWithRecords:
                    ...
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self,
        dimensions: DimensionGroup | Iterable[str] | None = None,
        *,
        unique: bool = False,
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        dimensions : `DimensionGroup` or \
                `~collections.abc.Iterable` [ `str` ], optional
            Dimensions to include in the new results object.  If `None`,
            ``self.dimensions`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return
            unique data IDs.  This is implemented in the database; to obtain
            unique results via Python-side processing (which may be more
            efficient in some cases), use `toSet` to construct a
            `DataCoordinateSet` from this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria.  May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``dimensions`` is not a subset of the dimensions in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result
        rows in general because of subtleties in how spatial overlaps are
        implemented; see `Query.projected` for more information.

        When calling `subset` multiple times on the same very large result
        set, it may be much more efficient to call `materialize` first.  For
        example::

            dimensions1 = DimensionGroup(...)
            dimensions2 = DimensionGroup(...)
            with registry.queryDataIds(...).materialize() as tempDataIds:
                for dataId1 in tempDataIds.subset(dimensions1, unique=True):
                    ...
                for dataId2 in tempDataIds.subset(dimensions2, unique=True):
                    ...
        """
        raise NotImplementedError()

    @abstractmethod
    def findDatasets(
        self,
        datasetType: DatasetType | str,
        collections: Any,
        *,
        findFirst: bool = True,
    ) -> ParentDatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `typing.Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of
            that dataset type appears (according to the order of
            ``collections`` passed in).  If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects.  If ``self.hasRecords()``,
            all nested data IDs in those dataset references will have
            records as well.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        raise NotImplementedError()

    @abstractmethod
    def findRelatedDatasets(
        self,
        datasetType: DatasetType | str,
        collections: Any,
        *,
        findFirst: bool = True,
        dimensions: DimensionGroup | Iterable[str] | None = None,
    ) -> Iterable[tuple[DataCoordinate, DatasetRef]]:
        """Find datasets using the data IDs identified by this query, and
        return them along with the original data IDs.

        This is a variant of `findDatasets` that is often more useful when
        the target dataset type does not have all of the dimensions of the
        original data ID query, as is generally the case with calibration
        lookups.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            Dataset type or the name of one to search for.  Must have
            dimensions that are a subset of ``self.graph``.
        collections : `typing.Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof.  ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        findFirst : `bool`, optional
            If `True` (default), for each data ID in ``self``, only yield
            one `DatasetRef`, from the first collection in which a dataset
            of that dataset type appears (according to the order of
            ``collections`` passed in).  If `True`, ``collections`` must not
            contain regular expressions and may not be ``...``.  Note that
            this is not the same as yielding one `DatasetRef` for each
            yielded data ID if ``dimensions`` is not `None`.
        dimensions : `DimensionGroup` or \
                `~collections.abc.Iterable` [ `str` ], optional
            The dimensions of the data IDs returned.  Must be a subset of
            ``self.dimensions``.

        Returns
        -------
        pairs : `~collections.abc.Iterable` [ `tuple` [ `DataCoordinate`, \
                `DatasetRef` ] ]
            An iterable of (data ID, dataset reference) pairs.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        raise NotImplementedError()
class DatasetQueryResults(LimitedQueryResultsBase, Iterable[DatasetRef]):
    """An interface for objects representing the results of queries for
    datasets.
    """

    @abstractmethod
    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `~collections.abc.Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances, each of which
            covers exactly one parent dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> Self:
        """Return a `DatasetQueryResults` whose returned `DatasetRef`
        objects all have data IDs for which `DataCoordinate.hasRecords`
        returns `True`.

        Returns
        -------
        expanded : `DatasetQueryResults`
            A new `DatasetQueryResults` instance, or ``self`` when it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, calling `materialize`
        before expanding data IDs may be more efficient for very large
        result sets.
        """
        raise NotImplementedError()

    def _iter_by_dataset_type(self) -> Iterator[tuple[DatasetType, Iterable[DatasetRef]]]:
        """Group results by dataset type.

        This is a private hook for the interface defined by
        `DatasetRef.iter_by_type`, enabling much more efficient
        processing of heterogeneous `DatasetRef` iterables when they come
        directly from queries.
        """
        yield from ((group.parentDatasetType, group) for group in self.byParentDatasetType())
class ParentDatasetQueryResults(DatasetQueryResults):
    """Results of a query for datasets that is restricted to a single parent
    `DatasetType`.
    """

    @property
    @abstractmethod
    def parentDatasetType(self) -> DatasetType:
        """The parent dataset type shared by every dataset in this iterable
        (`DatasetType`).
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def dataIds(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation query for just the data IDs of the datasets
        this query would return (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        raise NotImplementedError()
class ChainedDatasetQueryResults(DatasetQueryResults):
    """A `DatasetQueryResults` implementation that simply chains together
    other results objects, each for a different parent dataset type.

    Parameters
    ----------
    chain : `~collections.abc.Sequence` [ `ParentDatasetQueryResults` ]
        The underlying results objects this object will chain together.
    doomed_by : `~collections.abc.Iterable` [ `str` ], optional
        A list of messages (appropriate for e.g. logging or exceptions) that
        explain why the query is known to return no results even before it
        is executed.  Queries with a non-empty list will never be executed.
        Child results objects may also have their own list.
    """

    # Bug fix: "_doomed_by" was missing from __slots__ even though __init__
    # assigns it.  That only worked because a slot-less base class supplies a
    # per-instance __dict__, which defeated the purpose of declaring
    # __slots__ here in the first place.
    __slots__ = ("_chain", "_doomed_by")

    def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterable[str] = ()):
        self._chain = chain
        self._doomed_by = tuple(doomed_by)

    def __iter__(self) -> Iterator[DatasetRef]:
        # Flatten the per-dataset-type child results into one stream.
        return itertools.chain.from_iterable(self._chain)

    def __repr__(self) -> str:
        return "<DatasetRef iterator for multiple dataset types>"

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        return iter(self._chain)

    def expanded(self) -> ChainedDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        return ChainedDatasetQueryResults([r.expanded() for r in self._chain], self._doomed_by)

    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        # Docstring inherited.
        return sum(r.count(exact=exact, discard=discard) for r in self._chain)

    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        # Docstring inherited.
        return any(r.any(execute=execute, exact=exact) for r in self._chain)

    def explain_no_results(self, execute: bool = True) -> Iterable[str]:
        # Docstring inherited.  Our own doom messages come first, followed by
        # each child's explanation.
        result = list(self._doomed_by)
        for r in self._chain:
            result.extend(r.explain_no_results(execute=execute))
        return result
class DimensionRecordQueryResults(QueryResultsBase, Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @property
    @abstractmethod
    def element(self) -> DimensionElement:
        """The dimension element whose records this query returns
        (`DimensionElement`).
        """
        raise NotImplementedError()

    @abstractmethod
    def run(self) -> DimensionRecordQueryResults:
        # NOTE(review): behavior is not visible from this abstract
        # declaration; the name and return type suggest it executes the
        # query and returns a results object — confirm against the concrete
        # implementations before relying on this.
        raise NotImplementedError()