Coverage for python/lsst/daf/butler/_query_results.py: 100%
75 statements
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-01 11:19 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
# Public API of this module: the abstract query-results interfaces, exported
# for ``from ... import *`` and documentation tooling.
__all__ = (
    "DataCoordinateQueryResults",
    "DatasetQueryResults",
    "DimensionRecordQueryResults",
    "SingleTypeDatasetQueryResults",
)
37from abc import abstractmethod
38from collections.abc import Iterable, Iterator
39from contextlib import AbstractContextManager
40from typing import TYPE_CHECKING, Any
42from ._dataset_ref import DatasetRef
43from .dimensions import DataCoordinate, DimensionRecord
45if TYPE_CHECKING:
46 from ._dataset_type import DatasetType
47 from .dimensions import DimensionElement, DimensionGroup
class DataCoordinateQueryResults(Iterable[DataCoordinate]):
    """An interface for objects that represent the results of queries for
    data IDs.
    """

    @property
    @abstractmethod
    def dimensions(self) -> DimensionGroup:
        """The dimensions of the data IDs returned by this query."""
        raise NotImplementedError()

    @abstractmethod
    def has_full(self) -> bool:
        """Indicate if all data IDs in this iterable identify all dimensions,
        not just required dimensions.

        Returns
        -------
        state : `bool`
            If `True`, ``all(d.hasFull() for d in self)`` is guaranteed.
            If `False`, no guarantees are made.
        """
        raise NotImplementedError()

    @abstractmethod
    def has_records(self) -> bool:
        """Return whether all data IDs in this iterable contain records.

        Returns
        -------
        state : `bool`
            If `True`, ``all(d.hasRecords() for d in self)`` is guaranteed.
            If `False`, no guarantees are made.
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow up queries.
        It may also be less efficient, depending on how well the database
        engine's query optimizer can simplify those particular follow-up
        queries and how efficiently it caches query results even when they are
        not explicitly inserted into a temporary table. See `expanded` and
        `subset` for examples.
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `has_records` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `has_records` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records. For example::

            with butler.query() as query:
                with query.data_ids(...).materialize() as tempDataIds:
                    dataIdsWithRecords = tempDataIds.expanded()
                    for dataId in dataIdsWithRecords:
                        ...
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self,
        dimensions: DimensionGroup | Iterable[str] | None = None,
        *,
        unique: bool = False,
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        dimensions : `DimensionGroup` or \
                `~collections.abc.Iterable` [ `str`], optional
            Dimensions to include in the new results object. If `None`,
            ``self.dimensions`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return unique
            data IDs. This is implemented in the database; to obtain unique
            results via Python-side processing (which may be more efficient in
            some cases), use `toSet` to construct a `DataCoordinateSet` from
            this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``dimensions`` is not a subset of the dimensions in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result rows
        in general because of subtleties in how spatial overlaps are
        implemented; see `Query.projected` for more information.

        When calling `subset` multiple times on the same very large result set,
        it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGroup(...)
            dimensions2 = DimensionGroup(...)
            with butler.query(...) as query:
                with query.data_ids(...).materialize() as data_ids:
                    for dataId1 in data_ids.subset(dimensions1, unique=True):
                        ...
                    for dataId2 in data_ids.subset(dimensions2, unique=True):
                        ...
        """
        raise NotImplementedError()

    @abstractmethod
    def find_datasets(
        self, dataset_type: DatasetType | str, collections: Any, *, find_first: bool = True
    ) -> DatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        dataset_type : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.dimensions``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        find_first : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in). If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `DatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.has_records()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        raise NotImplementedError()

    @abstractmethod
    def find_related_datasets(
        self,
        dataset_type: DatasetType | str,
        collections: Any,
        *,
        find_first: bool = True,
        dimensions: DimensionGroup | Iterable[str] | None = None,
    ) -> Iterable[tuple[DataCoordinate, DatasetRef]]:
        """Find datasets using the data IDs identified by this query, and
        return them along with the original data IDs.

        This is a variant of `find_datasets` that is often more useful when
        the target dataset type does not have all of the dimensions of the
        original data ID query, as is generally the case with calibration
        lookups.

        Parameters
        ----------
        dataset_type : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.dimensions``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        find_first : `bool`, optional
            If `True` (default), for each data ID in ``self``, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in). If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``. Note that this is not the
            same as yielding one `DatasetRef` for each yielded data ID if
            ``dimensions`` is not `None`.
        dimensions : `DimensionGroup`, or \
                `~collections.abc.Iterable` [ `str` ], optional
            The dimensions of the data IDs returned. Must be a subset of
            ``self.dimensions``.

        Returns
        -------
        pairs : `~collections.abc.Iterable` [ `tuple` [ `DataCoordinate`, \
                `DatasetRef` ] ]
            An iterable of (data ID, dataset reference) pairs.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self, execute: bool = True) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the tree to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. Column name
            can be prefixed with minus (``-``) to use descending ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            Returns ``self`` instance which is updated to return ordered
            result.

        Notes
        -----
        This method modifies the iterator in place and returns the same
        instance to support method chaining.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: int | None = 0) -> DataCoordinateQueryResults:
        """Make the iterator return limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`, optional
            The number of records to skip before returning at most ``limit``
            records. `None` is interpreted the same as zero for backwards
            compatibility.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            Returns ``self`` instance which is updated to return limited set
            of records.

        Notes
        -----
        This method modifies the iterator in place and returns the same
        instance to support method chaining. Normally this method is used
        together with `order_by` method.
        """
        raise NotImplementedError()
class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def by_dataset_type(self) -> Iterator[SingleTypeDatasetQueryResults]:
        """Group results by dataset type.

        Returns
        -------
        iter : `~collections.abc.Iterator` [ `SingleTypeDatasetQueryResults` ]
            An iterator over `SingleTypeDatasetQueryResults` instances that
            are each responsible for a single dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more efficient
        to call `materialize` before expanding data IDs for very large result
        sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self, execute: bool = True) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the tree to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        raise NotImplementedError()
class SingleTypeDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.
    """

    @abstractmethod
    def materialize(self) -> AbstractContextManager[SingleTypeDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        raise NotImplementedError()

    @property
    @abstractmethod
    def dataset_type(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def data_ids(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        raise NotImplementedError()

    # NOTE: ``@abstractmethod`` was previously missing here, unlike every
    # other stub in this interface.  Without it, a concrete subclass that
    # forgot to override ``expanded`` could still be instantiated and would
    # only fail with ``NotImplementedError`` when the method was called,
    # instead of failing fast at instantiation time.
    @abstractmethod
    def expanded(self) -> SingleTypeDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        raise NotImplementedError()
class DimensionRecordQueryResults(Iterable[DimensionRecord]):
    """An interface for objects that represent the results of queries for
    dimension records.
    """

    @property
    @abstractmethod
    def element(self) -> DimensionElement:
        """Dimension element for this result (`DimensionElement`)."""
        raise NotImplementedError()

    @abstractmethod
    def run(self) -> DimensionRecordQueryResults:
        """Execute the query and return an instance with data held in memory.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            Query results, may return ``self`` if it has all data in memory
            already.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DimensionRecordQueryResults:
        """Make the iterator return ordered result.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. Column name
            can be prefixed with minus (``-``) to use descending ordering.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            Returns ``self`` instance which is updated to return ordered
            result.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: int | None = 0) -> DimensionRecordQueryResults:
        """Make the iterator return limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`, optional
            The number of records to skip before returning at most ``limit``
            records. `None` is interpreted the same as zero for backwards
            compatibility.

        Returns
        -------
        result : `DimensionRecordQueryResults`
            Returns ``self`` instance which is updated to return limited set of
            records.

        Notes
        -----
        This method can modify the iterator in place and return the same
        instance. Normally this method is used together with `order_by` method.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self, execute: bool = True) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the tree to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        raise NotImplementedError()