Coverage for python/lsst/daf/butler/_query_results.py: 100%
77 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-08 10:55 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-08 10:55 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = (
31 "DataCoordinateQueryResults",
32 "DatasetQueryResults",
33 "DimensionRecordQueryResults",
34 "ParentDatasetQueryResults",
35)
37from abc import abstractmethod
38from collections.abc import Iterable, Iterator, Sequence
39from contextlib import AbstractContextManager
40from typing import TYPE_CHECKING, Any
42from ._dataset_ref import DatasetRef
43from .dimensions import DataCoordinate, DimensionRecord
45if TYPE_CHECKING:
46 from ._dataset_type import DatasetType
47 from .dimensions import DimensionElement, DimensionGroup
class DataCoordinateQueryResults(Iterable[DataCoordinate]):
    """An interface for objects that represent the results of queries for
    data IDs.
    """

    @property
    @abstractmethod
    def dimensions(self) -> DimensionGroup:
        """The dimensions of the data IDs returned by this query."""
        raise NotImplementedError()

    @abstractmethod
    def has_full(self) -> bool:
        """Indicate if all data IDs in this iterable identify all dimensions,
        not just required dimensions.

        Returns
        -------
        state : `bool`
            If `True`, ``all(d.hasFull() for d in self)`` is guaranteed.
            If `False`, no guarantees are made.
        """
        raise NotImplementedError()

    @abstractmethod
    def has_records(self) -> bool:
        """Return whether all data IDs in this iterable contain records.

        Returns
        -------
        state : `bool`
            If `True`, ``all(d.hasRecords() for d in self)`` is guaranteed.
            If `False`, no guarantees are made.
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DataCoordinateQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DataCoordinateQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).

        Notes
        -----
        When using a very large result set to perform multiple queries (e.g.
        multiple calls to `subset` with different arguments, or even a single
        call to `expanded`), it may be much more efficient to start by
        materializing the query and only then performing the follow up queries.
        It may also be less efficient, depending on how well the database
        engine's query optimizer can simplify those particular follow-up
        queries and how efficiently it caches query results even when they are
        not explicitly inserted into a temporary table. See `expanded` and
        `subset` for examples.
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DataCoordinateQueryResults:
        """Return a results object for which `has_records` returns `True`.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object for which `has_records` returns `True`. May be
            ``self`` if that is already the case.

        Notes
        -----
        For very large result sets, it may be much more efficient to call
        `materialize` before calling `expanded`, to avoid performing the
        original query multiple times (as a subquery) in the follow-up queries
        that fetch dimension records. For example::

            with butler.query() as query:
                with query.data_ids(...).materialize() as tempDataIds:
                    dataIdsWithRecords = tempDataIds.expanded()
                    for dataId in dataIdsWithRecords:
                        ...
        """
        raise NotImplementedError()

    @abstractmethod
    def subset(
        self,
        dimensions: DimensionGroup | Iterable[str] | None = None,
        *,
        unique: bool = False,
    ) -> DataCoordinateQueryResults:
        """Return a results object containing a subset of the dimensions of
        this one, and/or a unique near-subset of its rows.

        This method may involve actually executing database queries to fetch
        `DimensionRecord` objects.

        Parameters
        ----------
        dimensions : `DimensionGroup` or \
                `~collections.abc.Iterable` [ `str`], optional
            Dimensions to include in the new results object. If `None`,
            ``self.dimensions`` is used.
        unique : `bool`, optional
            If `True` (`False` is default), the query should only return unique
            data IDs. This is implemented in the database; to obtain unique
            results via Python-side processing (which may be more efficient in
            some cases), use `toSet` to construct a `DataCoordinateSet` from
            this results object instead.

        Returns
        -------
        results : `DataCoordinateQueryResults`
            A results object corresponding to the given criteria. May be
            ``self`` if it already qualifies.

        Raises
        ------
        ValueError
            Raised when ``dimensions`` is not a subset of the dimensions in
            this result.

        Notes
        -----
        This method can only return a "near-subset" of the original result rows
        in general because of subtleties in how spatial overlaps are
        implemented; see `Query.projected` for more information.

        When calling `subset` multiple times on the same very large result set,
        it may be much more efficient to call `materialize` first. For
        example::

            dimensions1 = DimensionGroup(...)
            dimensions2 = DimensionGroup(...)
            with butler.query(...) as query:
                with query.data_ids(...).materialize() as data_ids:
                    for dataId1 in data_ids.subset(dimensions1, unique=True):
                        ...
                    for dataId2 in data_ids.subset(dimensions2, unique=True):
                        ...
        """
        raise NotImplementedError()

    @abstractmethod
    def find_datasets(
        self, dataset_type: DatasetType | str, collections: Any, *, find_first: bool = True
    ) -> DatasetQueryResults:
        """Find datasets using the data IDs identified by this query.

        Parameters
        ----------
        dataset_type : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.dimensions``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        find_first : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in). If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``.

        Returns
        -------
        datasets : `ParentDatasetQueryResults`
            A lazy-evaluation object representing dataset query results,
            iterable over `DatasetRef` objects. If ``self.has_records()``, all
            nested data IDs in those dataset references will have records as
            well.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        raise NotImplementedError()

    @abstractmethod
    def find_related_datasets(
        self,
        dataset_type: DatasetType | str,
        collections: Any,
        *,
        find_first: bool = True,
        dimensions: DimensionGroup | Iterable[str] | None = None,
    ) -> Iterable[tuple[DataCoordinate, DatasetRef]]:
        """Find datasets using the data IDs identified by this query, and
        return them along with the original data IDs.

        This is a variant of `find_datasets` that is often more useful when
        the target dataset type does not have all of the dimensions of the
        original data ID query, as is generally the case with calibration
        lookups.

        Parameters
        ----------
        dataset_type : `DatasetType` or `str`
            Dataset type or the name of one to search for. Must have
            dimensions that are a subset of ``self.dimensions``.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. ``...`` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        find_first : `bool`, optional
            If `True` (default), for each data ID in ``self``, only yield one
            `DatasetRef`, from the first collection in which a dataset of that
            dataset type appears (according to the order of ``collections``
            passed in). If `True`, ``collections`` must not contain regular
            expressions and may not be ``...``. Note that this is not the
            same as yielding one `DatasetRef` for each yielded data ID if
            ``dimensions`` is not `None`.
        dimensions : `DimensionGroup`, or \
                `~collections.abc.Iterable` [ `str` ], optional
            The dimensions of the data IDs returned. Must be a subset of
            ``self.dimensions``.

        Returns
        -------
        pairs : `~collections.abc.Iterable` [ `tuple` [ `DataCoordinate`, \
                `DatasetRef` ] ]
            An iterable of (data ID, dataset reference) pairs.

        Raises
        ------
        MissingDatasetTypeError
            Raised if the given dataset type is not registered.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self, execute: bool = True) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the tree to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        raise NotImplementedError()

    @abstractmethod
    def order_by(self, *args: str) -> DataCoordinateQueryResults:
        """Make the iterator return ordered results.

        Parameters
        ----------
        *args : `str`
            Names of the columns/dimensions to use for ordering. Column name
            can be prefixed with minus (``-``) to use descending ordering.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            Returns ``self`` instance which is updated to return ordered
            result.

        Notes
        -----
        This method modifies the iterator in place and returns the same
        instance to support method chaining.
        """
        raise NotImplementedError()

    @abstractmethod
    def limit(self, limit: int, offset: int | None = 0) -> DataCoordinateQueryResults:
        """Make the iterator return limited number of records.

        Parameters
        ----------
        limit : `int`
            Upper limit on the number of returned records.
        offset : `int` or `None`, optional
            The number of records to skip before returning at most ``limit``
            records. `None` is interpreted the same as zero for backwards
            compatibility.

        Returns
        -------
        result : `DataCoordinateQueryResults`
            Returns ``self`` instance which is updated to return limited set
            of records.

        Notes
        -----
        This method modifies the iterator in place and returns the same
        instance to support method chaining. Normally this method is used
        together with `order_by` method.
        """
        raise NotImplementedError()
class DatasetQueryResults(Iterable[DatasetRef]):
    """An interface for objects that represent the results of queries for
    datasets.
    """

    @abstractmethod
    def by_parent_dataset_type(self) -> Iterator[ParentDatasetQueryResults]:
        """Group results by parent dataset type.

        Returns
        -------
        iter : `~collections.abc.Iterator` [ `ParentDatasetQueryResults` ]
            An iterator over `DatasetQueryResults` instances that are each
            responsible for a single parent dataset type (either just that
            dataset type, one or more of its component dataset types, or both).
        """
        raise NotImplementedError()

    @abstractmethod
    def materialize(self) -> AbstractContextManager[DatasetQueryResults]:
        """Insert this query's results into a temporary table.

        Returns
        -------
        context : `typing.ContextManager` [ `DatasetQueryResults` ]
            A context manager that ensures the temporary table is created and
            populated in ``__enter__`` (returning a results object backed by
            that table), and dropped in ``__exit__``. If ``self`` is already
            materialized, the context manager may do nothing (reflecting the
            fact that an outer context manager should already take care of
            everything else).
        """
        raise NotImplementedError()

    @abstractmethod
    def expanded(self) -> DatasetQueryResults:
        """Return a `DatasetQueryResults` for which `DataCoordinate.hasRecords`
        returns `True` for all data IDs in returned `DatasetRef` objects.

        Returns
        -------
        expanded : `DatasetQueryResults`
            Either a new `DatasetQueryResults` instance or ``self``, if it is
            already expanded.

        Notes
        -----
        As with `DataCoordinateQueryResults.expanded`, it may be more efficient
        to call `materialize` before expanding data IDs for very large result
        sets.
        """
        raise NotImplementedError()

    @abstractmethod
    def count(self, *, exact: bool = True, discard: bool = False) -> int:
        """Count the number of rows this query would return.

        Parameters
        ----------
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed to account for that filtering in the count. If `False`, the
            result may be an upper bound.
        discard : `bool`, optional
            If `True`, compute the exact count even if it would require running
            the full query and then throwing away the result rows after
            counting them. If `False`, this is an error, as the user would
            usually be better off executing the query first to fetch its rows
            into a new query (or passing ``exact=False``). Ignored if
            ``exact=False``.

        Returns
        -------
        count : `int`
            The number of rows the query would return, or an upper bound if
            ``exact=False``.

        Notes
        -----
        This counts the number of rows returned, not the number of unique rows
        returned, so even with ``exact=True`` it may provide only an upper
        bound on the number of *deduplicated* result rows.
        """
        raise NotImplementedError()

    @abstractmethod
    def any(self, *, execute: bool = True, exact: bool = True) -> bool:
        """Test whether this query returns any results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True`, execute at least a ``LIMIT 1`` query if it cannot be
            determined prior to execution that the query would return no rows.
        exact : `bool`, optional
            If `True`, run the full query and perform post-query filtering if
            needed, until at least one result row is found. If `False`, the
            returned result does not account for post-query filtering, and
            hence may be `True` even when all result rows would be filtered
            out.

        Returns
        -------
        any : `bool`
            `True` if the query would (or might, depending on arguments) yield
            result rows. `False` if it definitely would not.
        """
        raise NotImplementedError()

    @abstractmethod
    def explain_no_results(self, execute: bool = True) -> Iterable[str]:
        """Return human-readable messages that may help explain why the query
        yields no results.

        Parameters
        ----------
        execute : `bool`, optional
            If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
            of aspects of the tree to more precisely determine where rows were
            filtered out.

        Returns
        -------
        messages : `~collections.abc.Iterable` [ `str` ]
            String messages that describe reasons the query might not yield any
            results.
        """
        raise NotImplementedError()
class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
    single parent `DatasetType`.
    """

    @abstractmethod
    def materialize(self) -> AbstractContextManager[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        raise NotImplementedError()

    @property
    @abstractmethod
    def parent_dataset_type(self) -> DatasetType:
        """The parent dataset type for all datasets in this iterable
        (`DatasetType`).
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def data_ids(self) -> DataCoordinateQueryResults:
        """A lazy-evaluation object representing a query for just the data
        IDs of the datasets that would be returned by this query
        (`DataCoordinateQueryResults`).

        The returned object is not in general `zip`-iterable with ``self``;
        it may be in a different order or have (or not have) duplicates.
        """
        raise NotImplementedError()

    @abstractmethod
    def with_components(self, components: Sequence[str | None]) -> ParentDatasetQueryResults:
        """Return a new query results object for the same parent datasets but
        different components.

        Parameters
        ----------
        components : `~collections.abc.Sequence` [ `str` or `None` ]
            Names of components to include in iteration. `None` may be
            included (at most once) to include the parent dataset type.
        """
        raise NotImplementedError()

    # Marked abstract so subclasses must provide a real implementation: the
    # base-class override here only raises NotImplementedError, and without
    # the decorator a subclass that forgot to implement `expanded` would
    # instantiate cleanly and then fail at call time.  This also matches the
    # `materialize` override above.
    @abstractmethod
    def expanded(self) -> ParentDatasetQueryResults:
        # Docstring inherited from DatasetQueryResults.
        raise NotImplementedError()
595class DimensionRecordQueryResults(Iterable[DimensionRecord]):
596 """An interface for objects that represent the results of queries for
597 dimension records.
598 """
600 @property
601 @abstractmethod
602 def element(self) -> DimensionElement:
603 """Dimension element for this result (`DimensionElement`)."""
604 raise NotImplementedError()
606 @abstractmethod
607 def run(self) -> DimensionRecordQueryResults:
608 """Execute the query and return an instance with data held in memory.
610 Returns
611 -------
612 result : `DimensionRecordQueryResults`
613 Query results, may return ``self`` if it has all data in memory
614 already.
615 """
616 raise NotImplementedError()
618 @abstractmethod
619 def count(self, *, exact: bool = True, discard: bool = False) -> int:
620 """Count the number of rows this query would return.
622 Parameters
623 ----------
624 exact : `bool`, optional
625 If `True`, run the full query and perform post-query filtering if
626 needed to account for that filtering in the count. If `False`, the
627 result may be an upper bound.
628 discard : `bool`, optional
629 If `True`, compute the exact count even if it would require running
630 the full query and then throwing away the result rows after
631 counting them. If `False`, this is an error, as the user would
632 usually be better off executing the query first to fetch its rows
633 into a new query (or passing ``exact=False``). Ignored if
634 ``exact=False``.
636 Returns
637 -------
638 count : `int`
639 The number of rows the query would return, or an upper bound if
640 ``exact=False``.
642 Notes
643 -----
644 This counts the number of rows returned, not the number of unique rows
645 returned, so even with ``exact=True`` it may provide only an upper
646 bound on the number of *deduplicated* result rows.
647 """
648 raise NotImplementedError()
650 @abstractmethod
651 def any(self, *, execute: bool = True, exact: bool = True) -> bool:
652 """Test whether this query returns any results.
654 Parameters
655 ----------
656 execute : `bool`, optional
657 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
658 determined prior to execution that the query would return no rows.
659 exact : `bool`, optional
660 If `True`, run the full query and perform post-query filtering if
661 needed, until at least one result row is found. If `False`, the
662 returned result does not account for post-query filtering, and
663 hence may be `True` even when all result rows would be filtered
664 out.
666 Returns
667 -------
668 any : `bool`
669 `True` if the query would (or might, depending on arguments) yield
670 result rows. `False` if it definitely would not.
671 """
672 raise NotImplementedError()
674 @abstractmethod
675 def order_by(self, *args: str) -> DimensionRecordQueryResults:
676 """Make the iterator return ordered result.
678 Parameters
679 ----------
680 *args : `str`
681 Names of the columns/dimensions to use for ordering. Column name
682 can be prefixed with minus (``-``) to use descending ordering.
684 Returns
685 -------
686 result : `DimensionRecordQueryResults`
687 Returns ``self`` instance which is updated to return ordered
688 result.
690 Notes
691 -----
692 This method can modify the iterator in place and return the same
693 instance.
694 """
695 raise NotImplementedError()
697 @abstractmethod
698 def limit(self, limit: int, offset: int | None = 0) -> DimensionRecordQueryResults:
699 """Make the iterator return limited number of records.
701 Parameters
702 ----------
703 limit : `int`
704 Upper limit on the number of returned records.
705 offset : `int` or `None`
706 The number of records to skip before returning at most ``limit``
707 records. `None` is interpreted the same as zero for backwards
708 compatibility.
710 Returns
711 -------
712 result : `DimensionRecordQueryResults`
713 Returns ``self`` instance which is updated to return limited set of
714 records.
716 Notes
717 -----
718 This method can modify the iterator in place and return the same
719 instance. Normally this method is used together with `order_by` method.
720 """
721 raise NotImplementedError()
723 @abstractmethod
724 def explain_no_results(self, execute: bool = True) -> Iterable[str]:
725 """Return human-readable messages that may help explain why the query
726 yields no results.
728 Parameters
729 ----------
730 execute : `bool`, optional
731 If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
732 of aspects of the tree to more precisely determine where rows were
733 filtered out.
735 Returns
736 -------
737 messages : `~collections.abc.Iterable` [ `str` ]
738 String messages that describe reasons the query might not yield any
739 results.
740 """
741 raise NotImplementedError()