Coverage for python/lsst/daf/butler/registry/queries/_query.py: 14%
204 statements
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ()
25from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
26from contextlib import contextmanager
27from typing import Any, cast, final
29from lsst.daf.relation import ColumnError, ColumnTag, Diagnostics, Relation, Sort, SortTerm
31from ...core import (
32 DataCoordinate,
33 DatasetColumnTag,
34 DatasetRef,
35 DatasetType,
36 Dimension,
37 DimensionElement,
38 DimensionGraph,
39 DimensionKeyColumnTag,
40 DimensionRecord,
41)
42from ..wildcards import CollectionWildcard
43from ._query_backend import QueryBackend
44from ._query_context import QueryContext
45from ._readers import DataCoordinateReader, DatasetRefReader, DimensionRecordReader
48@final
49class Query:
50 """A general-purpose representation of a registry query.
52 Parameters
53 ----------
54 dimensions : `DimensionGraph`
55 The dimensions that span the query and are used to join its relations
56 together.
57 backend : `QueryBackend`
58 Backend object used to create the query and new ones derived from it.
59 context : `QueryContext`
60 Context manager that holds relation engines and database connections
61 for the query.
62 relation : `Relation`
63 The relation tree representation of the query as a series of operations
64 on tables.
65 governor_constraints : `~collections.abc.Mapping` [ `str`, \
66 `~collections.abc.Set` [ `str` ] ]
67 Constraints on governor dimensions encoded in this query's relation.
68 This is a mapping from governor dimension name to sets of values that
69 dimension may take.
70 is_deferred : `bool`
71 If `True`, modifier methods that return a related `Query` object should
72 not immediately execute the new query.
73 has_record_columns : `bool` or `DimensionElement`
74 Whether this query's relation already includes columns for all or some
75 dimension element records: `True` means all elements in ``dimensions``
76 either have records present in ``record_caches`` or all columns present
77 in ``relation``, while a specific `DimensionElement` means that element
78 does.
79 record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
80 `~collections.abc.Mapping`
81 [ `DataCoordinate`, `DimensionRecord` ] ], optional
82 Cached dimension record values, organized first by dimension element
83 and then by data ID.
85 Notes
86 -----
87 Iterating over a `Query` yields mappings from `ColumnTag` to the associated
88 value for each row. The `iter_data_ids`, `iter_dataset_refs`, and
89 `iter_dimension_records` methods can be used to instead iterate over
90 various butler primitives derived from these rows.
92 Iterating over a `Query` may or may not execute database queries again each
93 time, depending on the state of its relation tree - see `Query.run` for
94 details.
96 Query is immutable; all methods that might appear to modify it in place
97 actually return a new object (though many attributes will be shared).
99 Query is currently (still) an internal-to-Registry object, with only the
100 "QueryResults" classes that are backed by it directly exposed to users. It
101 has been designed with the intent that it will eventually play a larger
102 role, either as the main query result object in a redesigned query
103 interface, or a "power user" result option that accompanies simpler
104 replacements for the current "QueryResults" objects.
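    Examples
    --------
    A minimal illustrative sketch, assuming ``query`` is an existing `Query`
    whose dimensions include the (hypothetical here) ``visit`` and
    ``detector`` dimensions::

        with query.open_context():
            # Raw rows are mappings from ColumnTag to value.
            for row in query:
                print(row)
            # Convenience iterators build butler primitives from the rows.
            for data_id in query.iter_data_ids():
                print(data_id["visit"], data_id["detector"])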
105 """
107 def __init__(
108 self,
109 dimensions: DimensionGraph,
110 backend: QueryBackend[QueryContext],
111 context: QueryContext,
112 relation: Relation,
113 governor_constraints: Mapping[str, Set[str]],
114 is_deferred: bool,
115 has_record_columns: bool | DimensionElement,
116 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
117 ):
118 self._dimensions = dimensions
119 self._backend = backend
120 self._context = context
121 self._relation = relation
122 self._governor_constraints = governor_constraints
123 self._is_deferred = is_deferred
124 self._has_record_columns = has_record_columns
125 self._record_caches = record_caches if record_caches is not None else {}
127 @property
128 def dimensions(self) -> DimensionGraph:
129 """The dimensions that span the query and are used to join its
130 relations together (`DimensionGraph`).
131 """
132 return self._dimensions
134 @property
135 def relation(self) -> Relation:
136 """The relation tree representation of the query as a series of
137 operations on tables (`Relation`).
138 """
139 return self._relation
141 @property
142 def has_record_columns(self) -> bool | DimensionElement:
143 """Whether this query's relation already includes columns for all or
144 some dimension element records (`bool` or `DimensionElement`).
145 """
146 return self._has_record_columns
148 @property
149 def backend(self) -> QueryBackend[QueryContext]:
150 """Backend object used to create the query and new ones derived from it
151 (`QueryBackend`).
152 """
153 return self._backend
155 @contextmanager
156 def open_context(self) -> Iterator[None]:
157 """Return a context manager that ensures a database connection is
158 established and temporary tables and cursors have a defined lifetime.
160 Returns
161 -------
162 context : `contextlib.AbstractContextManager`
163 Context manager with no return value.
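        Examples
        --------
        A minimal sketch, assuming ``query`` is an existing `Query`; opening
        the context once around several operations avoids repeated connection
        setup::

            with query.open_context():
                data_ids = list(query.iter_data_ids())
                upper_bound = query.count(exact=False)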
164 """
165 if self._context.is_open:
166 yield
167 else:
168 with self._context:
169 yield
171 def __str__(self) -> str:
172 return str(self._relation)
174 def __iter__(self) -> Iterator[Mapping[ColumnTag, Any]]:
175 return iter(self._context.fetch_iterable(self._relation))
177 def iter_data_ids(self, dimensions: DimensionGraph | None = None) -> Iterator[DataCoordinate]:
178 """Return an iterator that converts result rows to data IDs.
180 Parameters
181 ----------
182 dimensions : `DimensionGraph`, optional
183 Dimensions of the data IDs to return. If not provided,
184 ``self.dimensions`` is used.
186 Returns
187 -------
188 data_ids : `~collections.abc.Iterator` [ `DataCoordinate` ]
189 Iterator that yields data IDs.
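        Examples
        --------
        A minimal sketch, assuming ``query`` is an existing `Query`,
        ``universe`` is its `DimensionUniverse`, and ``visit`` is a
        hypothetical dimension spanned by the query::

            subset = universe.extract(["visit"])
            for data_id in query.iter_data_ids(subset):
                print(data_id)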
190 """
191 if dimensions is None:
192 dimensions = self._dimensions
193 reader = DataCoordinateReader.make(
194 dimensions, records=self._has_record_columns is True, record_caches=self._record_caches
195 )
196 if not (reader.columns_required <= self.relation.columns):
197 raise ColumnError(
198 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
199 f"for data IDs with dimensions {dimensions}."
200 )
201 return (reader.read(row) for row in self)
203 def iter_dataset_refs(
204 self, dataset_type: DatasetType, components: Sequence[None | str] = (None,)
205 ) -> Iterator[DatasetRef]:
206 """Return an iterator that converts result rows to dataset references.
208 Parameters
209 ----------
210 dataset_type : `DatasetType`
211 The parent dataset type to yield references for.
212 components : `~collections.abc.Sequence` [ `None` or `str` ]
213 Which component dataset types to construct refs for from each row
214 representing a parent; `None` for the parent itself.
216 Returns
217 -------
218 refs : `~collections.abc.Iterator` [ `DatasetRef` ]
219 Iterator that yields (resolved) dataset references.
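        Examples
        --------
        A minimal sketch, assuming ``query`` was produced by `find_datasets`
        for a hypothetical parent dataset type ``raw`` with a hypothetical
        ``wcs`` component::

            for ref in query.iter_dataset_refs(raw, components=(None, "wcs")):
                print(ref.datasetType.name, ref.dataId)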
220 """
221 reader = DatasetRefReader(
222 dataset_type,
223 translate_collection=self._backend.get_collection_name,
224 records=self._has_record_columns is True,
225 record_caches=self._record_caches,
226 )
227 if not (reader.columns_required <= self.relation.columns):
228 raise ColumnError(
229 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
230 f"for datasets with type {dataset_type.name} and dimensions {dataset_type.dimensions}."
231 )
232 for row in self:
233 parent_ref = reader.read(row)
234 for component in components:
235 if component is None:
236 yield parent_ref
237 else:
238 yield parent_ref.makeComponentRef(component)
240 def iter_dimension_records(self, element: DimensionElement | None = None) -> Iterator[DimensionRecord]:
241 """Return an iterator that converts result rows to dimension records.
243 Parameters
244 ----------
245 element : `DimensionElement`, optional
246 Dimension element whose records will be returned. If not provided,
247 `has_record_columns` must be a `DimensionElement` instance.
249 Returns
250 -------
251 records : `~collections.abc.Iterator` [ `DimensionRecord` ]
252 Iterator that yields dimension records.
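        Examples
        --------
        A minimal sketch, assuming ``query`` was built with
        `with_record_columns` and ``detector`` is a hypothetical element in
        ``query.dimensions.universe``::

            element = query.dimensions.universe["detector"]
            for record in query.iter_dimension_records(element):
                print(record.toDict())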
253 """
254 if element is None:
255 match self._has_record_columns:
256 case True | False:
257 raise ValueError("No default dimension element in query; 'element' must be given.")
258 case only_element_with_records:
259 element = only_element_with_records
260 if (cache := self._record_caches.get(element)) is not None:
261 return (cache[data_id] for data_id in self.iter_data_ids(element.graph))
262 else:
263 reader = DimensionRecordReader(element)
264 if not (reader.columns_required <= self.relation.columns):
265 raise ColumnError(
266 f"Missing column(s) {set(reader.columns_required - self.relation.columns)} "
267 f"for records of element {element.name}."
268 )
269 return (reader.read(row) for row in self)
271 def run(self) -> Query:
272 """Execute the query and hold its results in memory.
274 Returns
275 -------
276 executed : `Query`
277 New query that holds the query results.
279 Notes
280 -----
281 Iterating over the results of a query that has been `run` will always
282 iterate over an existing container, while iterating over a query that
283 has not been run will result in executing at least some of the query
284 each time.
286 Running a query also sets its `is_deferred` flag to `False`, which will
287 cause new queries constructed by its methods to be run immediately,
288 unless ``defer=True`` is passed to the factory method. After a query
289 has been run, factory methods will also tend to prefer to apply new
 290 operations (e.g. `with_only_columns`, `sliced`, `sorted`) via Python
291 code acting on the existing container rather than going back to SQL,
 292 which can be less efficient overall than applying operations to a
293 deferred query and executing them all only at the end.
295 Running a query is represented in terms of relations by adding a
296 `~lsst.daf.relation.Materialization` marker relation in the iteration
297 engine and then processing the relation tree; this attaches the
298 container of rows to that new relation to short-circuit any future
299 processing of the tree and lock changes to the tree upstream of it.
300 This is very different from the SQL-engine
301 `~lsst.daf.relation.Materialization` added to the tree by the
302 `materialize` method from a user perspective, though it has a similar
303 representation in the relation tree.
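        Examples
        --------
        A minimal sketch, assuming ``query`` is an existing deferred `Query`;
        after `run`, iteration reuses the in-memory rows::

            executed = query.run()
            first_pass = list(executed.iter_data_ids())
            second_pass = list(executed.iter_data_ids())  # no new SQL executed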
304 """
305 relation = (
306 # Make a new relation that definitely ends in the iteration engine
307 # (this does nothing if it already does).
308 self.relation.transferred_to(self._context.iteration_engine)
309 # Make the new relation save its rows to an in-memory Python
310 # collection in relation.payload when processed.
311 .materialized(name_prefix="run")
312 )
313 # Actually process the relation, simplifying out trivial relations,
314 # executing any SQL queries, and saving results to relation.payload.
315 # We discard the simplified relation that's returned, because we want
316 # the new query to have any extra diagnostic information contained in
317 # the original.
318 self._context.process(relation)
319 return self._copy(relation, False)
321 def materialized(self, defer_postprocessing: bool = True) -> Query:
322 """Materialize the results of this query in its context's preferred
323 engine.
325 Usually this means inserting the results into a temporary table in a
326 database.
328 Parameters
329 ----------
330 defer_postprocessing : `bool`, optional
331 If `True`, do not execute operations that occur in the context's
332 `QueryContext.iteration_engine` up front; instead insert and
 333 execute a materialization upstream of them (e.g. via a SQL
334 ``INSERT INTO ... SELECT`` statement, with no fetching to the
335 client) and execute the postprocessing operations when iterating
336 over the query results. If `False`, and iteration-engine
337 postprocessing operations exist, run the full query, execute them
338 now, and upload the results.
339 If the relation is already in the preferred engine, this option
340 is ignored and the materialization will not involve fetching rows
341 to the iteration engine at all. If the relation has already been
342 materialized in the iteration engine (i.e. via `run`), then this
343 option is again ignored and an upload of the existing rows will
344 be performed.
346 Returns
347 -------
348 materialized : `Query`
 349 Modified query with the same row-and-column content and a
350 materialization in ``self.context.preferred_engine``.
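        Examples
        --------
        A minimal sketch, assuming ``query`` is an existing deferred `Query`
        that will be used several more times::

            with query.open_context():
                temp = query.materialized()        # typically a temporary table
                upper_bound = temp.count(exact=False)  # later operations reuse it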
351 """
352 if defer_postprocessing or self.relation.engine == self._context.preferred_engine:
353 relation, stripped = self._context.strip_postprocessing(self._relation)
354 if relation.engine == self._context.preferred_engine:
355 # We got all the way to the engine we want to materialize in.
356 # Apply that operation to the tree, process it (which actually
357 # creates a temporary table and populates it), and then reapply
358 # the stripped operations.
359 relation = relation.materialized()
360 self._context.process(relation)
361 for operation in stripped:
362 relation = operation.apply(
363 relation, transfer=True, preferred_engine=self._context.iteration_engine
364 )
365 return self._copy(relation, True)
366 # Either defer_postprocessing=False, or attempting to strip off unary
367 # operations until we got to the preferred engine didn't work, because
368 # this tree doesn't actually involve the preferred engine. So we just
369 # transfer to the preferred engine first, and then materialize,
370 # process, and return.
371 relation = self._relation.transferred_to(self._context.preferred_engine).materialized()
372 self._context.process(relation)
373 return self._copy(relation, True)
375 def projected(
376 self,
377 dimensions: Iterable[Dimension | str] | None = None,
378 unique: bool = True,
379 columns: Iterable[ColumnTag] | None = None,
380 defer: bool | None = None,
381 drop_postprocessing: bool = False,
382 keep_record_columns: bool = True,
383 ) -> Query:
384 """Return a modified `Query` with a subset of this one's columns.
386 Parameters
387 ----------
388 dimensions : `~collections.abc.Iterable` [ `Dimension` or `str` ],
389 optional
390 Dimensions to include in the new query. Will be expanded to
391 include all required and implied dependencies. Must be a subset of
392 ``self.dimensions``. If not provided, ``self.dimensions`` is used.
393 unique : `bool`, optional
394 If `True` (default) deduplicate rows after dropping columns.
395 columns : `~collections.abc.Iterable` [ `ColumnTag` ], optional
396 Additional dataset or dimension record columns to include in the
397 query. Dimension key columns added here are ignored unless they
398 extend beyond the key columns implied by the ``dimensions``
399 argument (which is an error).
400 defer : `bool`, optional
401 If `False`, run the new query immediately. If `True`, do not. If
402 `None` (default), the ``defer`` option passed when making ``self``
403 is used (this option is "sticky").
404 drop_postprocessing : `bool`, optional
405 Drop any iteration-engine operations that depend on columns that
406 are being removed (e.g. region-overlap tests when region columns
407 are being dropped), making it more likely that projection and
408 deduplication could be performed in the preferred engine, where
409 they may be more efficient.
410 keep_record_columns : `bool`, optional
411 If `True` (default) and this query `has_record_columns`, implicitly
412 add any of those to ``columns`` whose dimension element is in the
413 given ``dimensions``.
415 Returns
416 -------
417 query : `Query`
418 New query with the requested columns only, optionally deduplicated.
420 Notes
421 -----
422 Dataset columns are dropped from the new query unless passed via the
423 ``columns`` argument. All other columns are by default preserved.
425 Raises
426 ------
427 lsst.daf.relation.ColumnError
428 Raised if the columns to include in the new query are not all
429 present in the current query.
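        Examples
        --------
        A minimal sketch, assuming ``query`` spans hypothetical ``visit`` and
        ``detector`` dimensions and only unique visit data IDs are needed::

            visits_only = query.projected(["visit"], unique=True)
            for data_id in visits_only.iter_data_ids():
                print(data_id["visit"])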
430 """
431 if dimensions is None:
432 dimensions = set(self._dimensions)
433 else:
434 dimensions = set(dimensions)
435 if columns is not None:
436 dimensions.update(tag.dimension for tag in DimensionKeyColumnTag.filter_from(columns))
437 dimensions = self._dimensions.universe.extract(dimensions)
438 if columns is None:
439 columns = set()
440 else:
441 columns = set(columns)
442 columns.update(DimensionKeyColumnTag.generate(dimensions.names))
443 if keep_record_columns:
444 if self._has_record_columns is True:
445 for element in dimensions.elements:
446 if element not in self._record_caches:
447 columns.update(element.RecordClass.fields.columns)
448 elif self._has_record_columns in dimensions.elements:
449 element = cast(DimensionElement, self._has_record_columns)
450 columns.update(element.RecordClass.fields.columns)
451 if drop_postprocessing:
452 relation = self._context.drop_invalidated_postprocessing(self._relation, columns)
453 # Dropping postprocessing Calculations could cause other columns
454 # we had otherwise intended to keep to be dropped as well.
455 columns &= relation.columns
456 else:
457 relation = self._relation
458 relation = relation.with_only_columns(columns, preferred_engine=self._context.preferred_engine)
459 if unique:
460 relation = relation.without_duplicates(preferred_engine=self._context.preferred_engine)
461 return self._chain(relation, defer, dimensions=dimensions)
463 def with_record_columns(
464 self, dimension_element: DimensionElement | None = None, defer: bool | None = None
465 ) -> Query:
466 """Return a modified `Query` with additional dimension record columns
467 and/or caches.
469 Parameters
470 ----------
471 dimension_element : `DimensionElement`, optional
 472 Single element to add record columns for, or `None` (default) to add
473 them for all elements in `dimensions`.
474 defer : `bool`, optional
475 If `False`, run the new query immediately. If `True`, do not. If
476 `None` (default), the ``defer`` option passed when making ``self``
477 is used (this option is "sticky").
479 Returns
480 -------
481 query : `Query`
482 New query with the requested record columns either in the relation
483 or (when possible) available via record caching.
485 Notes
486 -----
487 Adding dimension record columns is fundamentally different from adding
488 new dimension key columns or dataset columns, because it is purely an
489 addition of columns, not rows - we can always join in a dimension
490 element table (if it has not already been included) on keys already
491 present in the current relation, confident that there is exactly one
492 row in the dimension element table for each row in the current
493 relation.
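        Examples
        --------
        A minimal sketch, assuming ``query`` spans a hypothetical ``exposure``
        dimension::

            element = query.dimensions.universe["exposure"]
            expanded = query.with_record_columns(element)
            for record in expanded.iter_dimension_records(element):
                print(record)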
494 """
495 if self._has_record_columns is True or self._has_record_columns == dimension_element:
496 return self
497 record_caches = dict(self._record_caches)
498 columns_required: set[ColumnTag] = set()
499 for element in self.dimensions.elements if dimension_element is None else [dimension_element]:
500 if element in record_caches:
501 continue
502 if (cache := self._backend.get_dimension_record_cache(element.name, self._context)) is not None:
503 record_caches[element] = cache
504 else:
505 columns_required.update(element.RecordClass.fields.columns.keys())
506 # Modify the relation we have to remove any projections that dropped
 507 # columns we now want, as long as the relation's behavior is otherwise
508 # unchanged.
509 columns_required -= self._relation.columns
510 relation, columns_found = self._context.restore_columns(self._relation, columns_required)
511 columns_required.difference_update(columns_found)
512 if columns_required:
513 relation = self._backend.make_dimension_relation(
514 self._dimensions,
515 columns_required,
516 self._context,
517 initial_relation=relation,
518 # Don't permit joins to use any columns beyond those in the
519 # original relation, as that would change what this operation
520 # does.
521 initial_join_max_columns=frozenset(self._relation.columns),
522 governor_constraints=self._governor_constraints,
523 )
524 return self._chain(
525 relation,
526 defer=defer,
527 has_record_columns=True if dimension_element is None else dimension_element,
528 record_caches=record_caches,
529 )
531 def find_datasets(
532 self,
533 dataset_type: DatasetType,
534 collections: Any,
535 *,
536 find_first: bool = True,
537 columns: Set[str] = frozenset(("dataset_id", "run")),
538 defer: bool | None = None,
539 ) -> Query:
540 """Return a modified `Query` that includes a search for datasets of the
541 given type.
543 Parameters
544 ----------
545 dataset_type : `DatasetType`
546 Dataset type to search for. May not be a component.
547 collections
548 Collection search path or pattern. Must be a single collection
549 name or ordered sequence if ``find_first=True``. See
550 :ref:`daf_butler_collection_expressions` for more information.
551 find_first : `bool`, optional
552 If `True` (default) search collections in order until the first
553 match for each data ID is found. If `False`, return all matches in
554 all collections.
555 columns : `~collections.abc.Set` [ `str` ]
556 Dataset columns to include in the new query. Options include
558 - ``dataset_id``: the unique identifier of the dataset. The type
559 is implementation-dependent. Never nullable. Included by
560 default.
562 - ``ingest_date``: the date and time the dataset was added to the
563 data repository.
565 - ``run``: the foreign key column to the `~CollectionType.RUN`
566 collection holding the dataset (not necessarily the collection
567 name). The type is dependent on the collection manager
568 implementation. Included by default.
570 - ``collection``: the foreign key column to the collection type in
 571 which the dataset was actually found in this search. The type is
572 dependent on the collection manager implementation. This may
573 differ from ``run`` if the dataset is present in a matching
574 `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
575 collection, which means the same dataset may also appear multiple
576 times in the query results.
578 - ``timespan``: the validity range for datasets found in a
579 `~CollectionType.CALIBRATION` collection, or ``NULL`` for other
580 collection types.
582 The default columns (``dataset_id`` and ``run``) are sufficient to
583 enable `iter_dataset_refs`, which also takes care of translating
584 the internal ``RUN`` collection key into its public name.
586 Setting this to an empty set while passing ``find_first=False``
587 will return a query that is constrained by dataset existence in
 588 some matching collection but does not actually return which
589 datasets existed.
590 defer : `bool`, optional
591 If `False`, run the new query immediately. If `True`, do not. If
592 `None` (default), the ``defer`` option passed when making ``self``
593 is used (this option is "sticky").
595 Returns
596 -------
597 query : `Query`
598 New query with the requested dataset columns, constrained by the
599 existence of datasets of this type in the given collection.
601 Raises
602 ------
603 lsst.daf.relation.ColumnError
604 Raised if a dataset search is already present in this query and
605 this is a find-first search.
606 ValueError
607 Raised if the given dataset type's dimensions are not a subset of
608 the current query's dimensions.
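        Examples
        --------
        A minimal sketch; ``bias_type`` (a parent `DatasetType`) and the
        ``"my_run"`` collection name are hypothetical::

            with_bias = query.find_datasets(bias_type, ["my_run"], find_first=True)
            refs = list(with_bias.iter_dataset_refs(bias_type))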
609 """
610 if find_first and DatasetColumnTag.filter_from(self._relation.columns):
611 raise ColumnError(
612 "Cannot search for datasets with find_first=True "
613 "on a query that already includes dataset columns."
614 )
615 #
 616 # TODO: it'd be nice to do a QueryContext.restore_columns call here or
617 # similar, to look for dataset-constraint joins already present in the
618 # relation and expand them to include dataset-result columns as well,
619 # instead of doing a possibly-redundant join here. But that would
620 # require pushing relation usage down further into
621 # DatasetStorageManager.make_relation, so that it doesn't need to be
622 # given the columns, and then giving the relation system the ability to
623 # simplify-away redundant joins when they only provide columns that
624 # aren't ultimately used. The right time to look into that is probably
625 # when investigating whether the base QueryBackend should be
626 # responsible for producing an "abstract" relation tree of some sort,
627 # with the subclasses only responsible for filling it in with payloads
 628 # (and possibly replacing some leaves with new sub-trees) when it is
629 # "processed" (or in some other "prepare" step).
630 #
 631 # This is a low priority for two reasons:
632 # - there's some chance the database's query optimizer will simplify
633 # away these redundant joins;
634 # - at present, the main use of this code path is in QG generation,
635 # where we materialize the initial data ID query into a temp table
636 # and hence can't go back and "recover" those dataset columns anyway;
637 #
638 if not (dataset_type.dimensions <= self._dimensions):
639 raise ValueError(
640 "Cannot find datasets from a query unless the dataset types's dimensions "
641 f"({dataset_type.dimensions}, for {dataset_type.name}) are a subset of the query's "
642 f"({self._dimensions})."
643 )
644 columns = set(columns)
645 columns.add("dataset_id")
646 collections = CollectionWildcard.from_expression(collections)
647 if find_first:
648 collections.require_ordered()
649 rejections: list[str] = []
650 collection_records = self._backend.resolve_dataset_collections(
651 dataset_type,
652 collections,
653 governor_constraints=self._governor_constraints,
654 allow_calibration_collections=False, # TODO
655 rejections=rejections,
656 )
657 if not collection_records:
658 relation = self._relation.join(
659 self._backend.make_doomed_dataset_relation(dataset_type, columns, rejections, self._context)
660 )
661 elif find_first:
662 relation = self._backend.make_dataset_search_relation(
663 dataset_type, collection_records, columns, self._context, join_to=self._relation
664 )
665 else:
666 dataset_relation = self._backend.make_dataset_query_relation(
667 dataset_type, collection_records, columns, self._context
668 )
669 relation = self.relation.join(dataset_relation)
670 return self._chain(relation, defer=defer)
672 def sliced(
673 self,
674 start: int = 0,
675 stop: int | None = None,
676 defer: bool | None = None,
677 ) -> Query:
678 """Return a modified `Query` with that takes a slice of this one's
679 rows.
681 Parameters
682 ----------
683 start : `int`, optional
684 First index to include, inclusive.
685 stop : `int` or `None`, optional
686 One past the last index to include (i.e. exclusive).
687 defer : `bool`, optional
688 If `False`, run the new query immediately. If `True`, do not. If
689 `None` (default), the ``defer`` option passed when making ``self``
690 is used (this option is "sticky").
692 Returns
693 -------
694 query : `Query`
695 New query with the requested slice.
697 Notes
698 -----
699 This operation must be implemented in the iteration engine if there are
700 postprocessing operations, which may be much less efficient than
701 performing it in the preferred engine (e.g. via ``LIMIT .. OFFSET ..``
702 in SQL).
704 Since query row order is usually arbitrary, it usually makes sense to
705 call `sorted` before calling `sliced` to make the results
706 deterministic. This is not checked because there are some contexts
707 where getting an arbitrary subset of the results of a given size
708 still makes sense.
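        Examples
        --------
        A minimal sketch, assuming ``query`` is an existing `Query` and
        ``term`` is an `lsst.daf.relation.SortTerm` built elsewhere; sorting
        first makes the slice deterministic::

            first_ten = query.sorted([term]).sliced(0, 10)
            rows = list(first_ten)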
709 """
710 return self._chain(self._relation[start:stop], defer)
712 def sorted(
713 self,
714 order_by: Iterable[SortTerm],
715 defer: bool | None = None,
716 ) -> Query:
717 """Return a modified `Query` that sorts this one's rows.
719 Parameters
720 ----------
721 order_by : `~collections.abc.Iterable` [ `SortTerm` ]
722 Expressions to sort by.
723 defer : `bool`, optional
724 If `False`, run the new query immediately. If `True`, do not. If
725 `None` (default), the ``defer`` option passed when making ``self``
726 is used (this option is "sticky").
728 Returns
729 -------
730 query : `Query`
731 New query with the requested sorting.
733 Notes
734 -----
735 The ``order_by`` expression can include references to dimension record
736 columns that were not present in the original relation; this is
737 similar to calling `with_record_columns` for those columns first (but
738 in this case column requests cannot be satisfied by record caches).
739 All other columns referenced must be present in the query already.
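        Examples
        --------
        A minimal sketch, assuming ``term`` is an `lsst.daf.relation.SortTerm`
        over a column already present in (or resolvable from) the query::

            ordered = query.sorted([term], defer=True)
            for row in ordered:
                ...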
740 """
741 op = Sort(tuple(order_by))
742 columns_required = set(op.columns_required)
743 columns_required.difference_update(self._relation.columns)
744 if columns_required:
745 relation, columns_found = self._context.restore_columns(self._relation, columns_required)
746 columns_required.difference_update(columns_found)
747 if columns_required:
748 try:
749 relation = self._backend.make_dimension_relation(
750 self._dimensions,
751 columns_required,
752 self._context,
753 initial_relation=relation,
754 # Don't permit joins to use any columns beyond those in
755 # the original relation, as that would change what this
756 # operation does.
757 initial_join_max_columns=frozenset(self._relation.columns),
758 governor_constraints=self._governor_constraints,
759 )
760 except ColumnError as err:
761 raise ColumnError(
762 "Cannot sort by columns that were not included in the original query or "
763 "fully resolved by its dimensions."
764 ) from err
765 else:
766 relation = self._relation
767 relation = op.apply(relation, preferred_engine=self._context.preferred_engine)
768 return self._chain(relation, defer)
770 def count(self, *, exact: bool = True, discard: bool = False) -> int:
771 """Count the number of rows in this query.
773 Parameters
774 ----------
775 exact : `bool`, optional
776 If `True` (default), return the exact number of rows. If `False`,
777 returning an upper bound is permitted if it can be done much more
778 efficiently, e.g. by running a SQL ``SELECT COUNT(*)`` query but
779 ignoring client-side filtering that would otherwise take place.
780 discard : `bool`, optional
781 If `True`, compute the exact count even if it would require running
782 the full query and then throwing away the result rows after
783 counting them. If `False`, this is an error, as the user would
784 usually be better off executing the query first to fetch its rows
785 into a new query (or passing ``exact=False``). Ignored if
786 ``exact=False``.
788 Returns
789 -------
790 n_rows : `int`
791 Number of rows in the query, or an upper bound. This includes
792 duplicates, if there are any.
794 Raises
795 ------
796 RuntimeError
797 Raised if an exact count was requested and could not be obtained
798 without fetching and discarding rows.
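        Examples
        --------
        A minimal sketch, assuming ``query`` is an existing `Query`::

            upper_bound = query.count(exact=False)
            exact_total = query.count(exact=True, discard=True)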
799 """
800 if self._relation.min_rows == self._relation.max_rows:
801 return self._relation.max_rows
802 return self._context.count(self._relation, exact=exact, discard=discard)
804 def any(self, *, execute: bool = True, exact: bool = True) -> bool:
805 """Check whether this query has any result rows at all.
807 Parameters
808 ----------
809 execute : `bool`, optional
810 If `True`, execute at least a ``LIMIT 1`` query if it cannot be
811 determined prior to execution that the query would return no rows.
812 exact : `bool`, optional
813 If `True`, run the full query and perform post-query filtering if
814 needed, until at least one result row is found. If `False`, the
815 returned result does not account for post-query filtering, and
816 hence may be `True` even when all result rows would be filtered
817 out.
819 Returns
820 -------
821 any_rows : `bool`
822 Whether the query has any rows, or if it may have any rows if
823 ``exact=False``.
825 Raises
826 ------
827 RuntimeError
828 Raised if an exact check was requested and could not be obtained
829 without executing the query.
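        Examples
        --------
        A minimal sketch, assuming ``query`` is an existing `Query`::

            if not query.any(execute=False, exact=False):
                print("doomed before execution")
            elif query.any():
                print("at least one row")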
830 """
831 if self._relation.min_rows > 0:
832 return True
833 if self._relation.max_rows == 0:
834 return False
835 if execute:
836 return self._context.any(self._relation, execute=execute, exact=exact)
837 elif not exact:
838 return True
 839 raise RuntimeError("Cannot obtain exact results without executing the query.")
841 def explain_no_results(self, execute: bool = True) -> list[str]:
842 """Return human-readable messages that may help explain why the query
843 yields no results.
845 Parameters
846 ----------
847 execute : `bool`, optional
848 If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
849 of aspects of the query to more precisely determine where rows were
850 filtered out.
852 Returns
853 -------
854 messages : `~collections.abc.Iterable` [ `str` ]
855 String messages that describe reasons the query might not yield any
856 results.
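        Examples
        --------
        A minimal sketch, assuming ``query`` is an existing `Query`::

            if not query.any():
                for message in query.explain_no_results():
                    print(message)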
857 """
858 # First try without actually executing any queries.
859 diagnostics = Diagnostics.run(self._relation)
860 if diagnostics.is_doomed:
861 return diagnostics.messages
862 if execute:
863 # Try again, running LIMIT 1 queries as we walk back down the tree
864 # to look for relations with no rows:
865 diagnostics = Diagnostics.run(self._relation, executor=self._context.any)
866 if diagnostics.is_doomed:
867 return diagnostics.messages
868 return []
870 def _copy(
871 self,
872 relation: Relation,
873 is_deferred: bool,
874 dimensions: DimensionGraph | None = None,
875 governor_constraints: Mapping[str, Set[str]] | None = None,
876 has_record_columns: bool | DimensionElement | None = None,
877 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
878 ) -> Query:
879 """Return a modified copy of this query with some attributes replaced.
881 See class docs for parameter documentation; the only difference here
882 is that the defaults are the values ``self`` was constructed with.
883 """
884 return Query(
885 dimensions=self._dimensions if dimensions is None else dimensions,
886 backend=self._backend,
887 context=self._context,
888 relation=relation,
889 governor_constraints=(
890 governor_constraints if governor_constraints is not None else self._governor_constraints
891 ),
892 is_deferred=is_deferred,
893 has_record_columns=self._has_record_columns if has_record_columns is None else has_record_columns,
894 record_caches=self._record_caches if record_caches is None else record_caches,
895 )
897 def _chain(
898 self,
899 relation: Relation,
900 defer: bool | None,
901 dimensions: DimensionGraph | None = None,
902 governor_constraints: Mapping[str, Set[str]] | None = None,
903 has_record_columns: bool | DimensionElement | None = None,
904 record_caches: Mapping[DimensionElement, Mapping[DataCoordinate, DimensionRecord]] | None = None,
905 ) -> Query:
906 """Return a modified query with a new relation while handling the
907 ubiquitous ``defer`` parameter's logic.
909 Parameters
910 ----------
911 relation : `Relation`
912 Relation for the new query.
913 defer : `bool`
914 If `False`, run the new query immediately. If `True`, do not. If
915 `None` , the ``defer`` option passed when making ``self`` is used
916 (this option is "sticky").
917 dimensions : `DimensionGraph`, optional
918 See class docs.
919 governor_constraints : `~collections.abc.Mapping` [ `str`, \
920 `~collections.abc.Set` [ `str` ] ], optional
921 See class docs.
922 has_record_columns : `bool` or `DimensionElement`, optional
923 See class docs.
924 record_caches : `~collections.abc.Mapping` [ `DimensionElement`, \
925 `~collections.abc.Mapping` \
926 [ `DataCoordinate`, `DimensionRecord` ] ], optional
927 See class docs.
929 Returns
930 -------
931 chained : `Query`
932 Modified query, or ``self`` if no modifications were actually
933 requested.
934 """
935 if defer is None:
936 defer = self._is_deferred
937 if (
938 relation is self._relation
939 and dimensions is None
940 and defer == self._is_deferred
941 and record_caches is None
942 and has_record_columns is None
943 and governor_constraints is None
944 ):
945 return self
946 result = self._copy(
947 relation,
948 is_deferred=True,
949 governor_constraints=governor_constraints,
950 dimensions=dimensions,
951 has_record_columns=has_record_columns,
952 record_caches=record_caches,
953 )
954 if not defer:
955 result = result.run()
956 return result